Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h>
13 :
14 : #include "aom_dsp/aom_dsp_common.h"
15 : #include "aom_mem/aom_mem.h"
16 : #include "aom_ports/mem.h"
17 :
18 : #if CONFIG_HIGHBITDEPTH
19 0 : void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
20 : int skip_block, const int16_t *zbin_ptr,
21 : const int16_t *round_ptr,
22 : const int16_t *quant_ptr,
23 : const int16_t *quant_shift_ptr,
24 : tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
25 : const int16_t *dequant_ptr, uint16_t *eob_ptr,
26 : const int16_t *scan, const int16_t *iscan) {
27 0 : int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
28 : __m128i zbins[2];
29 : __m128i nzbins[2];
30 :
31 0 : zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
32 0 : (int)zbin_ptr[0]);
33 0 : zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
34 :
35 0 : nzbins[0] = _mm_setzero_si128();
36 0 : nzbins[1] = _mm_setzero_si128();
37 0 : nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
38 0 : nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
39 :
40 : (void)scan;
41 :
42 0 : memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
43 0 : memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
44 :
45 0 : if (!skip_block) {
46 : // Pre-scan pass
47 0 : for (i = ((int)count / 4) - 1; i >= 0; i--) {
48 : __m128i coeffs, cmp1, cmp2;
49 : int test;
50 0 : coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
51 0 : cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
52 0 : cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
53 0 : cmp1 = _mm_and_si128(cmp1, cmp2);
54 0 : test = _mm_movemask_epi8(cmp1);
55 0 : if (test == 0xffff)
56 0 : non_zero_regs--;
57 : else
58 0 : break;
59 : }
60 :
61 : // Quantization pass:
62 0 : for (i = 0; i < non_zero_regs; i++) {
63 : __m128i coeffs, coeffs_sign, tmp1, tmp2;
64 : int test;
65 : int abs_coeff[4];
66 : int coeff_sign[4];
67 :
68 0 : coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
69 0 : coeffs_sign = _mm_srai_epi32(coeffs, 31);
70 0 : coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
71 0 : tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
72 0 : tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
73 0 : tmp1 = _mm_or_si128(tmp1, tmp2);
74 0 : test = _mm_movemask_epi8(tmp1);
75 : _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
76 : _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
77 :
78 0 : for (j = 0; j < 4; j++) {
79 0 : if (test & (1 << (4 * j))) {
80 0 : int k = 4 * i + j;
81 0 : const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
82 0 : const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
83 0 : const uint32_t abs_qcoeff =
84 0 : (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
85 0 : qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
86 0 : dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
87 0 : if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
88 : }
89 : }
90 : }
91 : }
92 0 : *eob_ptr = eob_i + 1;
93 0 : }
94 :
95 0 : void aom_highbd_quantize_b_32x32_sse2(
96 : const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
97 : const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
98 : const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
99 : tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
100 : const int16_t *scan, const int16_t *iscan) {
101 : __m128i zbins[2];
102 : __m128i nzbins[2];
103 0 : int idx = 0;
104 : int idx_arr[1024];
105 0 : int i, eob = -1;
106 0 : const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
107 0 : const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
108 : (void)scan;
109 0 : zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
110 0 : zbins[1] = _mm_set1_epi32(zbin1_tmp);
111 :
112 0 : nzbins[0] = _mm_setzero_si128();
113 0 : nzbins[1] = _mm_setzero_si128();
114 0 : nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
115 0 : nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
116 :
117 0 : memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
118 0 : memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
119 :
120 0 : if (!skip_block) {
121 : // Pre-scan pass
122 0 : for (i = 0; i < n_coeffs / 4; i++) {
123 : __m128i coeffs, cmp1, cmp2;
124 : int test;
125 0 : coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
126 0 : cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
127 0 : cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
128 0 : cmp1 = _mm_and_si128(cmp1, cmp2);
129 0 : test = _mm_movemask_epi8(cmp1);
130 0 : if (!(test & 0xf)) idx_arr[idx++] = i * 4;
131 0 : if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
132 0 : if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
133 0 : if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
134 : }
135 :
136 : // Quantization pass: only process the coefficients selected in
137 : // pre-scan pass. Note: idx can be zero.
138 0 : for (i = 0; i < idx; i++) {
139 0 : const int rc = idx_arr[i];
140 0 : const int coeff = coeff_ptr[rc];
141 0 : const int coeff_sign = (coeff >> 31);
142 0 : const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
143 0 : const int64_t tmp1 =
144 0 : abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
145 0 : const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
146 0 : const uint32_t abs_qcoeff =
147 0 : (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
148 0 : qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
149 0 : dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
150 0 : if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
151 : }
152 : }
153 0 : *eob_ptr = eob + 1;
154 0 : }
155 : #endif
|