Line data Source code
1 : /*
2 : * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <emmintrin.h>
12 : #include <xmmintrin.h>
13 :
14 : #include "./vpx_dsp_rtcd.h"
15 : #include "vpx/vpx_integer.h"
16 : #include "vpx_dsp/x86/fdct.h"
17 :
// Quantizes and dequantizes a block of transform coefficients using SSE2.
//
// Reads n_coeffs values from coeff_ptr, writes the quantized values to
// qcoeff_ptr and the dequantized values to dqcoeff_ptr, and stores the
// end-of-block count in *eob_ptr. When skip_block is nonzero, both output
// buffers are zero-filled and *eob_ptr is set to 0.
//
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr and dequant_ptr are each
// read once as a 128-bit vector of eight int16 values. The full vector is
// applied to the first eight coefficients; afterwards its high 64 bits are
// broadcast to both halves and used for every remaining coefficient.
// These tables and iscan_ptr are read with _mm_load_si128, the aligned-load
// intrinsic. scan_ptr is unused.
//
// NOTE(review): coefficients are handled 16 at a time and the first 16 are
// processed unconditionally, so n_coeffs is presumably a positive multiple
// of 16 -- confirm against callers.
18 0 : void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
19 : int skip_block, const int16_t *zbin_ptr,
20 : const int16_t *round_ptr, const int16_t *quant_ptr,
21 : const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
22 : tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
23 : uint16_t *eob_ptr, const int16_t *scan_ptr,
24 : const int16_t *iscan_ptr) {
25 : __m128i zero;
26 : (void)scan_ptr;
27 :
// Advance every array pointer to its end and negate the count, so that
// n_coeffs serves as a negative index that climbs towards zero.
28 0 : coeff_ptr += n_coeffs;
29 0 : iscan_ptr += n_coeffs;
30 0 : qcoeff_ptr += n_coeffs;
31 0 : dqcoeff_ptr += n_coeffs;
32 0 : n_coeffs = -n_coeffs;
33 0 : zero = _mm_setzero_si128();
34 0 : if (!skip_block) {
35 : __m128i eob;
36 : __m128i zbin;
37 : __m128i round, quant, dequant, shift;
38 : {
39 : __m128i coeff0, coeff1;
40 :
41 : // Setup global values
42 : {
43 : __m128i pw_1;
44 0 : zbin = _mm_load_si128((const __m128i *)zbin_ptr);
45 0 : round = _mm_load_si128((const __m128i *)round_ptr);
46 0 : quant = _mm_load_si128((const __m128i *)quant_ptr);
47 0 : pw_1 = _mm_set1_epi16(1);
// Store zbin - 1 so the signed greater-than compare below acts as
// "absolute value >= zbin".
48 0 : zbin = _mm_sub_epi16(zbin, pw_1);
49 0 : dequant = _mm_load_si128((const __m128i *)dequant_ptr);
50 0 : shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
51 : }
52 :
53 : {
54 : __m128i coeff0_sign, coeff1_sign;
55 : __m128i qcoeff0, qcoeff1;
56 : __m128i qtmp0, qtmp1;
57 : __m128i cmp_mask0, cmp_mask1;
58 : // Do DC and first 15 AC
// NOTE(review): load_tran_low/store_tran_low are defined elsewhere;
// presumably they move eight 16-bit lanes at a time -- confirm.
59 0 : coeff0 = load_tran_low(coeff_ptr + n_coeffs);
60 0 : coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
61 :
62 : // Poor man's sign extract: sign is 0 or -1 per lane, and
// (x ^ sign) - sign yields the absolute value.
63 0 : coeff0_sign = _mm_srai_epi16(coeff0, 15);
64 0 : coeff1_sign = _mm_srai_epi16(coeff1, 15);
65 0 : qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
66 0 : qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
67 0 : qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
68 0 : qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
69 :
// Per lane: tmp = abs + round (saturating add), then
// q = ((((tmp * quant) >> 16) + tmp) * shift) >> 16.
// Each table is switched to its broadcast high half right after the
// first vector has used it, so statement order matters here.
70 0 : cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
71 0 : zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
72 0 : cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
73 0 : qcoeff0 = _mm_adds_epi16(qcoeff0, round);
74 0 : round = _mm_unpackhi_epi64(round, round);
75 0 : qcoeff1 = _mm_adds_epi16(qcoeff1, round);
76 0 : qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
77 0 : quant = _mm_unpackhi_epi64(quant, quant);
78 0 : qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
79 0 : qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
80 0 : qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
81 0 : qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
82 0 : shift = _mm_unpackhi_epi64(shift, shift);
83 0 : qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
84 :
85 : // Reinsert signs
86 0 : qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
87 0 : qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
88 0 : qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
89 0 : qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
90 :
91 : // Mask out zbin threshold coeffs (lanes that failed the compare become 0)
92 0 : qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
93 0 : qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
94 :
95 0 : store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
96 0 : store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
97 :
// Dequantize (low 16 bits of qcoeff * dequant). coeff0/coeff1 are
// reused to hold the dequantized lanes, which the eob scan below tests.
98 0 : coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
99 0 : dequant = _mm_unpackhi_epi64(dequant, dequant);
100 0 : coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
101 :
102 0 : store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
103 0 : store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
104 : }
105 :
106 : {
107 : // Scan for eob: each nonzero lane contributes iscan + 1, zero lanes
// contribute 0, and eob keeps the per-lane maximum.
108 : __m128i zero_coeff0, zero_coeff1;
109 : __m128i nzero_coeff0, nzero_coeff1;
110 : __m128i iscan0, iscan1;
111 : __m128i eob1;
112 0 : zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
113 0 : zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
114 0 : nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
115 0 : nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
116 0 : iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
117 0 : iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
118 : // Add one to convert from indices to counts (the nonzero mask is -1,
// so subtracting it adds 1)
119 0 : iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
120 0 : iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
121 0 : eob = _mm_and_si128(iscan0, nzero_coeff0);
122 0 : eob1 = _mm_and_si128(iscan1, nzero_coeff1);
123 0 : eob = _mm_max_epi16(eob, eob1);
124 : }
125 0 : n_coeffs += 8 * 2;
126 : }
127 :
128 : // AC only loop: zbin, round, quant, shift and dequant now hold the
// broadcast high halves, so both vectors use the same constants.
129 0 : while (n_coeffs < 0) {
130 : __m128i coeff0, coeff1;
131 : {
132 : __m128i coeff0_sign, coeff1_sign;
133 : __m128i qcoeff0, qcoeff1;
134 : __m128i qtmp0, qtmp1;
135 : __m128i cmp_mask0, cmp_mask1;
136 :
137 0 : coeff0 = load_tran_low(coeff_ptr + n_coeffs);
138 0 : coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
139 :
140 : // Poor man's sign extract (same absolute-value trick as the first pass)
141 0 : coeff0_sign = _mm_srai_epi16(coeff0, 15);
142 0 : coeff1_sign = _mm_srai_epi16(coeff1, 15);
143 0 : qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
144 0 : qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
145 0 : qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
146 0 : qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
147 :
// Same threshold compare and round/quant/shift arithmetic as the first
// pass.
148 0 : cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
149 0 : cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
150 0 : qcoeff0 = _mm_adds_epi16(qcoeff0, round);
151 0 : qcoeff1 = _mm_adds_epi16(qcoeff1, round);
152 0 : qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
153 0 : qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
154 0 : qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
155 0 : qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
156 0 : qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
157 0 : qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
158 :
159 : // Reinsert signs
160 0 : qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
161 0 : qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
162 0 : qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
163 0 : qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
164 :
165 : // Mask out zbin threshold coeffs
166 0 : qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
167 0 : qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
168 :
169 0 : store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
170 0 : store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
171 :
// Dequantize; coeff0/coeff1 again carry the dequantized lanes into the
// eob scan.
172 0 : coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
173 0 : coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
174 :
175 0 : store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
176 0 : store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
177 : }
178 :
179 : {
180 : // Scan for eob and fold this iteration's result into the running maximum
181 : __m128i zero_coeff0, zero_coeff1;
182 : __m128i nzero_coeff0, nzero_coeff1;
183 : __m128i iscan0, iscan1;
184 : __m128i eob0, eob1;
185 0 : zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
186 0 : zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
187 0 : nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
188 0 : nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
189 0 : iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
190 0 : iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
191 : // Add one to convert from indices to counts
192 0 : iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
193 0 : iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
194 0 : eob0 = _mm_and_si128(iscan0, nzero_coeff0);
195 0 : eob1 = _mm_and_si128(iscan1, nzero_coeff1);
196 0 : eob0 = _mm_max_epi16(eob0, eob1);
197 0 : eob = _mm_max_epi16(eob, eob0);
198 : }
199 0 : n_coeffs += 8 * 2;
200 : }
201 :
202 : // Accumulate EOB: horizontal max over the eight lanes. The shuffles fold
// the high half onto the low half, then lanes 2-3 onto lanes 0-1, then
// swap lanes 0 and 1, leaving the overall maximum in lane 1.
203 : {
204 : __m128i eob_shuffled;
205 0 : eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
206 0 : eob = _mm_max_epi16(eob, eob_shuffled);
207 0 : eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
208 0 : eob = _mm_max_epi16(eob, eob_shuffled);
209 0 : eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
210 0 : eob = _mm_max_epi16(eob, eob_shuffled);
211 0 : *eob_ptr = _mm_extract_epi16(eob, 1);
212 : }
213 : } else {
// skip_block: zero both output buffers, 16 coefficients per iteration. The
// do/while always runs at least once.
214 : do {
215 0 : store_tran_low(zero, dqcoeff_ptr + n_coeffs);
216 0 : store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
217 0 : store_tran_low(zero, qcoeff_ptr + n_coeffs);
218 0 : store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
219 0 : n_coeffs += 8 * 2;
220 0 : } while (n_coeffs < 0);
221 0 : *eob_ptr = 0;
222 : }
223 0 : }
|