Line data Source code
1 : /*
2 : * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <emmintrin.h>
12 : #include <xmmintrin.h>
13 :
14 : #include "./vp9_rtcd.h"
15 : #include "vpx/vpx_integer.h"
16 :
// Computes (v ^ mask) - mask in every 16-bit lane. With mask == 0xFFFF the
// lane is negated (two's complement), with mask == 0 it is left untouched.
// The same operation both strips a sign (mask taken from the value itself)
// and re-applies it afterwards.
static __m128i quantize_fp_negate_masked(__m128i v, __m128i mask) {
  return _mm_sub_epi16(_mm_xor_si128(v, mask), mask);
}

// For 16 consecutive positions, returns per lane the larger of
// (iscan[i] + 1) and (iscan[i + 8] + 1), where a term counts only if the
// matching lane of dq0 / dq1 is nonzero (it contributes 0 otherwise).
// `iscan` is read with aligned 128-bit loads.
static __m128i quantize_fp_scan_eob(__m128i dq0, __m128i dq1,
                                    const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  // cmpeq twice: first yields "is zero", second inverts it to "is nonzero".
  const __m128i nonzero0 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq0, zero), zero);
  const __m128i nonzero1 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq1, zero), zero);
  __m128i iscan0 = _mm_load_si128((const __m128i *)iscan);
  __m128i iscan1 = _mm_load_si128((const __m128i *)iscan + 1);
  // Subtracting the all-ones mask (-1) adds one: index -> count.
  iscan0 = _mm_sub_epi16(iscan0, nonzero0);
  iscan1 = _mm_sub_epi16(iscan1, nonzero1);
  return _mm_max_epi16(_mm_and_si128(iscan0, nonzero0),
                       _mm_and_si128(iscan1, nonzero1));
}

// SSE2 "fp" quantizer.
//
// For every coefficient c the code computes, in 16-bit lanes:
//   q  = sign(c) * ((sat_add(|c|, round) * quant) >> 16)
//   dq = low 16 bits of (q * dequant)
// and stores q to qcoeff_ptr and dq to dqcoeff_ptr. *eob_ptr receives the
// largest (iscan_ptr[i] + 1) over all positions whose dq is nonzero, or 0 if
// there is none.
//
// round_ptr / quant_ptr / dequant_ptr are each read as eight int16 values:
// lanes 0..7 are applied to coefficients 0..7, and lanes 4..7 are then
// replicated and used for every remaining coefficient.
//
// After the first 16 coefficients, each further group of 16 is skipped
// (q and dq written as zero, eob unaffected) when no |c| in the group exceeds
// (dequant >> 1), taken from the replicated dequant lanes.
//
// If skip_block is nonzero, qcoeff/dqcoeff are zero-filled and *eob_ptr = 0.
//
// Requirements visible from the code: n_coeffs is processed in steps of 16
// and the first 16 are always touched, so it must be a positive multiple of
// 16; every pointer that is dereferenced is accessed with aligned 128-bit
// loads/stores. zbin_ptr, quant_shift_ptr and scan_ptr are unused.
void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan_ptr,
                          const int16_t *iscan_ptr) {
  const __m128i zero = _mm_setzero_si128();
  __m128i rnd, quant, dequant, thr, eob;
  __m128i coeff0, coeff1, sign0, sign1, q0, q1;
  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;

  // Point every array at its end and walk a negative index up towards zero.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;

  if (skip_block) {
    do {
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
    return;
  }

  // Setup global values
  rnd = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);

  // Do DC and first 15 AC
  coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
  coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);

  // Poor man's sign extract: arithmetic shift gives 0xFFFF for negatives.
  sign0 = _mm_srai_epi16(coeff0, 15);
  sign1 = _mm_srai_epi16(coeff1, 15);
  q0 = quantize_fp_negate_masked(coeff0, sign0);
  q1 = quantize_fp_negate_masked(coeff1, sign1);

  // The first vector uses the tables as loaded; afterwards the upper four
  // lanes are copied over the lower four and used for everything else.
  q0 = _mm_adds_epi16(q0, rnd);
  rnd = _mm_unpackhi_epi64(rnd, rnd);
  q1 = _mm_adds_epi16(q1, rnd);
  q0 = _mm_mulhi_epi16(q0, quant);
  quant = _mm_unpackhi_epi64(quant, quant);
  q1 = _mm_mulhi_epi16(q1, quant);

  // Reinsert signs
  q0 = quantize_fp_negate_masked(q0, sign0);
  q1 = quantize_fp_negate_masked(q1, sign1);

  _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), q0);
  _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, q1);

  coeff0 = _mm_mullo_epi16(q0, dequant);
  dequant = _mm_unpackhi_epi64(dequant, dequant);
  coeff1 = _mm_mullo_epi16(q1, dequant);

  _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
  _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

  eob = quantize_fp_scan_eob(coeff0, coeff1, iscan_ptr + n_coeffs);
  n_coeffs += 8 * 2;

  thr = _mm_srai_epi16(dequant, 1);

  // AC only loop
  while (n_coeffs < 0) {
    // _mm_movemask_epi8 returns an int with up to 16 significant bits, so
    // keep the flag in an int rather than narrowing it.
    int nzflag;

    coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
    coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);

    sign0 = _mm_srai_epi16(coeff0, 15);
    sign1 = _mm_srai_epi16(coeff1, 15);
    q0 = quantize_fp_negate_masked(coeff0, sign0);
    q1 = quantize_fp_negate_masked(coeff1, sign1);

    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(q0, thr)) |
             _mm_movemask_epi8(_mm_cmpgt_epi16(q1, thr));

    if (nzflag) {
      q0 = _mm_adds_epi16(q0, rnd);
      q1 = _mm_adds_epi16(q1, rnd);
      q0 = _mm_mulhi_epi16(q0, quant);
      q1 = _mm_mulhi_epi16(q1, quant);

      // Reinsert signs
      q0 = quantize_fp_negate_masked(q0, sign0);
      q1 = quantize_fp_negate_masked(q1, sign1);

      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), q0);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, q1);

      coeff0 = _mm_mullo_epi16(q0, dequant);
      coeff1 = _mm_mullo_epi16(q1, dequant);

      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

      eob = _mm_max_epi16(
          eob, quantize_fp_scan_eob(coeff0, coeff1, iscan_ptr + n_coeffs));
    } else {
      // Nothing in this group exceeds the threshold: emit zeros and leave
      // eob alone.
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);

      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
    }
    n_coeffs += 8 * 2;
  }

  // Accumulate EOB: horizontal max across the eight lanes.
  {
    __m128i eob_shuffled;
    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);
    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    eob = _mm_max_epi16(eob, eob_shuffled);
    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    eob = _mm_max_epi16(eob, eob_shuffled);
    *eob_ptr = _mm_extract_epi16(eob, 1);
  }
}
|