Line data Source code
1 : /*
2 : * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
12 : #define VPX_DSP_X86_FWD_TXFM_SSE2_H_
13 :
14 : #ifdef __cplusplus
15 : extern "C" {
16 : #endif
17 :
// Builds a __m128i whose four 32-bit lanes, from lowest to highest, hold
// (int)lo, (int)hi, (int)lo, (int)hi. Each argument is expanded twice, so
// arguments with side effects should be avoided.
#define pair_set_epi32(lo, hi) \
  _mm_set_epi32((int)(hi), (int)(lo), (int)(hi), (int)(lo))
20 :
// Multiplies corresponding 32-bit lanes of |a| and |b| as unsigned values and
// adds the products pairwise: 64-bit result lane 0 is a[0]*b[0] + a[1]*b[1],
// result lane 1 is a[2]*b[2] + a[3]*b[3]. Because _mm_mul_epu32 is an
// unsigned multiply, lanes are interpreted as unsigned 32-bit integers.
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  // Products of the even-indexed 32-bit lanes (0 and 2).
  const __m128i even_products = _mm_mul_epu32(a, b);
  // Move the odd-indexed lanes (1 and 3) into the even positions, then
  // multiply those as well.
  const __m128i a_odd = _mm_srli_epi64(a, 32);
  const __m128i b_odd = _mm_srli_epi64(b, 32);
  const __m128i odd_products = _mm_mul_epu32(a_odd, b_odd);
  return _mm_add_epi64(even_products, odd_products);
}
29 :
// Packs the low 32 bits of each 64-bit lane of |a| and |b| into one register.
// Result lanes, lowest first: a[0], a[2], b[0], b[2] (32-bit lane indices).
// The upper halves of the 64-bit lanes are discarded; no saturation is
// applied.
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  // After each shuffle the low 64 bits hold 32-bit lanes 0 and 2 of the
  // source; the upper 64 bits are ignored by the unpack below.
  const __m128i a_low_dwords = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i b_low_dwords = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(a_low_dwords, b_low_dwords);
}
35 :
// Reports whether any 16-bit lane of *preg0 or *preg1 holds one of the two
// extreme int16 values (0x7fff or -0x8000), which this header treats as a
// sign of overflow/saturation.
//
// Returns the _mm_movemask_epi8 byte mask of the offending lanes (two bits per
// 16-bit lane, merged across both registers); zero means no lane matched.
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  // Written as -0x8000 so the constant is in range for the 16-bit parameter;
  // the literal 0x8000 is int 32768, which does not fit in a short.
  const __m128i min_overflow = _mm_set1_epi16(-0x8000);
  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                              _mm_cmpeq_epi16(*preg0, min_overflow));
  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
                              _mm_cmpeq_epi16(*preg1, min_overflow));
  cmp0 = _mm_or_si128(cmp0, cmp1);
  return _mm_movemask_epi8(cmp0);
}
47 :
// Reports whether any 16-bit lane of the four given registers holds one of
// the two extreme int16 values (0x7fff or -0x8000), which this header treats
// as a sign of overflow/saturation.
//
// Returns the _mm_movemask_epi8 byte mask of the offending lanes (two bits per
// 16-bit lane, merged across all four registers); zero means no lane matched.
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  // Written as -0x8000 so the constant is in range for the 16-bit parameter;
  // the literal 0x8000 is int 32768, which does not fit in a short.
  const __m128i min_overflow = _mm_set1_epi16(-0x8000);
  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                              _mm_cmpeq_epi16(*preg0, min_overflow));
  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
                              _mm_cmpeq_epi16(*preg1, min_overflow));
  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
                              _mm_cmpeq_epi16(*preg2, min_overflow));
  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
                              _mm_cmpeq_epi16(*preg3, min_overflow));
  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
  return _mm_movemask_epi8(cmp0);
}
65 :
66 : static INLINE int check_epi16_overflow_x8(
67 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
68 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
69 : const __m128i *preg6, const __m128i *preg7) {
70 : int res0, res1;
71 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
72 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
73 : return res0 + res1;
74 : }
75 :
76 : static INLINE int check_epi16_overflow_x12(
77 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
78 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
79 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
80 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
81 : int res0, res1;
82 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
83 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
84 : if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
85 : return res0 + res1;
86 : }
87 :
88 : static INLINE int check_epi16_overflow_x16(
89 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
90 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
91 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
92 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
93 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
94 : const __m128i *preg15) {
95 : int res0, res1;
96 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
97 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
98 : if (!res0) {
99 : res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
100 : if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
101 : }
102 : return res0 + res1;
103 : }
104 :
105 : static INLINE int check_epi16_overflow_x32(
106 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
107 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
108 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
109 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
110 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
111 : const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
112 : const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
113 : const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
114 : const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
115 : const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
116 : const __m128i *preg30, const __m128i *preg31) {
117 : int res0, res1;
118 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
119 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
120 : if (!res0) {
121 : res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
122 : if (!res1) {
123 : res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
124 : if (!res0) {
125 : res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
126 : if (!res1) {
127 : res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
128 : if (!res0) {
129 : res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
130 : if (!res1)
131 : res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
132 : }
133 : }
134 : }
135 : }
136 : }
137 : return res0 + res1;
138 : }
139 :
// Checks whether the 64-bit lanes of four registers (two lanes each) can be
// narrowed to 32 bits. Each lane is shifted left by one and the upper 32 bits
// of the result (bits 62..31 of the original value) are inspected: they must
// be all zeros (compared against *zero) or all ones.
//
// Returns zero when every lane passes; otherwise the sum of two
// _mm_movemask_epi8 masks (registers 0-1 and registers 2-3) flagging the
// failing lanes.
//
// NOTE(review): *zero is presumably an all-zero register supplied by the
// caller -- confirm at call sites. Bit 63 of each lane is shifted out and is
// therefore not examined by this check.
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *zero) {
  const __m128i all_ones = _mm_set1_epi32(-1);
  // For each register, gather the upper dword of both shifted 64-bit lanes
  // into the low 64 bits.
  const __m128i hi0 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg0, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi1 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg1, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi2 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg2, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi3 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg3, 1), _MM_SHUFFLE(0, 0, 3, 1));
  // One dword per original 64-bit lane: [r0.l0, r0.l1, r1.l0, r1.l1] etc.
  const __m128i hi_01 = _mm_unpacklo_epi64(hi0, hi1);
  const __m128i hi_23 = _mm_unpacklo_epi64(hi2, hi3);
  const __m128i fits_positive_01 = _mm_cmpeq_epi32(hi_01, *zero);
  const __m128i fits_negative_01 = _mm_cmpeq_epi32(hi_01, all_ones);
  const __m128i fits_positive_23 = _mm_cmpeq_epi32(hi_23, *zero);
  const __m128i fits_negative_23 = _mm_cmpeq_epi32(hi_23, all_ones);
  // The two "fits" masks can only be equal when both are zero, i.e. the lane
  // matched neither pattern.
  const int bad_01 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(fits_positive_01, fits_negative_01));
  const int bad_23 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(fits_positive_23, fits_negative_23));
  return bad_01 + bad_23;
}
171 :
172 : static INLINE int k_check_epi32_overflow_8(
173 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
174 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
175 : const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
176 : int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
177 : if (!overflow) {
178 : overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
179 : }
180 : return overflow;
181 : }
182 :
183 : static INLINE int k_check_epi32_overflow_16(
184 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
185 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
186 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
187 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
188 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
189 : const __m128i *preg15, const __m128i *zero) {
190 : int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
191 : if (!overflow) {
192 : overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
193 : if (!overflow) {
194 : overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
195 : if (!overflow) {
196 : overflow =
197 : k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
198 : }
199 : }
200 : }
201 : return overflow;
202 : }
203 :
204 : static INLINE int k_check_epi32_overflow_32(
205 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
206 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
207 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
208 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
209 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
210 : const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
211 : const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
212 : const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
213 : const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
214 : const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
215 : const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
216 : int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
217 : if (!overflow) {
218 : overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
219 : if (!overflow) {
220 : overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
221 : if (!overflow) {
222 : overflow =
223 : k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
224 : if (!overflow) {
225 : overflow =
226 : k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
227 : if (!overflow) {
228 : overflow =
229 : k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
230 : if (!overflow) {
231 : overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
232 : preg27, zero);
233 : if (!overflow) {
234 : overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
235 : preg31, zero);
236 : }
237 : }
238 : }
239 : }
240 : }
241 : }
242 : }
243 : return overflow;
244 : }
245 :
246 0 : static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
247 : #if CONFIG_VP9_HIGHBITDEPTH
248 : const __m128i zero = _mm_setzero_si128();
249 : const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
250 : __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
251 : __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
252 : _mm_store_si128((__m128i *)(dst_ptr), out0);
253 : _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
254 : #else
255 0 : _mm_store_si128((__m128i *)(dst_ptr), *poutput);
256 : #endif // CONFIG_VP9_HIGHBITDEPTH
257 0 : }
258 :
259 0 : static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
260 : #if CONFIG_VP9_HIGHBITDEPTH
261 : const __m128i zero = _mm_setzero_si128();
262 : const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
263 : __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
264 : __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
265 : _mm_storeu_si128((__m128i *)(dst_ptr), out0);
266 : _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
267 : #else
268 0 : _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
269 : #endif // CONFIG_VP9_HIGHBITDEPTH
270 0 : }
271 :
// For each of *pin0 and *pin1: multiplies the 16-bit lanes by *pmultiplier
// and sums adjacent products into 32-bit lanes (_mm_madd_epi16), adds
// *prounding, and arithmetic-shifts right by |shift|. The two 32-bit results
// are then packed with signed saturation into one register of eight 16-bit
// lanes, with the *pin0 results in the low half and *pin1 in the high half.
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
                                       const __m128i *pmultiplier,
                                       const __m128i *prounding,
                                       const int shift) {
  const __m128i rounded_lo =
      _mm_add_epi32(_mm_madd_epi16(*pin0, *pmultiplier), *prounding);
  const __m128i rounded_hi =
      _mm_add_epi32(_mm_madd_epi16(*pin1, *pmultiplier), *prounding);
  return _mm_packs_epi32(_mm_srai_epi32(rounded_lo, shift),
                         _mm_srai_epi32(rounded_hi, shift));
}
284 :
285 0 : static INLINE void transpose_and_output8x8(
286 : const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
287 : const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
288 : const __m128i *pin06, const __m128i *pin07, const int pass,
289 : int16_t *out0_ptr, tran_low_t *out1_ptr) {
290 : // 00 01 02 03 04 05 06 07
291 : // 10 11 12 13 14 15 16 17
292 : // 20 21 22 23 24 25 26 27
293 : // 30 31 32 33 34 35 36 37
294 : // 40 41 42 43 44 45 46 47
295 : // 50 51 52 53 54 55 56 57
296 : // 60 61 62 63 64 65 66 67
297 : // 70 71 72 73 74 75 76 77
298 0 : const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
299 0 : const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
300 0 : const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
301 0 : const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
302 0 : const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
303 0 : const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
304 0 : const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
305 0 : const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
306 : // 00 10 01 11 02 12 03 13
307 : // 20 30 21 31 22 32 23 33
308 : // 04 14 05 15 06 16 07 17
309 : // 24 34 25 35 26 36 27 37
310 : // 40 50 41 51 42 52 43 53
311 : // 60 70 61 71 62 72 63 73
312 : // 54 54 55 55 56 56 57 57
313 : // 64 74 65 75 66 76 67 77
314 0 : const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
315 0 : const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
316 0 : const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
317 0 : const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
318 0 : const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
319 0 : const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
320 0 : const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
321 0 : const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
322 : // 00 10 20 30 01 11 21 31
323 : // 40 50 60 70 41 51 61 71
324 : // 02 12 22 32 03 13 23 33
325 : // 42 52 62 72 43 53 63 73
326 : // 04 14 24 34 05 15 21 36
327 : // 44 54 64 74 45 55 61 76
328 : // 06 16 26 36 07 17 27 37
329 : // 46 56 66 76 47 57 67 77
330 0 : const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
331 0 : const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
332 0 : const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
333 0 : const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
334 0 : const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
335 0 : const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
336 0 : const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
337 0 : const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
338 : // 00 10 20 30 40 50 60 70
339 : // 01 11 21 31 41 51 61 71
340 : // 02 12 22 32 42 52 62 72
341 : // 03 13 23 33 43 53 63 73
342 : // 04 14 24 34 44 54 64 74
343 : // 05 15 25 35 45 55 65 75
344 : // 06 16 26 36 46 56 66 76
345 : // 07 17 27 37 47 57 67 77
346 0 : if (pass == 0) {
347 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
348 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
349 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
350 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
351 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
352 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
353 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
354 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
355 : } else {
356 0 : storeu_output(&tr2_0, (out1_ptr + 0 * 16));
357 0 : storeu_output(&tr2_1, (out1_ptr + 1 * 16));
358 0 : storeu_output(&tr2_2, (out1_ptr + 2 * 16));
359 0 : storeu_output(&tr2_3, (out1_ptr + 3 * 16));
360 0 : storeu_output(&tr2_4, (out1_ptr + 4 * 16));
361 0 : storeu_output(&tr2_5, (out1_ptr + 5 * 16));
362 0 : storeu_output(&tr2_6, (out1_ptr + 6 * 16));
363 0 : storeu_output(&tr2_7, (out1_ptr + 7 * 16));
364 : }
365 0 : }
366 :
367 : #ifdef __cplusplus
368 : } // extern "C"
369 : #endif
370 :
371 : #endif // VPX_DSP_X86_FWD_TXFM_SSE2_H_
|