Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
13 : #define AOM_DSP_X86_FWD_TXFM_SSE2_H_
14 :
15 : #include "aom_dsp/x86/txfm_common_intrin.h"
16 :
17 : #ifdef __cplusplus
18 : extern "C" {
19 : #endif
20 :
// Builds a __m128i whose four 32-bit lanes, from lowest to highest, hold
// a, b, a, b. Each argument is cast to int and appears twice in the
// expansion, so do not pass expressions with side effects.
#define pair_set_epi32(a, b) \
  _mm_setr_epi32((int)(a), (int)(b), (int)(a), (int)(b))
23 :
// For each 64-bit half of the inputs, multiplies the two low 32-bit lanes
// and the two high 32-bit lanes and returns the sum of both products as a
// 64-bit lane. The products come from _mm_mul_epu32, i.e. the 32-bit lanes
// are multiplied as unsigned values.
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  // Lanes 0 and 2 of each operand.
  const __m128i even_products = _mm_mul_epu32(a, b);
  // Move lanes 1 and 3 down into the positions _mm_mul_epu32 reads.
  const __m128i odd_products =
      _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
  return _mm_add_epi64(even_products, odd_products);
}
32 :
// Narrows four 64-bit lanes to four 32-bit lanes by keeping only the low
// dword of each: the result is { a.lo64, a.hi64, b.lo64, b.hi64 } truncated
// to 32 bits. No saturation is applied.
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  // Gather dwords 0 and 2 (the low halves of both 64-bit lanes) into the
  // bottom 64 bits of each register; the upper 64 bits are don't-care.
  const __m128i a_low_dwords = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i b_low_dwords = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(a_low_dwords, b_low_dwords);
}
38 :
// Reports whether any 16-bit lane of *preg0 or *preg1 equals 0x7fff or
// 0x8000, the two extreme int16 values.
//
// Returns the _mm_movemask_epi8 byte mask of the matching lanes (both
// registers OR-ed together): zero when no lane matched, nonzero otherwise.
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  // Spelled as a negative literal so the constant is in range for the
  // 16-bit parameter; the bit pattern is 0x8000.
  const __m128i min_overflow = _mm_set1_epi16(-0x8000);
  const __m128i hit_max = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                                       _mm_cmpeq_epi16(*preg1, max_overflow));
  const __m128i hit_min = _mm_or_si128(_mm_cmpeq_epi16(*preg0, min_overflow),
                                       _mm_cmpeq_epi16(*preg1, min_overflow));
  return _mm_movemask_epi8(_mm_or_si128(hit_max, hit_min));
}
50 :
// Reports whether any 16-bit lane of the four registers equals 0x7fff or
// 0x8000, the two extreme int16 values.
//
// Returns the _mm_movemask_epi8 byte mask of the matching lanes (all four
// registers OR-ed together): zero when no lane matched, nonzero otherwise.
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  // Spelled as a negative literal so the constant is in range for the
  // 16-bit parameter; the bit pattern is 0x8000.
  const __m128i min_overflow = _mm_set1_epi16(-0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                                    _mm_cmpeq_epi16(*preg0, min_overflow));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
                                    _mm_cmpeq_epi16(*preg1, min_overflow));
  const __m128i hit2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
                                    _mm_cmpeq_epi16(*preg2, min_overflow));
  const __m128i hit3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
                                    _mm_cmpeq_epi16(*preg3, min_overflow));
  const __m128i any_hit =
      _mm_or_si128(_mm_or_si128(hit0, hit1), _mm_or_si128(hit2, hit3));
  return _mm_movemask_epi8(any_hit);
}
68 :
69 0 : static INLINE int check_epi16_overflow_x8(
70 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
71 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
72 : const __m128i *preg6, const __m128i *preg7) {
73 : int res0, res1;
74 0 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
75 0 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
76 0 : return res0 + res1;
77 : }
78 :
79 : static INLINE int check_epi16_overflow_x12(
80 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
81 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
82 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
83 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
84 : int res0, res1;
85 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
86 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
87 : if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
88 : return res0 + res1;
89 : }
90 :
91 0 : static INLINE int check_epi16_overflow_x16(
92 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
93 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
94 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
95 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
96 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
97 : const __m128i *preg15) {
98 : int res0, res1;
99 0 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
100 0 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
101 0 : if (!res0) {
102 0 : res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
103 0 : if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
104 : }
105 0 : return res0 + res1;
106 : }
107 :
108 0 : static INLINE int check_epi16_overflow_x32(
109 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
110 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
111 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
112 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
113 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
114 : const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
115 : const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
116 : const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
117 : const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
118 : const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
119 : const __m128i *preg30, const __m128i *preg31) {
120 : int res0, res1;
121 0 : res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
122 0 : res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
123 0 : if (!res0) {
124 0 : res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
125 0 : if (!res1) {
126 0 : res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
127 0 : if (!res0) {
128 0 : res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
129 0 : if (!res1) {
130 0 : res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
131 0 : if (!res0) {
132 0 : res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
133 0 : if (!res1)
134 0 : res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
135 : }
136 : }
137 : }
138 : }
139 : }
140 0 : return res0 + res1;
141 : }
142 :
// Checks the eight 64-bit lanes held in four registers for values that do
// not look like sign-extended 32-bit integers.
//
// Each lane is shifted left by one so that its original bits 31..62 end up
// in the upper dword. A lane is accepted when that dword is all zeros
// (non-negative value) or all ones (negative value); bit 63 is shifted out
// and does not take part in the test.
//
// *zero is compared against for the all-zeros case, so it is expected to
// hold an all-zero register.
//
// Returns the sum of two _mm_movemask_epi8 masks (registers 0-1 and 2-3):
// zero when every lane was accepted, nonzero otherwise.
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *zero) {
  const __m128i all_ones = _mm_set1_epi32(-1);
  // Upper dwords of the shifted lanes, gathered into the low 64 bits of each
  // register (dword 0 <- dword 1, dword 1 <- dword 3).
  const __m128i hi0 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg0, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi1 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg1, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi2 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg2, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi3 =
      _mm_shuffle_epi32(_mm_slli_epi64(*preg3, 1), _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i hi_01 = _mm_unpacklo_epi64(hi0, hi1);
  const __m128i hi_23 = _mm_unpacklo_epi64(hi2, hi3);
  const __m128i positive_ok_01 = _mm_cmpeq_epi32(hi_01, *zero);
  const __m128i negative_ok_01 = _mm_cmpeq_epi32(hi_01, all_ones);
  const __m128i positive_ok_23 = _mm_cmpeq_epi32(hi_23, *zero);
  const __m128i negative_ok_23 = _mm_cmpeq_epi32(hi_23, all_ones);
  // The two "ok" masks can only be equal where both are zero, i.e. where the
  // lane was neither a valid positive nor a valid negative value.
  const int overflow_01 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(positive_ok_01, negative_ok_01));
  const int overflow_23 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(positive_ok_23, negative_ok_23));
  return overflow_01 + overflow_23;
}
174 :
175 0 : static INLINE int k_check_epi32_overflow_8(
176 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
177 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
178 : const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
179 0 : int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
180 0 : if (!overflow) {
181 0 : overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
182 : }
183 0 : return overflow;
184 : }
185 :
186 0 : static INLINE int k_check_epi32_overflow_16(
187 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
188 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
189 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
190 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
191 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
192 : const __m128i *preg15, const __m128i *zero) {
193 0 : int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
194 0 : if (!overflow) {
195 0 : overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
196 0 : if (!overflow) {
197 0 : overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
198 0 : if (!overflow) {
199 0 : overflow =
200 : k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
201 : }
202 : }
203 : }
204 0 : return overflow;
205 : }
206 :
207 0 : static INLINE int k_check_epi32_overflow_32(
208 : const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
209 : const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
210 : const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
211 : const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
212 : const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
213 : const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
214 : const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
215 : const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
216 : const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
217 : const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
218 : const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
219 0 : int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
220 0 : if (!overflow) {
221 0 : overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
222 0 : if (!overflow) {
223 0 : overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
224 0 : if (!overflow) {
225 0 : overflow =
226 : k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
227 0 : if (!overflow) {
228 0 : overflow =
229 : k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
230 0 : if (!overflow) {
231 0 : overflow =
232 : k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
233 0 : if (!overflow) {
234 0 : overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
235 : preg27, zero);
236 0 : if (!overflow) {
237 0 : overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
238 : preg31, zero);
239 : }
240 : }
241 : }
242 : }
243 : }
244 : }
245 : }
246 0 : return overflow;
247 : }
248 :
249 0 : static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
250 : #if CONFIG_HIGHBITDEPTH
251 0 : const __m128i zero = _mm_setzero_si128();
252 0 : const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
253 0 : __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
254 0 : __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
255 : _mm_store_si128((__m128i *)(dst_ptr), out0);
256 0 : _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
257 : #else
258 : _mm_store_si128((__m128i *)(dst_ptr), *poutput);
259 : #endif // CONFIG_HIGHBITDEPTH
260 0 : }
261 :
// Multiply-accumulates adjacent 16-bit pairs of *pin0 and *pin1 with
// *pmultiplier (_mm_madd_epi16), adds *prounding to each 32-bit sum, shifts
// right arithmetically by `shift`, and packs the eight results back to
// 16 bits with signed saturation. The four results from *pin0 occupy the
// low lanes, those from *pin1 the high lanes.
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
                                       const __m128i *pmultiplier,
                                       const __m128i *prounding, int shift) {
  const __m128i sum_lo = _mm_madd_epi16(*pin0, *pmultiplier);
  const __m128i sum_hi = _mm_madd_epi16(*pin1, *pmultiplier);
  const __m128i scaled_lo =
      _mm_srai_epi32(_mm_add_epi32(sum_lo, *prounding), shift);
  const __m128i scaled_hi =
      _mm_srai_epi32(_mm_add_epi32(sum_hi, *prounding), shift);
  return _mm_packs_epi32(scaled_lo, scaled_hi);
}
273 :
274 0 : static INLINE void transpose_and_output8x8(
275 : const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
276 : const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
277 : const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr,
278 : tran_low_t *out1_ptr) {
279 : // 00 01 02 03 04 05 06 07
280 : // 10 11 12 13 14 15 16 17
281 : // 20 21 22 23 24 25 26 27
282 : // 30 31 32 33 34 35 36 37
283 : // 40 41 42 43 44 45 46 47
284 : // 50 51 52 53 54 55 56 57
285 : // 60 61 62 63 64 65 66 67
286 : // 70 71 72 73 74 75 76 77
287 0 : const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
288 0 : const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
289 0 : const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
290 0 : const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
291 0 : const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
292 0 : const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
293 0 : const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
294 0 : const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
295 : // 00 10 01 11 02 12 03 13
296 : // 20 30 21 31 22 32 23 33
297 : // 04 14 05 15 06 16 07 17
298 : // 24 34 25 35 26 36 27 37
299 : // 40 50 41 51 42 52 43 53
300 : // 60 70 61 71 62 72 63 73
301 : // 54 54 55 55 56 56 57 57
302 : // 64 74 65 75 66 76 67 77
303 0 : const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
304 0 : const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
305 0 : const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
306 0 : const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
307 0 : const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
308 0 : const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
309 0 : const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
310 0 : const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
311 : // 00 10 20 30 01 11 21 31
312 : // 40 50 60 70 41 51 61 71
313 : // 02 12 22 32 03 13 23 33
314 : // 42 52 62 72 43 53 63 73
315 : // 04 14 24 34 05 15 21 36
316 : // 44 54 64 74 45 55 61 76
317 : // 06 16 26 36 07 17 27 37
318 : // 46 56 66 76 47 57 67 77
319 0 : const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
320 0 : const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
321 0 : const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
322 0 : const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
323 0 : const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
324 0 : const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
325 0 : const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
326 0 : const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
327 : // 00 10 20 30 40 50 60 70
328 : // 01 11 21 31 41 51 61 71
329 : // 02 12 22 32 42 52 62 72
330 : // 03 13 23 33 43 53 63 73
331 : // 04 14 24 34 44 54 64 74
332 : // 05 15 25 35 45 55 65 75
333 : // 06 16 26 36 46 56 66 76
334 : // 07 17 27 37 47 57 67 77
335 0 : if (pass == 0) {
336 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
337 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
338 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
339 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
340 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
341 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
342 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
343 0 : _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
344 : } else {
345 0 : storeu_output(&tr2_0, (out1_ptr + 0 * 16));
346 0 : storeu_output(&tr2_1, (out1_ptr + 1 * 16));
347 0 : storeu_output(&tr2_2, (out1_ptr + 2 * 16));
348 0 : storeu_output(&tr2_3, (out1_ptr + 3 * 16));
349 0 : storeu_output(&tr2_4, (out1_ptr + 4 * 16));
350 0 : storeu_output(&tr2_5, (out1_ptr + 5 * 16));
351 0 : storeu_output(&tr2_6, (out1_ptr + 6 * 16));
352 0 : storeu_output(&tr2_7, (out1_ptr + 7 * 16));
353 : }
354 0 : }
355 :
356 : void fdct32_8col(__m128i *in0, __m128i *in1);
357 :
358 : #ifdef __cplusplus
359 : } // extern "C"
360 : #endif
361 :
362 : #endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_
|