Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include "./av1_rtcd.h"
13 : #include "aom_dsp/x86/inv_txfm_sse2.h"
14 : #include "aom_dsp/x86/synonyms.h"
15 : #include "aom_dsp/x86/txfm_common_sse2.h"
16 : #include "aom_ports/mem.h"
17 : #include "av1/common/enums.h"
18 :
19 : #if CONFIG_EXT_TX
20 0 : static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) {
21 0 : in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
22 0 : in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
23 0 : in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
24 0 : in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
25 0 : }
26 :
27 0 : static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) {
28 0 : in[0] = mm_reverse_epi16(in[0]);
29 0 : in[1] = mm_reverse_epi16(in[1]);
30 0 : in[2] = mm_reverse_epi16(in[2]);
31 0 : in[3] = mm_reverse_epi16(in[3]);
32 :
33 0 : in[4] = mm_reverse_epi16(in[4]);
34 0 : in[5] = mm_reverse_epi16(in[5]);
35 0 : in[6] = mm_reverse_epi16(in[6]);
36 0 : in[7] = mm_reverse_epi16(in[7]);
37 0 : }
38 :
39 0 : static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
40 0 : fliplr_8x8(&in[0]);
41 0 : fliplr_8x8(&in[8]);
42 0 : }
43 :
// Mirror a full 16x16 block left<->right: flip each 16x8 half in place,
// then swap the two half pointers so columns 0-7 exchange with 8-15.
#define FLIPLR_16x16(in0, in1) \
  do {                         \
    __m128i *tmp;              \
    fliplr_16x8(in0);          \
    fliplr_16x8(in1);          \
    tmp = (in0);               \
    (in0) = (in1);             \
    (in1) = tmp;               \
  } while (0)
53 :
// Apply an up<->down flip by retargeting dest at the last row and negating
// the stride, so subsequent row writes proceed bottom-up.
#define FLIPUD_PTR(dest, stride, size)       \
  do {                                       \
    (dest) = (dest) + ((size)-1) * (stride); \
    (stride) = -(stride);                    \
  } while (0)
59 : #endif
60 :
// 4x4 inverse hybrid transform + reconstruction. Applies the inverse
// transform pair selected by tx_type to the 16 input coefficients,
// round-shifts by 4, then adds the residual to dest and stores the
// saturated result.
//   input   - 16 coefficients
//   dest    - 4x4 destination block (read-modify-write)
//   stride  - destination stride in bytes
//   tx_type - DCT/ADST/FLIPADST combination selecting the 1-D pair
void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];  // two registers hold all 4x4 = 16 16-bit values
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);  // rounding bias for >> 4

  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8);

  // Two 1-D passes per case. NOTE(review): the SSE2 4-point helpers appear
  // to transpose internally, so the first call is the row pass and the
  // second the column pass -- confirm against aom_dsp/x86/inv_txfm_sse2.
  // FLIPADST in the vertical direction is realized by FLIPUD_PTR (write
  // rows bottom-up); in the horizontal direction by fliplr_4x4.
  switch (tx_type) {
    case DCT_DCT:
      aom_idct4_sse2(in);
      aom_idct4_sse2(in);
      break;
    case ADST_DCT:
      aom_idct4_sse2(in);
      aom_iadst4_sse2(in);
      break;
    case DCT_ADST:
      aom_iadst4_sse2(in);
      aom_idct4_sse2(in);
      break;
    case ADST_ADST:
      aom_iadst4_sse2(in);
      aom_iadst4_sse2(in);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      aom_idct4_sse2(in);
      aom_iadst4_sse2(in);
      FLIPUD_PTR(dest, stride, 4);
      break;
    case DCT_FLIPADST:
      aom_iadst4_sse2(in);
      aom_idct4_sse2(in);
      fliplr_4x4(in);
      break;
    case FLIPADST_FLIPADST:
      aom_iadst4_sse2(in);
      aom_iadst4_sse2(in);
      FLIPUD_PTR(dest, stride, 4);
      fliplr_4x4(in);
      break;
    case ADST_FLIPADST:
      aom_iadst4_sse2(in);
      aom_iadst4_sse2(in);
      fliplr_4x4(in);
      break;
    case FLIPADST_ADST:
      aom_iadst4_sse2(in);
      aom_iadst4_sse2(in);
      FLIPUD_PTR(dest, stride, 4);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
  }

  // Final round and shift: (x + 8) >> 4.
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store: widen each 4-pixel dest row to 16-bit, add
  // the residual, repack with unsigned saturation and store row by row.
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
    d0 = _mm_unpacklo_epi32(d0, d1);
    d2 = _mm_unpacklo_epi32(d2, d3);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d0 = _mm_packus_epi16(d0, d2);
    // store result[0]
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store result[1]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store result[2]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    // store result[3]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}
151 :
// 8x8 inverse hybrid transform + reconstruction. Applies the tx_type pair
// to the 64 input coefficients, round-shifts by 5, and adds the residual
// into dest via RECON_AND_STORE (one register per row).
void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];  // one register per 8-wide row
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);  // bias for >> 5

  // load input data
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 1);
  in[2] = load_input_data(input + 8 * 2);
  in[3] = load_input_data(input + 8 * 3);
  in[4] = load_input_data(input + 8 * 4);
  in[5] = load_input_data(input + 8 * 5);
  in[6] = load_input_data(input + 8 * 6);
  in[7] = load_input_data(input + 8 * 7);

  // Two 1-D 8-point passes; vertical flips are realized by FLIPUD_PTR
  // (bottom-up writes), horizontal flips by fliplr_8x8.
  switch (tx_type) {
    case DCT_DCT:
      aom_idct8_sse2(in);
      aom_idct8_sse2(in);
      break;
    case ADST_DCT:
      aom_idct8_sse2(in);
      aom_iadst8_sse2(in);
      break;
    case DCT_ADST:
      aom_iadst8_sse2(in);
      aom_idct8_sse2(in);
      break;
    case ADST_ADST:
      aom_iadst8_sse2(in);
      aom_iadst8_sse2(in);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      aom_idct8_sse2(in);
      aom_iadst8_sse2(in);
      FLIPUD_PTR(dest, stride, 8);
      break;
    case DCT_FLIPADST:
      aom_iadst8_sse2(in);
      aom_idct8_sse2(in);
      fliplr_8x8(in);
      break;
    case FLIPADST_FLIPADST:
      aom_iadst8_sse2(in);
      aom_iadst8_sse2(in);
      FLIPUD_PTR(dest, stride, 8);
      fliplr_8x8(in);
      break;
    case ADST_FLIPADST:
      aom_iadst8_sse2(in);
      aom_iadst8_sse2(in);
      fliplr_8x8(in);
      break;
    case FLIPADST_ADST:
      aom_iadst8_sse2(in);
      aom_iadst8_sse2(in);
      FLIPUD_PTR(dest, stride, 8);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
  }

  // Final rounding and shift: saturating (x + 16) >> 5.
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  // Add residual to dest pixels and store with unsigned saturation.
  RECON_AND_STORE(dest + 0 * stride, in[0]);
  RECON_AND_STORE(dest + 1 * stride, in[1]);
  RECON_AND_STORE(dest + 2 * stride, in[2]);
  RECON_AND_STORE(dest + 3 * stride, in[3]);
  RECON_AND_STORE(dest + 4 * stride, in[4]);
  RECON_AND_STORE(dest + 5 * stride, in[5]);
  RECON_AND_STORE(dest + 6 * stride, in[6]);
  RECON_AND_STORE(dest + 7 * stride, in[7]);
}
244 :
245 : #if CONFIG_EXT_TX
// 16-point inverse identity pass over a 16x16 block held as two 8x16
// halves. Transposes first so the identity scaling applies along the
// same axis convention as the other 16-point passes.
static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idtx16_8col(in0);
  idtx16_8col(in1);
}
251 : #endif // CONFIG_EXT_TX
252 :
// 16x16 inverse hybrid transform + reconstruction. The block is held as
// two 8x16 halves (in0 = left 8 columns, in1 = right 8 columns). Each
// case runs two 1-D 16-point passes, then the result is written back as
// two 8x16 stores.
void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
  __m128i in[32];
  __m128i *in0 = &in[0];   // left 8x16 half
  __m128i *in1 = &in[16];  // right 8x16 half

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  // Vertical flips are realized by FLIPUD_PTR (bottom-up writes);
  // horizontal flips by FLIPLR_16x16 (mirror + half swap).
  switch (tx_type) {
    case DCT_DCT:
      aom_idct16_sse2(in0, in1);
      aom_idct16_sse2(in0, in1);
      break;
    case ADST_DCT:
      aom_idct16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      break;
    case DCT_ADST:
      aom_iadst16_sse2(in0, in1);
      aom_idct16_sse2(in0, in1);
      break;
    case ADST_ADST:
      aom_iadst16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      aom_idct16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      FLIPUD_PTR(dest, stride, 16);
      break;
    case DCT_FLIPADST:
      aom_iadst16_sse2(in0, in1);
      aom_idct16_sse2(in0, in1);
      FLIPLR_16x16(in0, in1);
      break;
    case FLIPADST_FLIPADST:
      aom_iadst16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      FLIPUD_PTR(dest, stride, 16);
      FLIPLR_16x16(in0, in1);
      break;
    case ADST_FLIPADST:
      aom_iadst16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      FLIPLR_16x16(in0, in1);
      break;
    case FLIPADST_ADST:
      aom_iadst16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      FLIPUD_PTR(dest, stride, 16);
      break;
    case IDTX:
      iidtx16_sse2(in0, in1);
      iidtx16_sse2(in0, in1);
      break;
    case V_DCT:
      // Identity row pass, DCT column pass.
      iidtx16_sse2(in0, in1);
      aom_idct16_sse2(in0, in1);
      break;
    case H_DCT:
      aom_idct16_sse2(in0, in1);
      iidtx16_sse2(in0, in1);
      break;
    case V_ADST:
      iidtx16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      break;
    case H_ADST:
      aom_iadst16_sse2(in0, in1);
      iidtx16_sse2(in0, in1);
      break;
    case V_FLIPADST:
      iidtx16_sse2(in0, in1);
      aom_iadst16_sse2(in0, in1);
      FLIPUD_PTR(dest, stride, 16);
      break;
    case H_FLIPADST:
      aom_iadst16_sse2(in0, in1);
      iidtx16_sse2(in0, in1);
      FLIPLR_16x16(in0, in1);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
  }

  // Write back both halves (rounding/clamping handled by the helper).
  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}
345 :
346 : #if CONFIG_EXT_TX
// 8-point inverse identity transform: scale all 8 rows by 2 (left shift).
static void iidtx8_sse2(__m128i *in) {
  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_slli_epi16(in[i], 1);
  }
}
357 :
// 4-point inverse identity transform on two packed registers: multiply
// every 16-bit value by Sqrt2 with DCT_CONST_BITS rounding. The 16x16
// multiply is widened to 32 bits via the mullo/mulhi + unpack idiom,
// rounded, then narrowed back with signed saturation.
static INLINE void iidtx4_sse2(__m128i *in) {
  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);

  // Low and high 16-bit halves of each 32-bit product.
  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);

  // Interleave lo/hi halves into full 32-bit products.
  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);

  // Round-shift by DCT_CONST_BITS and pack back to 16 bits.
  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
}
376 :
377 : // load 8x8 array
378 0 : static INLINE void flip_buffer_lr_8x8(__m128i *in) {
379 0 : in[0] = mm_reverse_epi16(in[0]);
380 0 : in[1] = mm_reverse_epi16(in[1]);
381 0 : in[2] = mm_reverse_epi16(in[2]);
382 0 : in[3] = mm_reverse_epi16(in[3]);
383 0 : in[4] = mm_reverse_epi16(in[4]);
384 0 : in[5] = mm_reverse_epi16(in[5]);
385 0 : in[6] = mm_reverse_epi16(in[6]);
386 0 : in[7] = mm_reverse_epi16(in[7]);
387 0 : }
388 : #endif // CONFIG_EXT_TX
389 :
// 8x16 (8 wide, 16 tall) inverse hybrid transform + reconstruction.
// Structure: length-8 row transform on the top and bottom 8x8 halves,
// sqrt(2) scaling (rectangular-transform normalization), length-16 column
// transform, then a flip-aware write-back.
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride, int tx_type) {
  __m128i in[16];  // 16 rows of 8 coefficients

  in[0] = load_input_data(input + 0 * 8);
  in[1] = load_input_data(input + 1 * 8);
  in[2] = load_input_data(input + 2 * 8);
  in[3] = load_input_data(input + 3 * 8);
  in[4] = load_input_data(input + 4 * 8);
  in[5] = load_input_data(input + 5 * 8);
  in[6] = load_input_data(input + 6 * 8);
  in[7] = load_input_data(input + 7 * 8);

  in[8] = load_input_data(input + 8 * 8);
  in[9] = load_input_data(input + 9 * 8);
  in[10] = load_input_data(input + 10 * 8);
  in[11] = load_input_data(input + 11 * 8);
  in[12] = load_input_data(input + 12 * 8);
  in[13] = load_input_data(input + 13 * 8);
  in[14] = load_input_data(input + 14 * 8);
  in[15] = load_input_data(input + 15 * 8);

  // Row transform (selected by the horizontal member of tx_type). The
  // explicit transposes restore row-major layout after each 8-point pass.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif
      aom_idct8_sse2(in);
      array_transpose_8x8(in, in);
      aom_idct8_sse2(in + 8);
      array_transpose_8x8(in + 8, in + 8);
      break;
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
#endif
      aom_iadst8_sse2(in);
      array_transpose_8x8(in, in);
      aom_iadst8_sse2(in + 8);
      array_transpose_8x8(in + 8, in + 8);
      break;
#if CONFIG_EXT_TX
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
      iidtx8_sse2(in);
      iidtx8_sse2(in + 8);
      break;
#endif
    default: assert(0); break;
  }
  // Rectangular (non-square) transform normalization.
  scale_sqrt2_8x8(in);
  scale_sqrt2_8x8(in + 8);

  // Column transform (selected by the vertical member of tx_type).
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case V_DCT:
#endif
      idct16_8col(in);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST:
#endif
      iadst16_8col(in);
      break;
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX: idtx16_8col(in); break;
#endif
    default: assert(0); break;
  }

  // Write-back: vertical flips write bottom-up (dest + 15 rows, -stride);
  // horizontal flips mirror both 8x8 halves first.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case H_DCT:
#endif
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
#endif
      write_buffer_8x16(dest, in, stride);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      flip_buffer_lr_8x8(in);
      flip_buffer_lr_8x8(in + 8);
      write_buffer_8x16(dest, in, stride);
      break;
    case FLIPADST_FLIPADST:
      flip_buffer_lr_8x8(in);
      flip_buffer_lr_8x8(in + 8);
      write_buffer_8x16(dest + stride * 15, in, -stride);
      break;
#endif
    default: assert(0); break;
  }
}
521 :
522 0 : static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
523 : int stride) {
524 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 5);
525 0 : const __m128i zero = _mm_setzero_si128();
526 : // Final rounding and shift
527 0 : in[0] = _mm_adds_epi16(in[0], final_rounding);
528 0 : in[1] = _mm_adds_epi16(in[1], final_rounding);
529 0 : in[2] = _mm_adds_epi16(in[2], final_rounding);
530 0 : in[3] = _mm_adds_epi16(in[3], final_rounding);
531 0 : in[4] = _mm_adds_epi16(in[4], final_rounding);
532 0 : in[5] = _mm_adds_epi16(in[5], final_rounding);
533 0 : in[6] = _mm_adds_epi16(in[6], final_rounding);
534 0 : in[7] = _mm_adds_epi16(in[7], final_rounding);
535 :
536 0 : in[0] = _mm_srai_epi16(in[0], 6);
537 0 : in[1] = _mm_srai_epi16(in[1], 6);
538 0 : in[2] = _mm_srai_epi16(in[2], 6);
539 0 : in[3] = _mm_srai_epi16(in[3], 6);
540 0 : in[4] = _mm_srai_epi16(in[4], 6);
541 0 : in[5] = _mm_srai_epi16(in[5], 6);
542 0 : in[6] = _mm_srai_epi16(in[6], 6);
543 0 : in[7] = _mm_srai_epi16(in[7], 6);
544 :
545 0 : RECON_AND_STORE(dest + 0 * stride, in[0]);
546 0 : RECON_AND_STORE(dest + 1 * stride, in[1]);
547 0 : RECON_AND_STORE(dest + 2 * stride, in[2]);
548 0 : RECON_AND_STORE(dest + 3 * stride, in[3]);
549 0 : RECON_AND_STORE(dest + 4 * stride, in[4]);
550 0 : RECON_AND_STORE(dest + 5 * stride, in[5]);
551 0 : RECON_AND_STORE(dest + 6 * stride, in[6]);
552 0 : RECON_AND_STORE(dest + 7 * stride, in[7]);
553 0 : }
554 :
// 16x8 (16 wide, 8 tall) inverse hybrid transform + reconstruction.
// Structure: length-16 row transform (input transposed into column
// registers first), sqrt(2) scaling, length-8 column transform, then a
// flip-aware round-6 write-back of the two 8x8 halves.
void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride, int tx_type) {
  __m128i in[16];

  // Transpose 16x8 input into in[]: left 8 columns into in[0..7],
  // right 8 columns into in[8..15].
  in[0] = load_input_data(input + 0 * 16);
  in[1] = load_input_data(input + 1 * 16);
  in[2] = load_input_data(input + 2 * 16);
  in[3] = load_input_data(input + 3 * 16);
  in[4] = load_input_data(input + 4 * 16);
  in[5] = load_input_data(input + 5 * 16);
  in[6] = load_input_data(input + 6 * 16);
  in[7] = load_input_data(input + 7 * 16);
  array_transpose_8x8(in, in);

  in[8] = load_input_data(input + 8 + 0 * 16);
  in[9] = load_input_data(input + 8 + 1 * 16);
  in[10] = load_input_data(input + 8 + 2 * 16);
  in[11] = load_input_data(input + 8 + 3 * 16);
  in[12] = load_input_data(input + 8 + 4 * 16);
  in[13] = load_input_data(input + 8 + 5 * 16);
  in[14] = load_input_data(input + 8 + 6 * 16);
  in[15] = load_input_data(input + 8 + 7 * 16);
  array_transpose_8x8(in + 8, in + 8);

  // Row transform (horizontal member of tx_type), length 16.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif
      idct16_8col(in);
      break;
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
#endif
      iadst16_8col(in);
      break;
#if CONFIG_EXT_TX
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX: idtx16_8col(in); break;
#endif
    default: assert(0); break;
  }

  // Scale: rectangular-transform sqrt(2) normalization.
  scale_sqrt2_8x8(in);
  scale_sqrt2_8x8(in + 8);

  // Column transform (vertical member of tx_type), length 8, applied to
  // each 8x8 half independently.
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case V_DCT:
#endif
      aom_idct8_sse2(in);
      aom_idct8_sse2(in + 8);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST:
#endif
      aom_iadst8_sse2(in);
      aom_iadst8_sse2(in + 8);
      break;
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX:
      // Identity column pass: transpose back to row-major, then scale by 2.
      array_transpose_8x8(in, in);
      array_transpose_8x8(in + 8, in + 8);
      iidtx8_sse2(in);
      iidtx8_sse2(in + 8);
      break;
#endif
    default: assert(0); break;
  }

  // Write-back: vertical flips write bottom-up; horizontal flips mirror
  // each half and additionally swap the left/right halves.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
#endif
      write_buffer_8x8_round6(dest, in, stride);
      write_buffer_8x8_round6(dest + 8, in + 8, stride);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST:
      write_buffer_8x8_round6(dest + stride * 7, in, -stride);
      write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
      break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      flip_buffer_lr_8x8(in);
      flip_buffer_lr_8x8(in + 8);
      // Halves are swapped: mirrored right half lands in the left 8 cols.
      write_buffer_8x8_round6(dest, in + 8, stride);
      write_buffer_8x8_round6(dest + 8, in, stride);
      break;
    case FLIPADST_FLIPADST:
      flip_buffer_lr_8x8(in);
      flip_buffer_lr_8x8(in + 8);
      write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
      write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
      break;
#endif
    default: assert(0); break;
  }
}
693 :
694 0 : static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
695 : int stride) {
696 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 4);
697 0 : const __m128i zero = _mm_setzero_si128();
698 : // Final rounding and shift
699 0 : in[0] = _mm_adds_epi16(in[0], final_rounding);
700 0 : in[1] = _mm_adds_epi16(in[1], final_rounding);
701 0 : in[2] = _mm_adds_epi16(in[2], final_rounding);
702 0 : in[3] = _mm_adds_epi16(in[3], final_rounding);
703 :
704 0 : in[0] = _mm_srai_epi16(in[0], 5);
705 0 : in[1] = _mm_srai_epi16(in[1], 5);
706 0 : in[2] = _mm_srai_epi16(in[2], 5);
707 0 : in[3] = _mm_srai_epi16(in[3], 5);
708 :
709 0 : RECON_AND_STORE(dest + 0 * stride, in[0]);
710 0 : RECON_AND_STORE(dest + 1 * stride, in[1]);
711 0 : RECON_AND_STORE(dest + 2 * stride, in[2]);
712 0 : RECON_AND_STORE(dest + 3 * stride, in[3]);
713 0 : }
714 :
// 8x4 (8 wide, 4 tall) inverse hybrid transform + reconstruction.
// Structure: length-8 row transform, sqrt(2) scaling, repack rows two-per-
// register, length-4 column transform, unpack, flip handling, round-5
// write-back.
void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];

  in[0] = load_input_data(input + 0 * 8);
  in[1] = load_input_data(input + 1 * 8);
  in[2] = load_input_data(input + 2 * 8);
  in[3] = load_input_data(input + 3 * 8);

  // Row transform (horizontal member of tx_type), length 8.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif
      aom_idct8_sse2(in);
      break;
    case DCT_ADST:
    case ADST_ADST: aom_iadst8_sse2(in); break;
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST: aom_iadst8_sse2(in); break;
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
#endif
      break;
    default: assert(0); break;
  }

  scale_sqrt2_8x8(in);

  // Repack data. We pack into the bottom half of 'in'
  // so that the next repacking stage can pack into the
  // top half without overwriting anything
  in[7] = _mm_unpacklo_epi64(in[6], in[7]);
  in[6] = _mm_unpacklo_epi64(in[4], in[5]);
  in[5] = _mm_unpacklo_epi64(in[2], in[3]);
  in[4] = _mm_unpacklo_epi64(in[0], in[1]);

  // Column transform (vertical member of tx_type), length 4, run on two
  // packed register pairs (in+4 and in+6).
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case V_DCT:
#endif
      aom_idct4_sse2(in + 4);
      aom_idct4_sse2(in + 6);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST:
#endif
      aom_iadst4_sse2(in + 4);
      aom_iadst4_sse2(in + 6);
      break;
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX:
      iidtx4_sse2(in + 4);
      array_transpose_4x4(in + 4);
      iidtx4_sse2(in + 6);
      array_transpose_4x4(in + 6);
      break;
#endif
    default: assert(0); break;
  }

  // Repack data: unpack the column-transform output into four full 8-wide
  // rows in in[0..3].
  in[0] = _mm_unpacklo_epi64(in[4], in[6]);
  in[1] = _mm_unpackhi_epi64(in[4], in[6]);
  in[2] = _mm_unpacklo_epi64(in[5], in[7]);
  in[3] = _mm_unpackhi_epi64(in[5], in[7]);

  // Flip handling: vertical flips negate the write direction, horizontal
  // flips mirror each row.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX: break;
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      in[0] = mm_reverse_epi16(in[0]);
      in[1] = mm_reverse_epi16(in[1]);
      in[2] = mm_reverse_epi16(in[2]);
      in[3] = mm_reverse_epi16(in[3]);
      break;
    case FLIPADST_FLIPADST:
      in[0] = mm_reverse_epi16(in[0]);
      in[1] = mm_reverse_epi16(in[1]);
      in[2] = mm_reverse_epi16(in[2]);
      in[3] = mm_reverse_epi16(in[3]);
      FLIPUD_PTR(dest, stride, 4);
#endif
      break;
    default: assert(0); break;
  }
  write_buffer_8x4_round5(dest, in, stride);
}
840 :
// Round-shift a 4x8 residual block (two 4-wide rows per register, 8 rows
// total) by 5 and reconstruct into dest. Uses 4-byte loads/stores since
// each destination row is only 4 pixels wide.
static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
                                           int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i zero = _mm_setzero_si128();
  // Final rounding and shift: saturating (x + 16) >> 5.
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);

  // Reconstruction and Store
  {
    // Load each 4-pixel destination row into the low 32 bits.
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
    __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
    __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
    __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
    __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));

    // Pair rows, widen to 16-bit, add residuals.
    d0 = _mm_unpacklo_epi32(d0, d1);
    d2 = _mm_unpacklo_epi32(d2, d3);
    d4 = _mm_unpacklo_epi32(d4, d5);
    d6 = _mm_unpacklo_epi32(d6, d7);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d4 = _mm_unpacklo_epi8(d4, zero);
    d6 = _mm_unpacklo_epi8(d6, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d4 = _mm_add_epi16(d4, in[2]);
    d6 = _mm_add_epi16(d6, in[3]);

    // Pack with unsigned saturation, then peel off one 4-byte row at a
    // time via 4-byte shifts of the packed register.
    d0 = _mm_packus_epi16(d0, d2);
    *(int *)dest = _mm_cvtsi128_si32(d0);
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    d0 = _mm_packus_epi16(d4, d6);
    *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
  }
}
898 :
// 4x8 (4 wide, 8 tall) inverse hybrid transform + reconstruction.
// Structure: length-4 row transform on packed row pairs, sqrt(2) scaling,
// unpack to eight 4-wide rows, length-8 column transform, flip handling,
// repack and round-5 write-back.
void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];

  // Load rows, packed two per element of 'in'.
  // We pack into the bottom half of 'in' so that the
  // later repacking stage can pack into the
  // top half without overwriting anything
  in[4] = load_input_data(input + 0 * 8);
  in[5] = load_input_data(input + 1 * 8);
  in[6] = load_input_data(input + 2 * 8);
  in[7] = load_input_data(input + 3 * 8);

  // Row transform (horizontal member of tx_type), length 4, applied to
  // the two packed register pairs.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif
      aom_idct4_sse2(in + 4);
      aom_idct4_sse2(in + 6);
      break;
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
#endif
      aom_iadst4_sse2(in + 4);
      aom_iadst4_sse2(in + 6);
      break;
#if CONFIG_EXT_TX
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
      iidtx4_sse2(in + 4);
      array_transpose_4x4(in + 4);
      iidtx4_sse2(in + 6);
      array_transpose_4x4(in + 6);
      break;
#endif
    default: assert(0); break;
  }

  // Rectangular (non-square) transform normalization.
  scale_sqrt2_8x4(in + 4);

  // Repack data: spread the packed pairs into eight 4-wide rows in[0..7]
  // (only the low 64 bits of each register are meaningful afterwards).
  in[0] = _mm_unpacklo_epi64(in[4], in[6]);
  in[1] = _mm_unpackhi_epi64(in[4], in[6]);
  in[2] = _mm_unpacklo_epi64(in[5], in[7]);
  in[3] = _mm_unpackhi_epi64(in[5], in[7]);

  // Column transform (vertical member of tx_type), length 8.
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case V_DCT:
#endif
      aom_idct8_sse2(in);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST:
#endif
      aom_iadst8_sse2(in);
      break;
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX:
      iidtx8_sse2(in);
      array_transpose_8x8(in, in);
      break;
#endif
    default: assert(0); break;
  }

  // Flip handling: vertical flips negate the write direction; horizontal
  // flips reverse only the low 4 words of each row (the 4 valid pixels).
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
#endif
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
      in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
      in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
      in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
      in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
      in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
      in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
      in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
      break;
    case FLIPADST_FLIPADST:
      in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
      in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
      in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
      in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
      in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
      in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
      in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
      in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
      FLIPUD_PTR(dest, stride, 8);
      break;
#endif
    default: assert(0); break;
  }
  // Repack two 4-wide rows per register for the 4x8 store helper.
  in[0] = _mm_unpacklo_epi64(in[0], in[1]);
  in[1] = _mm_unpacklo_epi64(in[2], in[3]);
  in[2] = _mm_unpacklo_epi64(in[4], in[5]);
  in[3] = _mm_unpacklo_epi64(in[6], in[7]);
  write_buffer_4x8_round5(dest, in, stride);
}
1041 :
1042 : // Note: The 16-column 32-element transforms take input in the form of four
1043 : // 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
1044 : // of the overall 16x32 input buffer.
1045 0 : static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
1046 : __m128i *br) {
1047 0 : array_transpose_16x16(tl, tr);
1048 0 : array_transpose_16x16(bl, br);
1049 0 : idct32_8col(tl, bl);
1050 0 : idct32_8col(tr, br);
1051 0 : }
1052 :
1053 0 : static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
1054 : __m128i *br) {
1055 : __m128i tmpl[16], tmpr[16];
1056 : int i;
1057 :
1058 : // Copy the top half of the input to temporary storage
1059 0 : for (i = 0; i < 16; ++i) {
1060 0 : tmpl[i] = tl[i];
1061 0 : tmpr[i] = tr[i];
1062 : }
1063 :
1064 : // Generate the top half of the output
1065 0 : for (i = 0; i < 16; ++i) {
1066 0 : tl[i] = _mm_slli_epi16(bl[i], 2);
1067 0 : tr[i] = _mm_slli_epi16(br[i], 2);
1068 : }
1069 0 : array_transpose_16x16(tl, tr);
1070 :
1071 : // Copy the temporary storage back to the bottom half of the input
1072 0 : for (i = 0; i < 16; ++i) {
1073 0 : bl[i] = tmpl[i];
1074 0 : br[i] = tmpr[i];
1075 : }
1076 :
1077 : // Generate the bottom half of the output
1078 0 : scale_sqrt2_8x16(bl);
1079 0 : scale_sqrt2_8x16(br);
1080 0 : aom_idct16_sse2(bl, br); // Includes a transposition
1081 0 : }
1082 :
1083 : #if CONFIG_EXT_TX
1084 0 : static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
1085 : __m128i *br) {
1086 : int i;
1087 0 : array_transpose_16x16(tl, tr);
1088 0 : array_transpose_16x16(bl, br);
1089 0 : for (i = 0; i < 16; ++i) {
1090 0 : tl[i] = _mm_slli_epi16(tl[i], 2);
1091 0 : tr[i] = _mm_slli_epi16(tr[i], 2);
1092 0 : bl[i] = _mm_slli_epi16(bl[i], 2);
1093 0 : br[i] = _mm_slli_epi16(br[i], 2);
1094 : }
1095 0 : }
1096 : #endif // CONFIG_EXT_TX
1097 :
1098 0 : static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
1099 : __m128i *intr, __m128i *inbl,
1100 : __m128i *inbr, int stride) {
1101 0 : const __m128i zero = _mm_setzero_si128();
1102 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1103 : int i;
1104 :
1105 0 : for (i = 0; i < 16; ++i) {
1106 0 : intl[i] = _mm_adds_epi16(intl[i], final_rounding);
1107 0 : intr[i] = _mm_adds_epi16(intr[i], final_rounding);
1108 0 : inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
1109 0 : inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
1110 0 : intl[i] = _mm_srai_epi16(intl[i], 6);
1111 0 : intr[i] = _mm_srai_epi16(intr[i], 6);
1112 0 : inbl[i] = _mm_srai_epi16(inbl[i], 6);
1113 0 : inbr[i] = _mm_srai_epi16(inbr[i], 6);
1114 0 : RECON_AND_STORE(dest + i * stride + 0, intl[i]);
1115 0 : RECON_AND_STORE(dest + i * stride + 8, intr[i]);
1116 0 : RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
1117 0 : RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
1118 : }
1119 0 : }
1120 :
// 16x32 inverse hybrid transform with reconstruction into dest.
// The residual is processed as four 8x16 quadrants, each a __m128i[16]:
// intl/intr are the top 16 rows (left/right 8 columns), inbl/inbr the
// bottom 16 rows. A 16-point row transform is followed by a sqrt(2)
// inter-pass scale, then a 32-point column transform; FLIPADST variants
// are handled afterwards by mirroring the data and/or the dest pointer.
void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
  __m128i intl[16], intr[16], inbl[16], inbr[16];

  int i;
  // Load the 16-wide coefficient rows as four 8x16 quadrants.
  for (i = 0; i < 16; ++i) {
    intl[i] = load_input_data(input + i * 16 + 0);
    intr[i] = load_input_data(input + i * 16 + 8);
    inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
    inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
  }

  // Row transform
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif
      aom_idct16_sse2(intl, intr);
      aom_idct16_sse2(inbl, inbr);
      break;
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
#endif
      aom_iadst16_sse2(intl, intr);
      aom_iadst16_sse2(inbl, inbr);
      break;
#if CONFIG_EXT_TX
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
      iidtx16_sse2(intl, intr);
      iidtx16_sse2(inbl, inbr);
      break;
#endif
    default: assert(0); break;
  }

  // Extra sqrt(2) scale between the row and column passes (rectangular
  // transform normalization).
  scale_sqrt2_8x16(intl);
  scale_sqrt2_8x16(intr);
  scale_sqrt2_8x16(inbl);
  scale_sqrt2_8x16(inbr);

  // Column transform
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case V_DCT:
#endif
      idct32_16col(intl, intr, inbl, inbr);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST:
#endif
      ihalfright32_16col(intl, intr, inbl, inbr);
      break;
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
#endif
    default: assert(0); break;
  }

  // Flip handling: a vertical FLIPADST flips dest upside-down via
  // FLIPUD_PTR; a horizontal FLIPADST mirrors the rows by reversing each
  // register and swapping the left/right quadrants.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
#endif
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      for (i = 0; i < 16; ++i) {
        __m128i tmp = intl[i];
        intl[i] = mm_reverse_epi16(intr[i]);
        intr[i] = mm_reverse_epi16(tmp);
        tmp = inbl[i];
        inbl[i] = mm_reverse_epi16(inbr[i]);
        inbr[i] = mm_reverse_epi16(tmp);
      }
      break;
    case FLIPADST_FLIPADST:
      // Both directions flipped: mirror the data horizontally and the
      // destination vertically.
      for (i = 0; i < 16; ++i) {
        __m128i tmp = intl[i];
        intl[i] = mm_reverse_epi16(intr[i]);
        intr[i] = mm_reverse_epi16(tmp);
        tmp = inbl[i];
        inbl[i] = mm_reverse_epi16(inbr[i]);
        inbr[i] = mm_reverse_epi16(tmp);
      }
      FLIPUD_PTR(dest, stride, 32);
      break;
#endif
    default: assert(0); break;
  }
  write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
}
1250 :
1251 0 : static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
1252 : __m128i *in1, __m128i *in2,
1253 : __m128i *in3, int stride) {
1254 0 : const __m128i zero = _mm_setzero_si128();
1255 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1256 : int i;
1257 :
1258 0 : for (i = 0; i < 16; ++i) {
1259 0 : in0[i] = _mm_adds_epi16(in0[i], final_rounding);
1260 0 : in1[i] = _mm_adds_epi16(in1[i], final_rounding);
1261 0 : in2[i] = _mm_adds_epi16(in2[i], final_rounding);
1262 0 : in3[i] = _mm_adds_epi16(in3[i], final_rounding);
1263 0 : in0[i] = _mm_srai_epi16(in0[i], 6);
1264 0 : in1[i] = _mm_srai_epi16(in1[i], 6);
1265 0 : in2[i] = _mm_srai_epi16(in2[i], 6);
1266 0 : in3[i] = _mm_srai_epi16(in3[i], 6);
1267 0 : RECON_AND_STORE(dest + i * stride + 0, in0[i]);
1268 0 : RECON_AND_STORE(dest + i * stride + 8, in1[i]);
1269 0 : RECON_AND_STORE(dest + i * stride + 16, in2[i]);
1270 0 : RECON_AND_STORE(dest + i * stride + 24, in3[i]);
1271 : }
1272 0 : }
1273 :
// 32x16 inverse hybrid transform with reconstruction into dest.
// The residual is processed as four 8x16 column groups, each a __m128i[16]:
// in0..in3 hold columns 0-7, 8-15, 16-23 and 24-31 of the 16 rows. A
// 32-point row transform is followed by a sqrt(2) inter-pass scale, then a
// 16-point column transform; FLIPADST variants are handled afterwards by
// mirroring the data and/or the dest pointer.
void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
  __m128i in0[16], in1[16], in2[16], in3[16];
  int i;

  // Load the 32-wide coefficient rows as four 8x16 column groups.
  for (i = 0; i < 16; ++i) {
    in0[i] = load_input_data(input + i * 32 + 0);
    in1[i] = load_input_data(input + i * 32 + 8);
    in2[i] = load_input_data(input + i * 32 + 16);
    in3[i] = load_input_data(input + i * 32 + 24);
  }

  // Row transform
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif
      idct32_16col(in0, in1, in2, in3);
      break;
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
#endif
      ihalfright32_16col(in0, in1, in2, in3);
      break;
#if CONFIG_EXT_TX
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
#endif
    default: assert(0); break;
  }

  // Extra sqrt(2) scale between the row and column passes (rectangular
  // transform normalization).
  scale_sqrt2_8x16(in0);
  scale_sqrt2_8x16(in1);
  scale_sqrt2_8x16(in2);
  scale_sqrt2_8x16(in3);

  // Column transform
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case V_DCT:
#endif
      aom_idct16_sse2(in0, in1);
      aom_idct16_sse2(in2, in3);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST:
#endif
      aom_iadst16_sse2(in0, in1);
      aom_iadst16_sse2(in2, in3);
      break;
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX:
      iidtx16_sse2(in0, in1);
      iidtx16_sse2(in2, in3);
      break;
#endif
    default: assert(0); break;
  }

  // Flip handling: a vertical FLIPADST flips dest upside-down via
  // FLIPUD_PTR; a horizontal FLIPADST mirrors the 32-wide rows by
  // reversing each register and swapping in0<->in3 and in1<->in2.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case H_DCT:
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
#endif
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      for (i = 0; i < 16; ++i) {
        __m128i tmp1 = in0[i];
        __m128i tmp2 = in1[i];
        in0[i] = mm_reverse_epi16(in3[i]);
        in1[i] = mm_reverse_epi16(in2[i]);
        in2[i] = mm_reverse_epi16(tmp2);
        in3[i] = mm_reverse_epi16(tmp1);
      }
      break;
    case FLIPADST_FLIPADST:
      // Both directions flipped: mirror the data horizontally and the
      // destination vertically.
      for (i = 0; i < 16; ++i) {
        __m128i tmp1 = in0[i];
        __m128i tmp2 = in1[i];
        in0[i] = mm_reverse_epi16(in3[i]);
        in1[i] = mm_reverse_epi16(in2[i]);
        in2[i] = mm_reverse_epi16(tmp2);
        in3[i] = mm_reverse_epi16(tmp1);
      }
      FLIPUD_PTR(dest, stride, 16);
      break;
#endif
    default: assert(0); break;
  }
  write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
}
|