Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <assert.h>
13 : #include <emmintrin.h> // SSE2
14 :
15 : #include "./aom_dsp_rtcd.h"
16 : #include "./av1_rtcd.h"
17 : #include "aom_dsp/txfm_common.h"
18 : #include "aom_dsp/x86/fwd_txfm_sse2.h"
19 : #include "aom_dsp/x86/synonyms.h"
20 : #include "aom_dsp/x86/txfm_common_sse2.h"
21 : #include "aom_ports/mem.h"
22 :
23 0 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
24 : int stride, int flipud, int fliplr) {
25 0 : const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
26 0 : const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
27 : __m128i mask;
28 :
29 0 : if (!flipud) {
30 0 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
31 0 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
32 0 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
33 0 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
34 : } else {
35 0 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
36 0 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
37 0 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
38 0 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
39 : }
40 :
41 0 : if (fliplr) {
42 0 : in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
43 0 : in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
44 0 : in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
45 0 : in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
46 : }
47 :
48 0 : in[0] = _mm_slli_epi16(in[0], 4);
49 0 : in[1] = _mm_slli_epi16(in[1], 4);
50 0 : in[2] = _mm_slli_epi16(in[2], 4);
51 0 : in[3] = _mm_slli_epi16(in[3], 4);
52 :
53 0 : mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
54 0 : in[0] = _mm_add_epi16(in[0], mask);
55 0 : in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
56 0 : }
57 :
58 0 : static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
59 0 : const __m128i kOne = _mm_set1_epi16(1);
60 0 : __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
61 0 : __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
62 0 : __m128i out01 = _mm_add_epi16(in01, kOne);
63 0 : __m128i out23 = _mm_add_epi16(in23, kOne);
64 0 : out01 = _mm_srai_epi16(out01, 2);
65 0 : out23 = _mm_srai_epi16(out23, 2);
66 0 : store_output(&out01, (output + 0 * 8));
67 0 : store_output(&out23, (output + 1 * 8));
68 0 : }
69 :
70 0 : static INLINE void transpose_4x4(__m128i *res) {
71 : // Combine and transpose
72 : // 00 01 02 03 20 21 22 23
73 : // 10 11 12 13 30 31 32 33
74 0 : const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
75 0 : const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
76 :
77 : // 00 10 01 11 02 12 03 13
78 : // 20 30 21 31 22 32 23 33
79 0 : res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
80 0 : res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
81 :
82 : // 00 10 20 30 01 11 21 31
83 : // 02 12 22 32 03 13 23 33
84 : // only use the first 4 16-bit integers
85 0 : res[1] = _mm_unpackhi_epi64(res[0], res[0]);
86 0 : res[3] = _mm_unpackhi_epi64(res[2], res[2]);
87 0 : }
88 :
89 0 : static void fdct4_sse2(__m128i *in) {
90 0 : const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
91 0 : const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
92 0 : const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
93 0 : const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
94 0 : const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
95 :
96 : __m128i u[4], v[4];
97 0 : u[0] = _mm_unpacklo_epi16(in[0], in[1]);
98 0 : u[1] = _mm_unpacklo_epi16(in[3], in[2]);
99 :
100 0 : v[0] = _mm_add_epi16(u[0], u[1]);
101 0 : v[1] = _mm_sub_epi16(u[0], u[1]);
102 :
103 0 : u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
104 0 : u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
105 0 : u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
106 0 : u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
107 :
108 0 : v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
109 0 : v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
110 0 : v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
111 0 : v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
112 0 : u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
113 0 : u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
114 0 : u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
115 0 : u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
116 :
117 0 : in[0] = _mm_packs_epi32(u[0], u[1]);
118 0 : in[1] = _mm_packs_epi32(u[2], u[3]);
119 0 : transpose_4x4(in);
120 0 : }
121 :
// 1-D forward 4-point ADST (sin-based transform) over the four rows in
// `in`, followed by a transpose for the second pass. Mirrors the scalar
// av1 fadst4 using the sinpi_{1..4}_9 constants; each pmaddwd computes a
// pair of sinpi products per 32-bit lane, and results are narrowed back to
// 16 bits with dct_const_round_shift.
static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  // in7 plays the role of (x0 + x1) in the scalar reference.
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  // Widen operands: pairing a row with kZero gives plain 16->32 promotion
  // when multiplied by a _mm_set1 constant.
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  // Combine partial sums into the four ADST outputs (u[0..2], u[6]).
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);  // 4 * s4
  u[5] = _mm_sub_epi32(u[4], v[5]);  // 3 * s4
  u[6] = _mm_add_epi32(u[3], u[5]);

  // dct_const_round_shift.
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  // Pack back to 16 bits in output-row order and transpose for pass 2.
  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}
169 :
170 : #if CONFIG_EXT_TX
171 0 : static void fidtx4_sse2(__m128i *in) {
172 0 : const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
173 0 : const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
174 0 : const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
175 :
176 : __m128i v0, v1, v2, v3;
177 : __m128i u0, u1, u2, u3;
178 :
179 0 : v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
180 0 : v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
181 0 : v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
182 0 : v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
183 :
184 0 : u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
185 0 : u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
186 0 : u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
187 0 : u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
188 :
189 0 : v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
190 0 : v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
191 0 : v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
192 0 : v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
193 :
194 0 : u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
195 0 : u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
196 0 : u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
197 0 : u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
198 :
199 0 : in[0] = _mm_packs_epi32(u0, u2);
200 0 : in[1] = _mm_packs_epi32(u1, u3);
201 0 : transpose_4x4(in);
202 0 : }
203 : #endif // CONFIG_EXT_TX
204 :
// Forward 4x4 hybrid transform (SSE2). Applies the column transform then
// the row transform selected by tx_type (each 1-D kernel ends with a
// transpose, so calling two kernels back-to-back yields the 2-D result in
// row order), then rounds and stores 16 coefficients via write_buffer_4x4.
// Flipped-ADST variants reuse fadst4 on input mirrored at load time.
// DCT_DCT is delegated to the dedicated aom_fdct4x4_sse2 kernel.
void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[4];

  switch (tx_type) {
    case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride, 0, 0);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride, 0, 0);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in, stride, 0, 0);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      // flipud=1: vertical flip before the (column) ADST.
      load_buffer_4x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_FLIPADST:
      // fliplr=1: horizontal flip before the (row) ADST.
      load_buffer_4x4(input, in, stride, 0, 1);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, in, stride, 1, 1);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, in, stride, 0, 1);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case IDTX:
      load_buffer_4x4(input, in, stride, 0, 0);
      fidtx4_sse2(in);
      fidtx4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case V_DCT:
      // DCT on columns only; identity on rows.
      load_buffer_4x4(input, in, stride, 0, 0);
      fdct4_sse2(in);
      fidtx4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case H_DCT:
      // Identity on columns; DCT on rows.
      load_buffer_4x4(input, in, stride, 0, 0);
      fidtx4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case V_ADST:
      load_buffer_4x4(input, in, stride, 0, 0);
      fadst4_sse2(in);
      fidtx4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case H_ADST:
      load_buffer_4x4(input, in, stride, 0, 0);
      fidtx4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
      fidtx4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, in, stride, 0, 1);
      fidtx4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0);
  }
}
306 :
// 8x8 forward DCT fused with quantization (SSE2).
//
// The two-pass 8x8 fDCT keeps all eight rows in xmm registers: pass 0
// transforms columns, the in-register transpose swaps rows/columns, and
// pass 1 transforms the other dimension. After the final rounding the
// coefficients are quantized 16 at a time:
//   qcoeff  = sign(coeff) * ((|coeff| + round) * quant >> 16)
//   dqcoeff = qcoeff * dequant
// and the end-of-block index (highest inverse-scan position with a nonzero
// quantized coefficient, plus one) is written to *eob_ptr.
// zbin_ptr, quant_shift_ptr, coeff_ptr and scan_ptr exist only to match the
// generic quantizer prototype; they are unused on this fast path.
void av1_fdct8x8_quant_sse2(const int16_t *input, int stride,
                            int16_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *zbin_ptr,
                            const int16_t *round_ptr, const int16_t *quant_ptr,
                            const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
                            int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                            uint16_t *eob_ptr, const int16_t *scan_ptr,
                            const int16_t *iscan_ptr) {
  __m128i zero;
  int pass;
  // Constants
  // When we use them, in one case, they are all the same. In all others
  // it's a pair of them that we need to repeat four times. This is done
  // by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  // Pointer table so the quantization stage below can walk the transformed
  // rows two at a time (the transform leaves results in in0..in7).
  __m128i *in[8];
  int index = 0;

  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)coeff_ptr;

  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  in[0] = &in0;
  in[1] = &in1;
  in[2] = &in2;
  in[3] = &in3;
  in[4] = &in4;
  in[5] = &in5;
  in[6] = &in6;
  in[7] = &in7;

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract (stage-1 butterflies)
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results (even output rows 0, 2, 4, 6)
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results (odd output rows 1, 3, 5, 7)
    {
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    // division of two 16 bits signed numbers using shifts
    // n / 2 = (n - (n >> 15)) >> 1
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
  }

  // Quantization stage. Walk the buffers from -n_coeffs up to 0 so the loop
  // counter doubles as the (negative) offset from the end of each array.
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = *in[0];
        coeff1 = *in[1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        // Lane 0 of round/quant/dequant holds the DC value; broadcast the
        // AC half for every subsequent coefficient.
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    index = 2;
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        // Two rows consumed per iteration; must not run off the in[] table.
        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
        coeff0 = *in[index];
        coeff1 = *in[index + 1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
      index += 2;
    }

    // Accumulate EOB: horizontal max of the eight 16-bit lanes of `eob`.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    // Skipped block: emit all-zero (de)quantized coefficients and eob = 0.
    do {
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
747 :
748 : // load 8x8 array
749 0 : static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
750 : int stride, int flipud, int fliplr) {
751 0 : if (!flipud) {
752 0 : in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
753 0 : in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
754 0 : in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
755 0 : in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
756 0 : in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
757 0 : in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
758 0 : in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
759 0 : in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
760 : } else {
761 0 : in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
762 0 : in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
763 0 : in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
764 0 : in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
765 0 : in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
766 0 : in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
767 0 : in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
768 0 : in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
769 : }
770 :
771 0 : if (fliplr) {
772 0 : in[0] = mm_reverse_epi16(in[0]);
773 0 : in[1] = mm_reverse_epi16(in[1]);
774 0 : in[2] = mm_reverse_epi16(in[2]);
775 0 : in[3] = mm_reverse_epi16(in[3]);
776 0 : in[4] = mm_reverse_epi16(in[4]);
777 0 : in[5] = mm_reverse_epi16(in[5]);
778 0 : in[6] = mm_reverse_epi16(in[6]);
779 0 : in[7] = mm_reverse_epi16(in[7]);
780 : }
781 :
782 0 : in[0] = _mm_slli_epi16(in[0], 2);
783 0 : in[1] = _mm_slli_epi16(in[1], 2);
784 0 : in[2] = _mm_slli_epi16(in[2], 2);
785 0 : in[3] = _mm_slli_epi16(in[3], 2);
786 0 : in[4] = _mm_slli_epi16(in[4], 2);
787 0 : in[5] = _mm_slli_epi16(in[5], 2);
788 0 : in[6] = _mm_slli_epi16(in[6], 2);
789 0 : in[7] = _mm_slli_epi16(in[7], 2);
790 0 : }
791 :
792 : // right shift and rounding
793 0 : static INLINE void right_shift_8x8(__m128i *res, const int bit) {
794 0 : __m128i sign0 = _mm_srai_epi16(res[0], 15);
795 0 : __m128i sign1 = _mm_srai_epi16(res[1], 15);
796 0 : __m128i sign2 = _mm_srai_epi16(res[2], 15);
797 0 : __m128i sign3 = _mm_srai_epi16(res[3], 15);
798 0 : __m128i sign4 = _mm_srai_epi16(res[4], 15);
799 0 : __m128i sign5 = _mm_srai_epi16(res[5], 15);
800 0 : __m128i sign6 = _mm_srai_epi16(res[6], 15);
801 0 : __m128i sign7 = _mm_srai_epi16(res[7], 15);
802 :
803 0 : if (bit == 2) {
804 0 : const __m128i const_rounding = _mm_set1_epi16(1);
805 0 : res[0] = _mm_adds_epi16(res[0], const_rounding);
806 0 : res[1] = _mm_adds_epi16(res[1], const_rounding);
807 0 : res[2] = _mm_adds_epi16(res[2], const_rounding);
808 0 : res[3] = _mm_adds_epi16(res[3], const_rounding);
809 0 : res[4] = _mm_adds_epi16(res[4], const_rounding);
810 0 : res[5] = _mm_adds_epi16(res[5], const_rounding);
811 0 : res[6] = _mm_adds_epi16(res[6], const_rounding);
812 0 : res[7] = _mm_adds_epi16(res[7], const_rounding);
813 : }
814 :
815 0 : res[0] = _mm_sub_epi16(res[0], sign0);
816 0 : res[1] = _mm_sub_epi16(res[1], sign1);
817 0 : res[2] = _mm_sub_epi16(res[2], sign2);
818 0 : res[3] = _mm_sub_epi16(res[3], sign3);
819 0 : res[4] = _mm_sub_epi16(res[4], sign4);
820 0 : res[5] = _mm_sub_epi16(res[5], sign5);
821 0 : res[6] = _mm_sub_epi16(res[6], sign6);
822 0 : res[7] = _mm_sub_epi16(res[7], sign7);
823 :
824 0 : if (bit == 1) {
825 0 : res[0] = _mm_srai_epi16(res[0], 1);
826 0 : res[1] = _mm_srai_epi16(res[1], 1);
827 0 : res[2] = _mm_srai_epi16(res[2], 1);
828 0 : res[3] = _mm_srai_epi16(res[3], 1);
829 0 : res[4] = _mm_srai_epi16(res[4], 1);
830 0 : res[5] = _mm_srai_epi16(res[5], 1);
831 0 : res[6] = _mm_srai_epi16(res[6], 1);
832 0 : res[7] = _mm_srai_epi16(res[7], 1);
833 : } else {
834 0 : res[0] = _mm_srai_epi16(res[0], 2);
835 0 : res[1] = _mm_srai_epi16(res[1], 2);
836 0 : res[2] = _mm_srai_epi16(res[2], 2);
837 0 : res[3] = _mm_srai_epi16(res[3], 2);
838 0 : res[4] = _mm_srai_epi16(res[4], 2);
839 0 : res[5] = _mm_srai_epi16(res[5], 2);
840 0 : res[6] = _mm_srai_epi16(res[6], 2);
841 0 : res[7] = _mm_srai_epi16(res[7], 2);
842 : }
843 0 : }
844 :
845 : // write 8x8 array
846 0 : static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
847 : int stride) {
848 0 : store_output(&res[0], (output + 0 * stride));
849 0 : store_output(&res[1], (output + 1 * stride));
850 0 : store_output(&res[2], (output + 2 * stride));
851 0 : store_output(&res[3], (output + 3 * stride));
852 0 : store_output(&res[4], (output + 4 * stride));
853 0 : store_output(&res[5], (output + 5 * stride));
854 0 : store_output(&res[6], (output + 6 * stride));
855 0 : store_output(&res[7], (output + 7 * stride));
856 0 : }
857 :
858 : // perform in-place transpose
859 0 : static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
860 0 : const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
861 0 : const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
862 0 : const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
863 0 : const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
864 0 : const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
865 0 : const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
866 0 : const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
867 0 : const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
868 : // 00 10 01 11 02 12 03 13
869 : // 20 30 21 31 22 32 23 33
870 : // 04 14 05 15 06 16 07 17
871 : // 24 34 25 35 26 36 27 37
872 : // 40 50 41 51 42 52 43 53
873 : // 60 70 61 71 62 72 63 73
874 : // 44 54 45 55 46 56 47 57
875 : // 64 74 65 75 66 76 67 77
876 0 : const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
877 0 : const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
878 0 : const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
879 0 : const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
880 0 : const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
881 0 : const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
882 0 : const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
883 0 : const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
884 : // 00 10 20 30 01 11 21 31
885 : // 40 50 60 70 41 51 61 71
886 : // 02 12 22 32 03 13 23 33
887 : // 42 52 62 72 43 53 63 73
888 : // 04 14 24 34 05 15 25 35
889 : // 44 54 64 74 45 55 65 75
890 : // 06 16 26 36 07 17 27 37
891 : // 46 56 66 76 47 57 67 77
892 0 : res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
893 0 : res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
894 0 : res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
895 0 : res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
896 0 : res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
897 0 : res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
898 0 : res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
899 0 : res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
900 : // 00 10 20 30 40 50 60 70
901 : // 01 11 21 31 41 51 61 71
902 : // 02 12 22 32 42 52 62 72
903 : // 03 13 23 33 43 53 63 73
904 : // 04 14 24 34 44 54 64 74
905 : // 05 15 25 35 45 55 65 75
906 : // 06 16 26 36 46 56 66 76
907 : // 07 17 27 37 47 57 67 77
908 0 : }
909 :
// Forward 8-point DCT applied to eight columns at once: in[0..7] hold the
// rows of an 8x8 block of 16-bit values. The four butterfly stages use
// DCT_CONST_BITS fixed-point rotations with DCT_CONST_ROUNDING. Results
// are written back into in[] and then transposed in place, so calling this
// twice performs the full 2-D transform.
static void fdct8_sse2(__m128i *in) {
  // constants: each pair_set_epi16 packs the two cosine factors of one
  // butterfly rotation so a single _mm_madd_epi16 computes a*c0 + b*c1.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1: fold rows pairwise (sums feed the even outputs, differences
  // feed the odd outputs).
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding: back from 32-bit products to DCT_CONST_BITS scale
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // even-index outputs 0, 2, 4, 6 are complete after stage 1
  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3: combine stage-1 differences with the stage-2 rotations
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4: final rotations for the odd-index outputs
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  // odd-index outputs 1, 3, 5, 7
  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose in place so the next call transforms the other dimension
  array_transpose_8x8(in, in);
}
1049 :
// Forward 8-point ADST (asymmetric DST) applied to eight columns at once:
// in[0..7] hold the rows of an 8x8 block of 16-bit values. Three butterfly
// stages of DCT_CONST_BITS fixed-point rotations, then sign fix-ups and an
// in-place transpose, matching the output order/signs of the scalar fadst8.
static void fadst8_sse2(__m128i *in) {
  // Constants: each pair_set_epi16 packs the two factors of one rotation
  // so a single _mm_madd_epi16 computes a*c0 + b*c1 per 32-bit lane.
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input: permute the rows into the order
  // the ADST butterfly network expects
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition: cross-combine the rotated halves while still in 32 bits
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  // note: w0..w7 are folded again (second butterfly level) before rounding
  v0 = _mm_add_epi32(w0, w4);
  v1 = _mm_add_epi32(w1, w5);
  v2 = _mm_add_epi32(w2, w6);
  v3 = _mm_add_epi32(w3, w7);
  v4 = _mm_sub_epi32(w0, w4);
  v5 = _mm_sub_epi32(w1, w5);
  v6 = _mm_sub_epi32(w2, w6);
  v7 = _mm_sub_epi32(w3, w7);

  w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);

  // in[4..7] temporarily hold stage-1 results consumed by stage 2 below
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_packs_epi32(v0, v1);
  s1 = _mm_packs_epi32(v2, v3);
  s2 = _mm_packs_epi32(v4, v5);
  s3 = _mm_packs_epi32(v6, v7);

  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3: final +/-cospi_16 rotations (s2/s3 and s6/s7 are overwritten;
  // s0, s1, s4, s5 keep their stage-2 values for the output mapping below)
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  // Output mapping with the alternating sign flips of the ADST.
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose in place so the next call transforms the other dimension
  array_transpose_8x8(in, in);
}
1287 :
1288 : #if CONFIG_EXT_TX
// Identity "transform" for one 1-D pass: scales every sample by 2
// (1-bit left shift), then transposes in place so it composes with the
// other 1-D kernels, which all end with a transpose.
static void fidtx8_sse2(__m128i *in) {
  int i;
  for (i = 0; i < 8; ++i) in[i] = _mm_slli_epi16(in[i], 1);
  array_transpose_8x8(in, in);
}
1301 : #endif // CONFIG_EXT_TX
1302 :
1303 0 : void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
1304 : int tx_type) {
1305 : __m128i in[8];
1306 :
1307 0 : switch (tx_type) {
1308 0 : case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
1309 : case ADST_DCT:
1310 0 : load_buffer_8x8(input, in, stride, 0, 0);
1311 0 : fadst8_sse2(in);
1312 0 : fdct8_sse2(in);
1313 0 : right_shift_8x8(in, 1);
1314 0 : write_buffer_8x8(output, in, 8);
1315 0 : break;
1316 : case DCT_ADST:
1317 0 : load_buffer_8x8(input, in, stride, 0, 0);
1318 0 : fdct8_sse2(in);
1319 0 : fadst8_sse2(in);
1320 0 : right_shift_8x8(in, 1);
1321 0 : write_buffer_8x8(output, in, 8);
1322 0 : break;
1323 : case ADST_ADST:
1324 0 : load_buffer_8x8(input, in, stride, 0, 0);
1325 0 : fadst8_sse2(in);
1326 0 : fadst8_sse2(in);
1327 0 : right_shift_8x8(in, 1);
1328 0 : write_buffer_8x8(output, in, 8);
1329 0 : break;
1330 : #if CONFIG_EXT_TX
1331 : case FLIPADST_DCT:
1332 0 : load_buffer_8x8(input, in, stride, 1, 0);
1333 0 : fadst8_sse2(in);
1334 0 : fdct8_sse2(in);
1335 0 : right_shift_8x8(in, 1);
1336 0 : write_buffer_8x8(output, in, 8);
1337 0 : break;
1338 : case DCT_FLIPADST:
1339 0 : load_buffer_8x8(input, in, stride, 0, 1);
1340 0 : fdct8_sse2(in);
1341 0 : fadst8_sse2(in);
1342 0 : right_shift_8x8(in, 1);
1343 0 : write_buffer_8x8(output, in, 8);
1344 0 : break;
1345 : case FLIPADST_FLIPADST:
1346 0 : load_buffer_8x8(input, in, stride, 1, 1);
1347 0 : fadst8_sse2(in);
1348 0 : fadst8_sse2(in);
1349 0 : right_shift_8x8(in, 1);
1350 0 : write_buffer_8x8(output, in, 8);
1351 0 : break;
1352 : case ADST_FLIPADST:
1353 0 : load_buffer_8x8(input, in, stride, 0, 1);
1354 0 : fadst8_sse2(in);
1355 0 : fadst8_sse2(in);
1356 0 : right_shift_8x8(in, 1);
1357 0 : write_buffer_8x8(output, in, 8);
1358 0 : break;
1359 : case FLIPADST_ADST:
1360 0 : load_buffer_8x8(input, in, stride, 1, 0);
1361 0 : fadst8_sse2(in);
1362 0 : fadst8_sse2(in);
1363 0 : right_shift_8x8(in, 1);
1364 0 : write_buffer_8x8(output, in, 8);
1365 0 : break;
1366 : case IDTX:
1367 0 : load_buffer_8x8(input, in, stride, 0, 0);
1368 0 : fidtx8_sse2(in);
1369 0 : fidtx8_sse2(in);
1370 0 : right_shift_8x8(in, 1);
1371 0 : write_buffer_8x8(output, in, 8);
1372 0 : break;
1373 : case V_DCT:
1374 0 : load_buffer_8x8(input, in, stride, 0, 0);
1375 0 : fdct8_sse2(in);
1376 0 : fidtx8_sse2(in);
1377 0 : right_shift_8x8(in, 1);
1378 0 : write_buffer_8x8(output, in, 8);
1379 0 : break;
1380 : case H_DCT:
1381 0 : load_buffer_8x8(input, in, stride, 0, 0);
1382 0 : fidtx8_sse2(in);
1383 0 : fdct8_sse2(in);
1384 0 : right_shift_8x8(in, 1);
1385 0 : write_buffer_8x8(output, in, 8);
1386 0 : break;
1387 : case V_ADST:
1388 0 : load_buffer_8x8(input, in, stride, 0, 0);
1389 0 : fadst8_sse2(in);
1390 0 : fidtx8_sse2(in);
1391 0 : right_shift_8x8(in, 1);
1392 0 : write_buffer_8x8(output, in, 8);
1393 0 : break;
1394 : case H_ADST:
1395 0 : load_buffer_8x8(input, in, stride, 0, 0);
1396 0 : fidtx8_sse2(in);
1397 0 : fadst8_sse2(in);
1398 0 : right_shift_8x8(in, 1);
1399 0 : write_buffer_8x8(output, in, 8);
1400 0 : break;
1401 : case V_FLIPADST:
1402 0 : load_buffer_8x8(input, in, stride, 1, 0);
1403 0 : fadst8_sse2(in);
1404 0 : fidtx8_sse2(in);
1405 0 : right_shift_8x8(in, 1);
1406 0 : write_buffer_8x8(output, in, 8);
1407 0 : break;
1408 : case H_FLIPADST:
1409 0 : load_buffer_8x8(input, in, stride, 0, 1);
1410 0 : fidtx8_sse2(in);
1411 0 : fadst8_sse2(in);
1412 0 : right_shift_8x8(in, 1);
1413 0 : write_buffer_8x8(output, in, 8);
1414 0 : break;
1415 : #endif // CONFIG_EXT_TX
1416 0 : default: assert(0);
1417 : }
1418 0 : }
1419 :
1420 0 : static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
1421 : __m128i *in1, int stride, int flipud,
1422 : int fliplr) {
1423 : // Load 4 8x8 blocks
1424 0 : const int16_t *topL = input;
1425 0 : const int16_t *topR = input + 8;
1426 0 : const int16_t *botL = input + 8 * stride;
1427 0 : const int16_t *botR = input + 8 * stride + 8;
1428 :
1429 : const int16_t *tmp;
1430 :
1431 0 : if (flipud) {
1432 : // Swap left columns
1433 0 : tmp = topL;
1434 0 : topL = botL;
1435 0 : botL = tmp;
1436 : // Swap right columns
1437 0 : tmp = topR;
1438 0 : topR = botR;
1439 0 : botR = tmp;
1440 : }
1441 :
1442 0 : if (fliplr) {
1443 : // Swap top rows
1444 0 : tmp = topL;
1445 0 : topL = topR;
1446 0 : topR = tmp;
1447 : // Swap bottom rows
1448 0 : tmp = botL;
1449 0 : botL = botR;
1450 0 : botR = tmp;
1451 : }
1452 :
1453 : // load first 8 columns
1454 0 : load_buffer_8x8(topL, in0, stride, flipud, fliplr);
1455 0 : load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
1456 :
1457 : // load second 8 columns
1458 0 : load_buffer_8x8(topR, in1, stride, flipud, fliplr);
1459 0 : load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
1460 0 : }
1461 :
1462 0 : static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
1463 : __m128i *in1, int stride) {
1464 : // write first 8 columns
1465 0 : write_buffer_8x8(output, in0, stride);
1466 0 : write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
1467 : // write second 8 columns
1468 0 : output += 8;
1469 0 : write_buffer_8x8(output, in1, stride);
1470 0 : write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
1471 0 : }
1472 :
1473 0 : static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
1474 : __m128i tbuf[8];
1475 0 : array_transpose_8x8(res0, res0);
1476 0 : array_transpose_8x8(res1, tbuf);
1477 0 : array_transpose_8x8(res0 + 8, res1);
1478 0 : array_transpose_8x8(res1 + 8, res1 + 8);
1479 :
1480 0 : res0[8] = tbuf[0];
1481 0 : res0[9] = tbuf[1];
1482 0 : res0[10] = tbuf[2];
1483 0 : res0[11] = tbuf[3];
1484 0 : res0[12] = tbuf[4];
1485 0 : res0[13] = tbuf[5];
1486 0 : res0[14] = tbuf[6];
1487 0 : res0[15] = tbuf[7];
1488 0 : }
1489 :
1490 0 : static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
1491 : // perform rounding operations
1492 0 : right_shift_8x8(res0, 2);
1493 0 : right_shift_8x8(res0 + 8, 2);
1494 0 : right_shift_8x8(res1, 2);
1495 0 : right_shift_8x8(res1 + 8, 2);
1496 0 : }
1497 :
1498 0 : static void fdct16_8col(__m128i *in) {
1499 : // perform 16x16 1-D DCT for 8 columns
1500 : __m128i i[8], s[8], p[8], t[8], u[16], v[16];
1501 0 : const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1502 0 : const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1503 0 : const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1504 0 : const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1505 0 : const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1506 0 : const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1507 0 : const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
1508 0 : const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
1509 0 : const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
1510 0 : const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1511 0 : const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
1512 0 : const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
1513 0 : const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
1514 0 : const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
1515 0 : const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
1516 0 : const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
1517 0 : const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
1518 0 : const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
1519 0 : const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1520 :
1521 : // stage 1
1522 0 : i[0] = _mm_add_epi16(in[0], in[15]);
1523 0 : i[1] = _mm_add_epi16(in[1], in[14]);
1524 0 : i[2] = _mm_add_epi16(in[2], in[13]);
1525 0 : i[3] = _mm_add_epi16(in[3], in[12]);
1526 0 : i[4] = _mm_add_epi16(in[4], in[11]);
1527 0 : i[5] = _mm_add_epi16(in[5], in[10]);
1528 0 : i[6] = _mm_add_epi16(in[6], in[9]);
1529 0 : i[7] = _mm_add_epi16(in[7], in[8]);
1530 :
1531 0 : s[0] = _mm_sub_epi16(in[7], in[8]);
1532 0 : s[1] = _mm_sub_epi16(in[6], in[9]);
1533 0 : s[2] = _mm_sub_epi16(in[5], in[10]);
1534 0 : s[3] = _mm_sub_epi16(in[4], in[11]);
1535 0 : s[4] = _mm_sub_epi16(in[3], in[12]);
1536 0 : s[5] = _mm_sub_epi16(in[2], in[13]);
1537 0 : s[6] = _mm_sub_epi16(in[1], in[14]);
1538 0 : s[7] = _mm_sub_epi16(in[0], in[15]);
1539 :
1540 0 : p[0] = _mm_add_epi16(i[0], i[7]);
1541 0 : p[1] = _mm_add_epi16(i[1], i[6]);
1542 0 : p[2] = _mm_add_epi16(i[2], i[5]);
1543 0 : p[3] = _mm_add_epi16(i[3], i[4]);
1544 0 : p[4] = _mm_sub_epi16(i[3], i[4]);
1545 0 : p[5] = _mm_sub_epi16(i[2], i[5]);
1546 0 : p[6] = _mm_sub_epi16(i[1], i[6]);
1547 0 : p[7] = _mm_sub_epi16(i[0], i[7]);
1548 :
1549 0 : u[0] = _mm_add_epi16(p[0], p[3]);
1550 0 : u[1] = _mm_add_epi16(p[1], p[2]);
1551 0 : u[2] = _mm_sub_epi16(p[1], p[2]);
1552 0 : u[3] = _mm_sub_epi16(p[0], p[3]);
1553 :
1554 0 : v[0] = _mm_unpacklo_epi16(u[0], u[1]);
1555 0 : v[1] = _mm_unpackhi_epi16(u[0], u[1]);
1556 0 : v[2] = _mm_unpacklo_epi16(u[2], u[3]);
1557 0 : v[3] = _mm_unpackhi_epi16(u[2], u[3]);
1558 :
1559 0 : u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
1560 0 : u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
1561 0 : u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
1562 0 : u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
1563 0 : u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
1564 0 : u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
1565 0 : u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
1566 0 : u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
1567 :
1568 0 : v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1569 0 : v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1570 0 : v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1571 0 : v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1572 0 : v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1573 0 : v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1574 0 : v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1575 0 : v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1576 :
1577 0 : u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1578 0 : u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1579 0 : u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1580 0 : u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1581 0 : u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1582 0 : u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1583 0 : u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1584 0 : u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1585 :
1586 0 : in[0] = _mm_packs_epi32(u[0], u[1]);
1587 0 : in[4] = _mm_packs_epi32(u[4], u[5]);
1588 0 : in[8] = _mm_packs_epi32(u[2], u[3]);
1589 0 : in[12] = _mm_packs_epi32(u[6], u[7]);
1590 :
1591 0 : u[0] = _mm_unpacklo_epi16(p[5], p[6]);
1592 0 : u[1] = _mm_unpackhi_epi16(p[5], p[6]);
1593 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1594 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1595 0 : v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1596 0 : v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1597 :
1598 0 : u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1599 0 : u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1600 0 : u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1601 0 : u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1602 :
1603 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1604 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1605 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1606 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1607 :
1608 0 : u[0] = _mm_packs_epi32(v[0], v[1]);
1609 0 : u[1] = _mm_packs_epi32(v[2], v[3]);
1610 :
1611 0 : t[0] = _mm_add_epi16(p[4], u[0]);
1612 0 : t[1] = _mm_sub_epi16(p[4], u[0]);
1613 0 : t[2] = _mm_sub_epi16(p[7], u[1]);
1614 0 : t[3] = _mm_add_epi16(p[7], u[1]);
1615 :
1616 0 : u[0] = _mm_unpacklo_epi16(t[0], t[3]);
1617 0 : u[1] = _mm_unpackhi_epi16(t[0], t[3]);
1618 0 : u[2] = _mm_unpacklo_epi16(t[1], t[2]);
1619 0 : u[3] = _mm_unpackhi_epi16(t[1], t[2]);
1620 :
1621 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
1622 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
1623 0 : v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
1624 0 : v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
1625 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
1626 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
1627 0 : v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
1628 0 : v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
1629 :
1630 0 : u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1631 0 : u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1632 0 : u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1633 0 : u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1634 0 : u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1635 0 : u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1636 0 : u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1637 0 : u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1638 :
1639 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1640 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1641 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1642 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1643 0 : v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1644 0 : v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1645 0 : v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1646 0 : v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1647 :
1648 0 : in[2] = _mm_packs_epi32(v[0], v[1]);
1649 0 : in[6] = _mm_packs_epi32(v[4], v[5]);
1650 0 : in[10] = _mm_packs_epi32(v[2], v[3]);
1651 0 : in[14] = _mm_packs_epi32(v[6], v[7]);
1652 :
1653 : // stage 2
1654 0 : u[0] = _mm_unpacklo_epi16(s[2], s[5]);
1655 0 : u[1] = _mm_unpackhi_epi16(s[2], s[5]);
1656 0 : u[2] = _mm_unpacklo_epi16(s[3], s[4]);
1657 0 : u[3] = _mm_unpackhi_epi16(s[3], s[4]);
1658 :
1659 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1660 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1661 0 : v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1662 0 : v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1663 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1664 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1665 0 : v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1666 0 : v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1667 :
1668 0 : u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1669 0 : u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1670 0 : u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1671 0 : u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1672 0 : u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1673 0 : u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1674 0 : u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1675 0 : u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1676 :
1677 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1678 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1679 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1680 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1681 0 : v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1682 0 : v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1683 0 : v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1684 0 : v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1685 :
1686 0 : t[2] = _mm_packs_epi32(v[0], v[1]);
1687 0 : t[3] = _mm_packs_epi32(v[2], v[3]);
1688 0 : t[4] = _mm_packs_epi32(v[4], v[5]);
1689 0 : t[5] = _mm_packs_epi32(v[6], v[7]);
1690 :
1691 : // stage 3
1692 0 : p[0] = _mm_add_epi16(s[0], t[3]);
1693 0 : p[1] = _mm_add_epi16(s[1], t[2]);
1694 0 : p[2] = _mm_sub_epi16(s[1], t[2]);
1695 0 : p[3] = _mm_sub_epi16(s[0], t[3]);
1696 0 : p[4] = _mm_sub_epi16(s[7], t[4]);
1697 0 : p[5] = _mm_sub_epi16(s[6], t[5]);
1698 0 : p[6] = _mm_add_epi16(s[6], t[5]);
1699 0 : p[7] = _mm_add_epi16(s[7], t[4]);
1700 :
1701 : // stage 4
1702 0 : u[0] = _mm_unpacklo_epi16(p[1], p[6]);
1703 0 : u[1] = _mm_unpackhi_epi16(p[1], p[6]);
1704 0 : u[2] = _mm_unpacklo_epi16(p[2], p[5]);
1705 0 : u[3] = _mm_unpackhi_epi16(p[2], p[5]);
1706 :
1707 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
1708 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
1709 0 : v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
1710 0 : v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
1711 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
1712 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
1713 0 : v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
1714 0 : v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
1715 :
1716 0 : u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1717 0 : u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1718 0 : u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1719 0 : u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1720 0 : u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1721 0 : u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1722 0 : u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1723 0 : u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1724 :
1725 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1726 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1727 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1728 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1729 0 : v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1730 0 : v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1731 0 : v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1732 0 : v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1733 :
1734 0 : t[1] = _mm_packs_epi32(v[0], v[1]);
1735 0 : t[2] = _mm_packs_epi32(v[2], v[3]);
1736 0 : t[5] = _mm_packs_epi32(v[4], v[5]);
1737 0 : t[6] = _mm_packs_epi32(v[6], v[7]);
1738 :
1739 : // stage 5
1740 0 : s[0] = _mm_add_epi16(p[0], t[1]);
1741 0 : s[1] = _mm_sub_epi16(p[0], t[1]);
1742 0 : s[2] = _mm_sub_epi16(p[3], t[2]);
1743 0 : s[3] = _mm_add_epi16(p[3], t[2]);
1744 0 : s[4] = _mm_add_epi16(p[4], t[5]);
1745 0 : s[5] = _mm_sub_epi16(p[4], t[5]);
1746 0 : s[6] = _mm_sub_epi16(p[7], t[6]);
1747 0 : s[7] = _mm_add_epi16(p[7], t[6]);
1748 :
1749 : // stage 6
1750 0 : u[0] = _mm_unpacklo_epi16(s[0], s[7]);
1751 0 : u[1] = _mm_unpackhi_epi16(s[0], s[7]);
1752 0 : u[2] = _mm_unpacklo_epi16(s[1], s[6]);
1753 0 : u[3] = _mm_unpackhi_epi16(s[1], s[6]);
1754 0 : u[4] = _mm_unpacklo_epi16(s[2], s[5]);
1755 0 : u[5] = _mm_unpackhi_epi16(s[2], s[5]);
1756 0 : u[6] = _mm_unpacklo_epi16(s[3], s[4]);
1757 0 : u[7] = _mm_unpackhi_epi16(s[3], s[4]);
1758 :
1759 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
1760 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
1761 0 : v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
1762 0 : v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
1763 0 : v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
1764 0 : v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
1765 0 : v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
1766 0 : v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
1767 0 : v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
1768 0 : v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
1769 0 : v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
1770 0 : v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
1771 0 : v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
1772 0 : v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
1773 0 : v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
1774 0 : v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
1775 :
1776 0 : u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1777 0 : u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1778 0 : u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1779 0 : u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1780 0 : u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1781 0 : u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1782 0 : u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1783 0 : u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1784 0 : u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1785 0 : u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1786 0 : u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1787 0 : u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1788 0 : u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1789 0 : u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1790 0 : u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1791 0 : u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1792 :
1793 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1794 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1795 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1796 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1797 0 : v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1798 0 : v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1799 0 : v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1800 0 : v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1801 0 : v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1802 0 : v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1803 0 : v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1804 0 : v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1805 0 : v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1806 0 : v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1807 0 : v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1808 0 : v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1809 :
1810 0 : in[1] = _mm_packs_epi32(v[0], v[1]);
1811 0 : in[9] = _mm_packs_epi32(v[2], v[3]);
1812 0 : in[5] = _mm_packs_epi32(v[4], v[5]);
1813 0 : in[13] = _mm_packs_epi32(v[6], v[7]);
1814 0 : in[3] = _mm_packs_epi32(v[8], v[9]);
1815 0 : in[11] = _mm_packs_epi32(v[10], v[11]);
1816 0 : in[7] = _mm_packs_epi32(v[12], v[13]);
1817 0 : in[15] = _mm_packs_epi32(v[14], v[15]);
1818 0 : }
1819 :
1820 0 : static void fadst16_8col(__m128i *in) {
1821 : // perform 16x16 1-D ADST for 8 columns
1822 : __m128i s[16], x[16], u[32], v[32];
1823 0 : const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1824 0 : const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1825 0 : const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1826 0 : const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1827 0 : const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1828 0 : const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1829 0 : const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1830 0 : const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1831 0 : const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1832 0 : const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1833 0 : const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1834 0 : const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1835 0 : const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1836 0 : const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1837 0 : const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1838 0 : const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1839 0 : const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1840 0 : const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1841 0 : const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1842 0 : const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1843 0 : const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1844 0 : const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1845 0 : const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1846 0 : const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1847 0 : const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1848 0 : const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1849 0 : const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1850 0 : const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1851 0 : const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1852 0 : const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1853 0 : const __m128i kZero = _mm_set1_epi16(0);
1854 :
1855 0 : u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1856 0 : u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1857 0 : u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1858 0 : u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1859 0 : u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1860 0 : u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1861 0 : u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1862 0 : u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1863 0 : u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1864 0 : u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1865 0 : u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1866 0 : u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1867 0 : u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1868 0 : u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1869 0 : u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1870 0 : u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1871 :
1872 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1873 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1874 0 : v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1875 0 : v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1876 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1877 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1878 0 : v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1879 0 : v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1880 0 : v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1881 0 : v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1882 0 : v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1883 0 : v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1884 0 : v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1885 0 : v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1886 0 : v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1887 0 : v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1888 0 : v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1889 0 : v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1890 0 : v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1891 0 : v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1892 0 : v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1893 0 : v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1894 0 : v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1895 0 : v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1896 0 : v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1897 0 : v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1898 0 : v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1899 0 : v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1900 0 : v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1901 0 : v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1902 0 : v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1903 0 : v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1904 :
1905 0 : u[0] = _mm_add_epi32(v[0], v[16]);
1906 0 : u[1] = _mm_add_epi32(v[1], v[17]);
1907 0 : u[2] = _mm_add_epi32(v[2], v[18]);
1908 0 : u[3] = _mm_add_epi32(v[3], v[19]);
1909 0 : u[4] = _mm_add_epi32(v[4], v[20]);
1910 0 : u[5] = _mm_add_epi32(v[5], v[21]);
1911 0 : u[6] = _mm_add_epi32(v[6], v[22]);
1912 0 : u[7] = _mm_add_epi32(v[7], v[23]);
1913 0 : u[8] = _mm_add_epi32(v[8], v[24]);
1914 0 : u[9] = _mm_add_epi32(v[9], v[25]);
1915 0 : u[10] = _mm_add_epi32(v[10], v[26]);
1916 0 : u[11] = _mm_add_epi32(v[11], v[27]);
1917 0 : u[12] = _mm_add_epi32(v[12], v[28]);
1918 0 : u[13] = _mm_add_epi32(v[13], v[29]);
1919 0 : u[14] = _mm_add_epi32(v[14], v[30]);
1920 0 : u[15] = _mm_add_epi32(v[15], v[31]);
1921 0 : u[16] = _mm_sub_epi32(v[0], v[16]);
1922 0 : u[17] = _mm_sub_epi32(v[1], v[17]);
1923 0 : u[18] = _mm_sub_epi32(v[2], v[18]);
1924 0 : u[19] = _mm_sub_epi32(v[3], v[19]);
1925 0 : u[20] = _mm_sub_epi32(v[4], v[20]);
1926 0 : u[21] = _mm_sub_epi32(v[5], v[21]);
1927 0 : u[22] = _mm_sub_epi32(v[6], v[22]);
1928 0 : u[23] = _mm_sub_epi32(v[7], v[23]);
1929 0 : u[24] = _mm_sub_epi32(v[8], v[24]);
1930 0 : u[25] = _mm_sub_epi32(v[9], v[25]);
1931 0 : u[26] = _mm_sub_epi32(v[10], v[26]);
1932 0 : u[27] = _mm_sub_epi32(v[11], v[27]);
1933 0 : u[28] = _mm_sub_epi32(v[12], v[28]);
1934 0 : u[29] = _mm_sub_epi32(v[13], v[29]);
1935 0 : u[30] = _mm_sub_epi32(v[14], v[30]);
1936 0 : u[31] = _mm_sub_epi32(v[15], v[31]);
1937 :
1938 0 : v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1939 0 : v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1940 0 : v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1941 0 : v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1942 0 : v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1943 0 : v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1944 0 : v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1945 0 : v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1946 0 : v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1947 0 : v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1948 0 : v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1949 0 : v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1950 0 : v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1951 0 : v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1952 0 : v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1953 0 : v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1954 :
1955 0 : u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1956 0 : u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1957 0 : u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1958 0 : u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1959 0 : u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1960 0 : u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1961 0 : u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1962 0 : u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1963 0 : u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1964 0 : u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1965 0 : u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1966 0 : u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1967 0 : u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1968 0 : u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1969 0 : u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1970 0 : u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1971 :
1972 0 : v[0] = _mm_add_epi32(u[0], u[8]);
1973 0 : v[1] = _mm_add_epi32(u[1], u[9]);
1974 0 : v[2] = _mm_add_epi32(u[2], u[10]);
1975 0 : v[3] = _mm_add_epi32(u[3], u[11]);
1976 0 : v[4] = _mm_add_epi32(u[4], u[12]);
1977 0 : v[5] = _mm_add_epi32(u[5], u[13]);
1978 0 : v[6] = _mm_add_epi32(u[6], u[14]);
1979 0 : v[7] = _mm_add_epi32(u[7], u[15]);
1980 :
1981 0 : v[16] = _mm_add_epi32(v[0], v[4]);
1982 0 : v[17] = _mm_add_epi32(v[1], v[5]);
1983 0 : v[18] = _mm_add_epi32(v[2], v[6]);
1984 0 : v[19] = _mm_add_epi32(v[3], v[7]);
1985 0 : v[20] = _mm_sub_epi32(v[0], v[4]);
1986 0 : v[21] = _mm_sub_epi32(v[1], v[5]);
1987 0 : v[22] = _mm_sub_epi32(v[2], v[6]);
1988 0 : v[23] = _mm_sub_epi32(v[3], v[7]);
1989 0 : v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
1990 0 : v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
1991 0 : v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
1992 0 : v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
1993 0 : v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
1994 0 : v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
1995 0 : v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
1996 0 : v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
1997 0 : v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1998 0 : v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1999 0 : v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
2000 0 : v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
2001 0 : v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
2002 0 : v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
2003 0 : v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
2004 0 : v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
2005 0 : s[0] = _mm_packs_epi32(v[16], v[17]);
2006 0 : s[1] = _mm_packs_epi32(v[18], v[19]);
2007 0 : s[2] = _mm_packs_epi32(v[20], v[21]);
2008 0 : s[3] = _mm_packs_epi32(v[22], v[23]);
2009 :
2010 0 : v[8] = _mm_sub_epi32(u[0], u[8]);
2011 0 : v[9] = _mm_sub_epi32(u[1], u[9]);
2012 0 : v[10] = _mm_sub_epi32(u[2], u[10]);
2013 0 : v[11] = _mm_sub_epi32(u[3], u[11]);
2014 0 : v[12] = _mm_sub_epi32(u[4], u[12]);
2015 0 : v[13] = _mm_sub_epi32(u[5], u[13]);
2016 0 : v[14] = _mm_sub_epi32(u[6], u[14]);
2017 0 : v[15] = _mm_sub_epi32(u[7], u[15]);
2018 :
2019 0 : v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2020 0 : v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2021 0 : v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2022 0 : v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2023 0 : v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2024 0 : v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2025 0 : v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2026 0 : v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2027 :
2028 0 : v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2029 0 : v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2030 0 : v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2031 0 : v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2032 0 : v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2033 0 : v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2034 0 : v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2035 0 : v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2036 :
2037 0 : s[4] = _mm_packs_epi32(v[8], v[9]);
2038 0 : s[5] = _mm_packs_epi32(v[10], v[11]);
2039 0 : s[6] = _mm_packs_epi32(v[12], v[13]);
2040 0 : s[7] = _mm_packs_epi32(v[14], v[15]);
2041 : //
2042 :
2043 0 : s[8] = _mm_packs_epi32(u[16], u[17]);
2044 0 : s[9] = _mm_packs_epi32(u[18], u[19]);
2045 0 : s[10] = _mm_packs_epi32(u[20], u[21]);
2046 0 : s[11] = _mm_packs_epi32(u[22], u[23]);
2047 0 : s[12] = _mm_packs_epi32(u[24], u[25]);
2048 0 : s[13] = _mm_packs_epi32(u[26], u[27]);
2049 0 : s[14] = _mm_packs_epi32(u[28], u[29]);
2050 0 : s[15] = _mm_packs_epi32(u[30], u[31]);
2051 :
2052 : // stage 2
2053 0 : u[0] = _mm_unpacklo_epi16(s[8], s[9]);
2054 0 : u[1] = _mm_unpackhi_epi16(s[8], s[9]);
2055 0 : u[2] = _mm_unpacklo_epi16(s[10], s[11]);
2056 0 : u[3] = _mm_unpackhi_epi16(s[10], s[11]);
2057 0 : u[4] = _mm_unpacklo_epi16(s[12], s[13]);
2058 0 : u[5] = _mm_unpackhi_epi16(s[12], s[13]);
2059 0 : u[6] = _mm_unpacklo_epi16(s[14], s[15]);
2060 0 : u[7] = _mm_unpackhi_epi16(s[14], s[15]);
2061 :
2062 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2063 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2064 0 : v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2065 0 : v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2066 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2067 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
2068 0 : v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2069 0 : v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2070 0 : v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
2071 0 : v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
2072 0 : v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
2073 0 : v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
2074 0 : v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
2075 0 : v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
2076 0 : v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
2077 0 : v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
2078 :
2079 0 : u[0] = _mm_add_epi32(v[0], v[8]);
2080 0 : u[1] = _mm_add_epi32(v[1], v[9]);
2081 0 : u[2] = _mm_add_epi32(v[2], v[10]);
2082 0 : u[3] = _mm_add_epi32(v[3], v[11]);
2083 0 : u[4] = _mm_add_epi32(v[4], v[12]);
2084 0 : u[5] = _mm_add_epi32(v[5], v[13]);
2085 0 : u[6] = _mm_add_epi32(v[6], v[14]);
2086 0 : u[7] = _mm_add_epi32(v[7], v[15]);
2087 0 : u[8] = _mm_sub_epi32(v[0], v[8]);
2088 0 : u[9] = _mm_sub_epi32(v[1], v[9]);
2089 0 : u[10] = _mm_sub_epi32(v[2], v[10]);
2090 0 : u[11] = _mm_sub_epi32(v[3], v[11]);
2091 0 : u[12] = _mm_sub_epi32(v[4], v[12]);
2092 0 : u[13] = _mm_sub_epi32(v[5], v[13]);
2093 0 : u[14] = _mm_sub_epi32(v[6], v[14]);
2094 0 : u[15] = _mm_sub_epi32(v[7], v[15]);
2095 :
2096 0 : v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2097 0 : v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2098 0 : v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2099 0 : v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2100 0 : v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2101 0 : v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2102 0 : v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2103 0 : v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2104 :
2105 0 : u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2106 0 : u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2107 0 : u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2108 0 : u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2109 0 : u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2110 0 : u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2111 0 : u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2112 0 : u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2113 :
2114 0 : v[8] = _mm_add_epi32(u[0], u[4]);
2115 0 : v[9] = _mm_add_epi32(u[1], u[5]);
2116 0 : v[10] = _mm_add_epi32(u[2], u[6]);
2117 0 : v[11] = _mm_add_epi32(u[3], u[7]);
2118 0 : v[12] = _mm_sub_epi32(u[0], u[4]);
2119 0 : v[13] = _mm_sub_epi32(u[1], u[5]);
2120 0 : v[14] = _mm_sub_epi32(u[2], u[6]);
2121 0 : v[15] = _mm_sub_epi32(u[3], u[7]);
2122 :
2123 0 : v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2124 0 : v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2125 0 : v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2126 0 : v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2127 0 : v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2128 0 : v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2129 0 : v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2130 0 : v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2131 0 : v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2132 0 : v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2133 0 : v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2134 0 : v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2135 0 : v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2136 0 : v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2137 0 : v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2138 0 : v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2139 0 : s[8] = _mm_packs_epi32(v[8], v[9]);
2140 0 : s[9] = _mm_packs_epi32(v[10], v[11]);
2141 0 : s[10] = _mm_packs_epi32(v[12], v[13]);
2142 0 : s[11] = _mm_packs_epi32(v[14], v[15]);
2143 :
2144 0 : x[12] = _mm_packs_epi32(u[8], u[9]);
2145 0 : x[13] = _mm_packs_epi32(u[10], u[11]);
2146 0 : x[14] = _mm_packs_epi32(u[12], u[13]);
2147 0 : x[15] = _mm_packs_epi32(u[14], u[15]);
2148 :
2149 : // stage 3
2150 0 : u[0] = _mm_unpacklo_epi16(s[4], s[5]);
2151 0 : u[1] = _mm_unpackhi_epi16(s[4], s[5]);
2152 0 : u[2] = _mm_unpacklo_epi16(s[6], s[7]);
2153 0 : u[3] = _mm_unpackhi_epi16(s[6], s[7]);
2154 0 : u[4] = _mm_unpacklo_epi16(x[12], x[13]);
2155 0 : u[5] = _mm_unpackhi_epi16(x[12], x[13]);
2156 0 : u[6] = _mm_unpacklo_epi16(x[14], x[15]);
2157 0 : u[7] = _mm_unpackhi_epi16(x[14], x[15]);
2158 :
2159 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
2160 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
2161 0 : v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
2162 0 : v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
2163 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
2164 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
2165 0 : v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
2166 0 : v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
2167 0 : v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
2168 0 : v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
2169 0 : v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
2170 0 : v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
2171 0 : v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
2172 0 : v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
2173 0 : v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
2174 0 : v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
2175 :
2176 0 : u[0] = _mm_add_epi32(v[0], v[4]);
2177 0 : u[1] = _mm_add_epi32(v[1], v[5]);
2178 0 : u[2] = _mm_add_epi32(v[2], v[6]);
2179 0 : u[3] = _mm_add_epi32(v[3], v[7]);
2180 0 : u[4] = _mm_sub_epi32(v[0], v[4]);
2181 0 : u[5] = _mm_sub_epi32(v[1], v[5]);
2182 0 : u[6] = _mm_sub_epi32(v[2], v[6]);
2183 0 : u[7] = _mm_sub_epi32(v[3], v[7]);
2184 0 : u[8] = _mm_add_epi32(v[8], v[12]);
2185 0 : u[9] = _mm_add_epi32(v[9], v[13]);
2186 0 : u[10] = _mm_add_epi32(v[10], v[14]);
2187 0 : u[11] = _mm_add_epi32(v[11], v[15]);
2188 0 : u[12] = _mm_sub_epi32(v[8], v[12]);
2189 0 : u[13] = _mm_sub_epi32(v[9], v[13]);
2190 0 : u[14] = _mm_sub_epi32(v[10], v[14]);
2191 0 : u[15] = _mm_sub_epi32(v[11], v[15]);
2192 :
2193 0 : u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2194 0 : u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2195 0 : u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2196 0 : u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2197 0 : u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2198 0 : u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2199 0 : u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2200 0 : u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2201 0 : u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2202 0 : u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2203 0 : u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2204 0 : u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2205 0 : u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2206 0 : u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2207 0 : u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2208 0 : u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2209 :
2210 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2211 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2212 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2213 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2214 0 : v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2215 0 : v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2216 0 : v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2217 0 : v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2218 0 : v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2219 0 : v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2220 0 : v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2221 0 : v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2222 0 : v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2223 0 : v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2224 0 : v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2225 0 : v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2226 :
2227 0 : s[4] = _mm_packs_epi32(v[0], v[1]);
2228 0 : s[5] = _mm_packs_epi32(v[2], v[3]);
2229 0 : s[6] = _mm_packs_epi32(v[4], v[5]);
2230 0 : s[7] = _mm_packs_epi32(v[6], v[7]);
2231 :
2232 0 : s[12] = _mm_packs_epi32(v[8], v[9]);
2233 0 : s[13] = _mm_packs_epi32(v[10], v[11]);
2234 0 : s[14] = _mm_packs_epi32(v[12], v[13]);
2235 0 : s[15] = _mm_packs_epi32(v[14], v[15]);
2236 :
2237 : // stage 4
2238 0 : u[0] = _mm_unpacklo_epi16(s[2], s[3]);
2239 0 : u[1] = _mm_unpackhi_epi16(s[2], s[3]);
2240 0 : u[2] = _mm_unpacklo_epi16(s[6], s[7]);
2241 0 : u[3] = _mm_unpackhi_epi16(s[6], s[7]);
2242 0 : u[4] = _mm_unpacklo_epi16(s[10], s[11]);
2243 0 : u[5] = _mm_unpackhi_epi16(s[10], s[11]);
2244 0 : u[6] = _mm_unpacklo_epi16(s[14], s[15]);
2245 0 : u[7] = _mm_unpackhi_epi16(s[14], s[15]);
2246 :
2247 0 : v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
2248 0 : v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
2249 0 : v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
2250 0 : v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
2251 0 : v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2252 0 : v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2253 0 : v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2254 0 : v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2255 0 : v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
2256 0 : v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
2257 0 : v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
2258 0 : v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
2259 0 : v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
2260 0 : v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
2261 0 : v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
2262 0 : v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
2263 :
2264 0 : u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2265 0 : u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2266 0 : u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2267 0 : u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2268 0 : u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2269 0 : u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2270 0 : u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2271 0 : u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2272 0 : u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2273 0 : u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2274 0 : u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2275 0 : u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2276 0 : u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2277 0 : u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2278 0 : u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2279 0 : u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2280 :
2281 0 : v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2282 0 : v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2283 0 : v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2284 0 : v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2285 0 : v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2286 0 : v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2287 0 : v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2288 0 : v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2289 0 : v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2290 0 : v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2291 0 : v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2292 0 : v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2293 0 : v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2294 0 : v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2295 0 : v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2296 0 : v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2297 :
2298 0 : in[0] = s[0];
2299 0 : in[1] = _mm_sub_epi16(kZero, s[8]);
2300 0 : in[2] = s[12];
2301 0 : in[3] = _mm_sub_epi16(kZero, s[4]);
2302 0 : in[4] = _mm_packs_epi32(v[4], v[5]);
2303 0 : in[5] = _mm_packs_epi32(v[12], v[13]);
2304 0 : in[6] = _mm_packs_epi32(v[8], v[9]);
2305 0 : in[7] = _mm_packs_epi32(v[0], v[1]);
2306 0 : in[8] = _mm_packs_epi32(v[2], v[3]);
2307 0 : in[9] = _mm_packs_epi32(v[10], v[11]);
2308 0 : in[10] = _mm_packs_epi32(v[14], v[15]);
2309 0 : in[11] = _mm_packs_epi32(v[6], v[7]);
2310 0 : in[12] = s[5];
2311 0 : in[13] = _mm_sub_epi16(kZero, s[13]);
2312 0 : in[14] = s[9];
2313 0 : in[15] = _mm_sub_epi16(kZero, s[1]);
2314 0 : }
2315 :
// Forward 16x16 DCT pass: applies the 16-point column DCT to each 8-column
// half of the block, then transposes the 16x16 result so the next pass can
// process the other dimension with the same column code.
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  array_transpose_16x16(in0, in1);
}
2321 :
// Forward 16x16 ADST pass: applies the 16-point column ADST to each 8-column
// half of the block, then transposes the 16x16 result for the next pass.
static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  array_transpose_16x16(in0, in1);
}
2327 :
2328 : #if CONFIG_EXT_TX
// Forward 16x16 identity-transform pass: applies the 16-point identity
// (scaling) transform to each 8-column half, then transposes the result so it
// matches the layout produced by the DCT/ADST passes above.
static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
  idtx16_8col(in0);
  idtx16_8col(in1);
  array_transpose_16x16(in0, in1);
}
2334 : #endif // CONFIG_EXT_TX
2335 :
2336 0 : void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
2337 : int tx_type) {
2338 : __m128i in0[16], in1[16];
2339 :
2340 0 : switch (tx_type) {
2341 : case DCT_DCT:
2342 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2343 0 : fdct16_sse2(in0, in1);
2344 0 : right_shift_16x16(in0, in1);
2345 0 : fdct16_sse2(in0, in1);
2346 0 : write_buffer_16x16(output, in0, in1, 16);
2347 0 : break;
2348 : case ADST_DCT:
2349 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2350 0 : fadst16_sse2(in0, in1);
2351 0 : right_shift_16x16(in0, in1);
2352 0 : fdct16_sse2(in0, in1);
2353 0 : write_buffer_16x16(output, in0, in1, 16);
2354 0 : break;
2355 : case DCT_ADST:
2356 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2357 0 : fdct16_sse2(in0, in1);
2358 0 : right_shift_16x16(in0, in1);
2359 0 : fadst16_sse2(in0, in1);
2360 0 : write_buffer_16x16(output, in0, in1, 16);
2361 0 : break;
2362 : case ADST_ADST:
2363 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2364 0 : fadst16_sse2(in0, in1);
2365 0 : right_shift_16x16(in0, in1);
2366 0 : fadst16_sse2(in0, in1);
2367 0 : write_buffer_16x16(output, in0, in1, 16);
2368 0 : break;
2369 : #if CONFIG_EXT_TX
2370 : case FLIPADST_DCT:
2371 0 : load_buffer_16x16(input, in0, in1, stride, 1, 0);
2372 0 : fadst16_sse2(in0, in1);
2373 0 : right_shift_16x16(in0, in1);
2374 0 : fdct16_sse2(in0, in1);
2375 0 : write_buffer_16x16(output, in0, in1, 16);
2376 0 : break;
2377 : case DCT_FLIPADST:
2378 0 : load_buffer_16x16(input, in0, in1, stride, 0, 1);
2379 0 : fdct16_sse2(in0, in1);
2380 0 : right_shift_16x16(in0, in1);
2381 0 : fadst16_sse2(in0, in1);
2382 0 : write_buffer_16x16(output, in0, in1, 16);
2383 0 : break;
2384 : case FLIPADST_FLIPADST:
2385 0 : load_buffer_16x16(input, in0, in1, stride, 1, 1);
2386 0 : fadst16_sse2(in0, in1);
2387 0 : right_shift_16x16(in0, in1);
2388 0 : fadst16_sse2(in0, in1);
2389 0 : write_buffer_16x16(output, in0, in1, 16);
2390 0 : break;
2391 : case ADST_FLIPADST:
2392 0 : load_buffer_16x16(input, in0, in1, stride, 0, 1);
2393 0 : fadst16_sse2(in0, in1);
2394 0 : right_shift_16x16(in0, in1);
2395 0 : fadst16_sse2(in0, in1);
2396 0 : write_buffer_16x16(output, in0, in1, 16);
2397 0 : break;
2398 : case FLIPADST_ADST:
2399 0 : load_buffer_16x16(input, in0, in1, stride, 1, 0);
2400 0 : fadst16_sse2(in0, in1);
2401 0 : right_shift_16x16(in0, in1);
2402 0 : fadst16_sse2(in0, in1);
2403 0 : write_buffer_16x16(output, in0, in1, 16);
2404 0 : break;
2405 : case IDTX:
2406 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2407 0 : fidtx16_sse2(in0, in1);
2408 0 : right_shift_16x16(in0, in1);
2409 0 : fidtx16_sse2(in0, in1);
2410 0 : write_buffer_16x16(output, in0, in1, 16);
2411 0 : break;
2412 : case V_DCT:
2413 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2414 0 : fdct16_sse2(in0, in1);
2415 0 : right_shift_16x16(in0, in1);
2416 0 : fidtx16_sse2(in0, in1);
2417 0 : write_buffer_16x16(output, in0, in1, 16);
2418 0 : break;
2419 : case H_DCT:
2420 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2421 0 : fidtx16_sse2(in0, in1);
2422 0 : right_shift_16x16(in0, in1);
2423 0 : fdct16_sse2(in0, in1);
2424 0 : write_buffer_16x16(output, in0, in1, 16);
2425 0 : break;
2426 : case V_ADST:
2427 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2428 0 : fadst16_sse2(in0, in1);
2429 0 : right_shift_16x16(in0, in1);
2430 0 : fidtx16_sse2(in0, in1);
2431 0 : write_buffer_16x16(output, in0, in1, 16);
2432 0 : break;
2433 : case H_ADST:
2434 0 : load_buffer_16x16(input, in0, in1, stride, 0, 0);
2435 0 : fidtx16_sse2(in0, in1);
2436 0 : right_shift_16x16(in0, in1);
2437 0 : fadst16_sse2(in0, in1);
2438 0 : write_buffer_16x16(output, in0, in1, 16);
2439 0 : break;
2440 : case V_FLIPADST:
2441 0 : load_buffer_16x16(input, in0, in1, stride, 1, 0);
2442 0 : fadst16_sse2(in0, in1);
2443 0 : right_shift_16x16(in0, in1);
2444 0 : fidtx16_sse2(in0, in1);
2445 0 : write_buffer_16x16(output, in0, in1, 16);
2446 0 : break;
2447 : case H_FLIPADST:
2448 0 : load_buffer_16x16(input, in0, in1, stride, 0, 1);
2449 0 : fidtx16_sse2(in0, in1);
2450 0 : right_shift_16x16(in0, in1);
2451 0 : fadst16_sse2(in0, in1);
2452 0 : write_buffer_16x16(output, in0, in1, 16);
2453 0 : break;
2454 : #endif // CONFIG_EXT_TX
2455 0 : default: assert(0); break;
2456 : }
2457 0 : }
2458 :
2459 0 : static INLINE void prepare_4x8_row_first(__m128i *in) {
2460 0 : in[0] = _mm_unpacklo_epi64(in[0], in[2]);
2461 0 : in[1] = _mm_unpacklo_epi64(in[1], in[3]);
2462 0 : transpose_4x4(in);
2463 0 : in[4] = _mm_unpacklo_epi64(in[4], in[6]);
2464 0 : in[5] = _mm_unpacklo_epi64(in[5], in[7]);
2465 0 : transpose_4x4(in + 4);
2466 0 : }
2467 :
2468 : // Load input into the left-hand half of in (ie, into lanes 0..3 of
2469 : // each element of in). The right hand half (lanes 4..7) should be
2470 : // treated as being filled with "don't care" values.
2471 0 : static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
2472 : int stride, int flipud, int fliplr) {
2473 0 : const int shift = 2;
2474 0 : if (!flipud) {
2475 0 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
2476 0 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
2477 0 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
2478 0 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
2479 0 : in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
2480 0 : in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
2481 0 : in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
2482 0 : in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
2483 : } else {
2484 0 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
2485 0 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
2486 0 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
2487 0 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
2488 0 : in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
2489 0 : in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
2490 0 : in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
2491 0 : in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
2492 : }
2493 :
2494 0 : if (fliplr) {
2495 0 : in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
2496 0 : in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
2497 0 : in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
2498 0 : in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
2499 0 : in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
2500 0 : in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
2501 0 : in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
2502 0 : in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
2503 : }
2504 :
2505 0 : in[0] = _mm_slli_epi16(in[0], shift);
2506 0 : in[1] = _mm_slli_epi16(in[1], shift);
2507 0 : in[2] = _mm_slli_epi16(in[2], shift);
2508 0 : in[3] = _mm_slli_epi16(in[3], shift);
2509 0 : in[4] = _mm_slli_epi16(in[4], shift);
2510 0 : in[5] = _mm_slli_epi16(in[5], shift);
2511 0 : in[6] = _mm_slli_epi16(in[6], shift);
2512 0 : in[7] = _mm_slli_epi16(in[7], shift);
2513 :
2514 0 : scale_sqrt2_8x4(in);
2515 0 : scale_sqrt2_8x4(in + 4);
2516 0 : prepare_4x8_row_first(in);
2517 0 : }
2518 :
2519 0 : static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
2520 : __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67;
2521 0 : const int shift = 1;
2522 :
2523 : // revert the 8x8 txfm's transpose
2524 0 : array_transpose_8x8(res, res);
2525 :
2526 0 : in01 = _mm_unpacklo_epi64(res[0], res[1]);
2527 0 : in23 = _mm_unpacklo_epi64(res[2], res[3]);
2528 0 : in45 = _mm_unpacklo_epi64(res[4], res[5]);
2529 0 : in67 = _mm_unpacklo_epi64(res[6], res[7]);
2530 :
2531 0 : sign01 = _mm_srai_epi16(in01, 15);
2532 0 : sign23 = _mm_srai_epi16(in23, 15);
2533 0 : sign45 = _mm_srai_epi16(in45, 15);
2534 0 : sign67 = _mm_srai_epi16(in67, 15);
2535 :
2536 0 : in01 = _mm_sub_epi16(in01, sign01);
2537 0 : in23 = _mm_sub_epi16(in23, sign23);
2538 0 : in45 = _mm_sub_epi16(in45, sign45);
2539 0 : in67 = _mm_sub_epi16(in67, sign67);
2540 :
2541 0 : in01 = _mm_srai_epi16(in01, shift);
2542 0 : in23 = _mm_srai_epi16(in23, shift);
2543 0 : in45 = _mm_srai_epi16(in45, shift);
2544 0 : in67 = _mm_srai_epi16(in67, shift);
2545 :
2546 0 : store_output(&in01, (output + 0 * 8));
2547 0 : store_output(&in23, (output + 1 * 8));
2548 0 : store_output(&in45, (output + 2 * 8));
2549 0 : store_output(&in67, (output + 3 * 8));
2550 0 : }
2551 :
2552 0 : void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
2553 : int tx_type) {
2554 : __m128i in[8];
2555 :
2556 0 : switch (tx_type) {
2557 : case DCT_DCT:
2558 0 : load_buffer_4x8(input, in, stride, 0, 0);
2559 0 : fdct4_sse2(in);
2560 0 : fdct4_sse2(in + 4);
2561 0 : fdct8_sse2(in);
2562 0 : break;
2563 : case ADST_DCT:
2564 0 : load_buffer_4x8(input, in, stride, 0, 0);
2565 0 : fdct4_sse2(in);
2566 0 : fdct4_sse2(in + 4);
2567 0 : fadst8_sse2(in);
2568 0 : break;
2569 : case DCT_ADST:
2570 0 : load_buffer_4x8(input, in, stride, 0, 0);
2571 0 : fadst4_sse2(in);
2572 0 : fadst4_sse2(in + 4);
2573 0 : fdct8_sse2(in);
2574 0 : break;
2575 : case ADST_ADST:
2576 0 : load_buffer_4x8(input, in, stride, 0, 0);
2577 0 : fadst4_sse2(in);
2578 0 : fadst4_sse2(in + 4);
2579 0 : fadst8_sse2(in);
2580 0 : break;
2581 : #if CONFIG_EXT_TX
2582 : case FLIPADST_DCT:
2583 0 : load_buffer_4x8(input, in, stride, 1, 0);
2584 0 : fdct4_sse2(in);
2585 0 : fdct4_sse2(in + 4);
2586 0 : fadst8_sse2(in);
2587 0 : break;
2588 : case DCT_FLIPADST:
2589 0 : load_buffer_4x8(input, in, stride, 0, 1);
2590 0 : fadst4_sse2(in);
2591 0 : fadst4_sse2(in + 4);
2592 0 : fdct8_sse2(in);
2593 0 : break;
2594 : case FLIPADST_FLIPADST:
2595 0 : load_buffer_4x8(input, in, stride, 1, 1);
2596 0 : fadst4_sse2(in);
2597 0 : fadst4_sse2(in + 4);
2598 0 : fadst8_sse2(in);
2599 0 : break;
2600 : case ADST_FLIPADST:
2601 0 : load_buffer_4x8(input, in, stride, 0, 1);
2602 0 : fadst4_sse2(in);
2603 0 : fadst4_sse2(in + 4);
2604 0 : fadst8_sse2(in);
2605 0 : break;
2606 : case FLIPADST_ADST:
2607 0 : load_buffer_4x8(input, in, stride, 1, 0);
2608 0 : fadst4_sse2(in);
2609 0 : fadst4_sse2(in + 4);
2610 0 : fadst8_sse2(in);
2611 0 : break;
2612 : case IDTX:
2613 0 : load_buffer_4x8(input, in, stride, 0, 0);
2614 0 : fidtx4_sse2(in);
2615 0 : fidtx4_sse2(in + 4);
2616 0 : fidtx8_sse2(in);
2617 0 : break;
2618 : case V_DCT:
2619 0 : load_buffer_4x8(input, in, stride, 0, 0);
2620 0 : fidtx4_sse2(in);
2621 0 : fidtx4_sse2(in + 4);
2622 0 : fdct8_sse2(in);
2623 0 : break;
2624 : case H_DCT:
2625 0 : load_buffer_4x8(input, in, stride, 0, 0);
2626 0 : fdct4_sse2(in);
2627 0 : fdct4_sse2(in + 4);
2628 0 : fidtx8_sse2(in);
2629 0 : break;
2630 : case V_ADST:
2631 0 : load_buffer_4x8(input, in, stride, 0, 0);
2632 0 : fidtx4_sse2(in);
2633 0 : fidtx4_sse2(in + 4);
2634 0 : fadst8_sse2(in);
2635 0 : break;
2636 : case H_ADST:
2637 0 : load_buffer_4x8(input, in, stride, 0, 0);
2638 0 : fadst4_sse2(in);
2639 0 : fadst4_sse2(in + 4);
2640 0 : fidtx8_sse2(in);
2641 0 : break;
2642 : case V_FLIPADST:
2643 0 : load_buffer_4x8(input, in, stride, 1, 0);
2644 0 : fidtx4_sse2(in);
2645 0 : fidtx4_sse2(in + 4);
2646 0 : fadst8_sse2(in);
2647 0 : break;
2648 : case H_FLIPADST:
2649 0 : load_buffer_4x8(input, in, stride, 0, 1);
2650 0 : fadst4_sse2(in);
2651 0 : fadst4_sse2(in + 4);
2652 0 : fidtx8_sse2(in);
2653 0 : break;
2654 : #endif
2655 0 : default: assert(0); break;
2656 : }
2657 0 : write_buffer_4x8(output, in);
2658 0 : }
2659 :
2660 : // Load input into the left-hand half of in (ie, into lanes 0..3 of
2661 : // each element of in). The right hand half (lanes 4..7) should be
2662 : // treated as being filled with "don't care" values.
2663 : // The input is split horizontally into two 4x4
2664 : // chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4
2665 : // block of 'in' and 'r' is stored in the bottom-left block.
2666 : // This is to allow us to reuse 4x4 transforms.
2667 0 : static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
2668 : int stride, int flipud, int fliplr) {
2669 0 : const int shift = 2;
2670 0 : if (!flipud) {
2671 0 : in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
2672 0 : in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
2673 0 : in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
2674 0 : in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
2675 : } else {
2676 0 : in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
2677 0 : in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
2678 0 : in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
2679 0 : in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
2680 : }
2681 :
2682 0 : if (fliplr) {
2683 0 : in[0] = mm_reverse_epi16(in[0]);
2684 0 : in[1] = mm_reverse_epi16(in[1]);
2685 0 : in[2] = mm_reverse_epi16(in[2]);
2686 0 : in[3] = mm_reverse_epi16(in[3]);
2687 : }
2688 :
2689 0 : in[0] = _mm_slli_epi16(in[0], shift);
2690 0 : in[1] = _mm_slli_epi16(in[1], shift);
2691 0 : in[2] = _mm_slli_epi16(in[2], shift);
2692 0 : in[3] = _mm_slli_epi16(in[3], shift);
2693 :
2694 0 : scale_sqrt2_8x4(in);
2695 :
2696 0 : in[4] = _mm_shuffle_epi32(in[0], 0xe);
2697 0 : in[5] = _mm_shuffle_epi32(in[1], 0xe);
2698 0 : in[6] = _mm_shuffle_epi32(in[2], 0xe);
2699 0 : in[7] = _mm_shuffle_epi32(in[3], 0xe);
2700 0 : }
2701 :
2702 0 : static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
2703 : __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3;
2704 0 : const int shift = 1;
2705 0 : sign0 = _mm_srai_epi16(res[0], 15);
2706 0 : sign1 = _mm_srai_epi16(res[1], 15);
2707 0 : sign2 = _mm_srai_epi16(res[2], 15);
2708 0 : sign3 = _mm_srai_epi16(res[3], 15);
2709 :
2710 0 : out0 = _mm_sub_epi16(res[0], sign0);
2711 0 : out1 = _mm_sub_epi16(res[1], sign1);
2712 0 : out2 = _mm_sub_epi16(res[2], sign2);
2713 0 : out3 = _mm_sub_epi16(res[3], sign3);
2714 :
2715 0 : out0 = _mm_srai_epi16(out0, shift);
2716 0 : out1 = _mm_srai_epi16(out1, shift);
2717 0 : out2 = _mm_srai_epi16(out2, shift);
2718 0 : out3 = _mm_srai_epi16(out3, shift);
2719 :
2720 0 : store_output(&out0, (output + 0 * 8));
2721 0 : store_output(&out1, (output + 1 * 8));
2722 0 : store_output(&out2, (output + 2 * 8));
2723 0 : store_output(&out3, (output + 3 * 8));
2724 0 : }
2725 :
2726 0 : void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
2727 : int tx_type) {
2728 : __m128i in[8];
2729 :
2730 0 : switch (tx_type) {
2731 : case DCT_DCT:
2732 0 : load_buffer_8x4(input, in, stride, 0, 0);
2733 0 : fdct4_sse2(in);
2734 0 : fdct4_sse2(in + 4);
2735 0 : fdct8_sse2(in);
2736 0 : break;
2737 : case ADST_DCT:
2738 0 : load_buffer_8x4(input, in, stride, 0, 0);
2739 0 : fadst4_sse2(in);
2740 0 : fadst4_sse2(in + 4);
2741 0 : fdct8_sse2(in);
2742 0 : break;
2743 : case DCT_ADST:
2744 0 : load_buffer_8x4(input, in, stride, 0, 0);
2745 0 : fdct4_sse2(in);
2746 0 : fdct4_sse2(in + 4);
2747 0 : fadst8_sse2(in);
2748 0 : break;
2749 : case ADST_ADST:
2750 0 : load_buffer_8x4(input, in, stride, 0, 0);
2751 0 : fadst4_sse2(in);
2752 0 : fadst4_sse2(in + 4);
2753 0 : fadst8_sse2(in);
2754 0 : break;
2755 : #if CONFIG_EXT_TX
2756 : case FLIPADST_DCT:
2757 0 : load_buffer_8x4(input, in, stride, 1, 0);
2758 0 : fadst4_sse2(in);
2759 0 : fadst4_sse2(in + 4);
2760 0 : fdct8_sse2(in);
2761 0 : break;
2762 : case DCT_FLIPADST:
2763 0 : load_buffer_8x4(input, in, stride, 0, 1);
2764 0 : fdct4_sse2(in);
2765 0 : fdct4_sse2(in + 4);
2766 0 : fadst8_sse2(in);
2767 0 : break;
2768 : case FLIPADST_FLIPADST:
2769 0 : load_buffer_8x4(input, in, stride, 1, 1);
2770 0 : fadst4_sse2(in);
2771 0 : fadst4_sse2(in + 4);
2772 0 : fadst8_sse2(in);
2773 0 : break;
2774 : case ADST_FLIPADST:
2775 0 : load_buffer_8x4(input, in, stride, 0, 1);
2776 0 : fadst4_sse2(in);
2777 0 : fadst4_sse2(in + 4);
2778 0 : fadst8_sse2(in);
2779 0 : break;
2780 : case FLIPADST_ADST:
2781 0 : load_buffer_8x4(input, in, stride, 1, 0);
2782 0 : fadst4_sse2(in);
2783 0 : fadst4_sse2(in + 4);
2784 0 : fadst8_sse2(in);
2785 0 : break;
2786 : case IDTX:
2787 0 : load_buffer_8x4(input, in, stride, 0, 0);
2788 0 : fidtx4_sse2(in);
2789 0 : fidtx4_sse2(in + 4);
2790 0 : fidtx8_sse2(in);
2791 0 : break;
2792 : case V_DCT:
2793 0 : load_buffer_8x4(input, in, stride, 0, 0);
2794 0 : fdct4_sse2(in);
2795 0 : fdct4_sse2(in + 4);
2796 0 : fidtx8_sse2(in);
2797 0 : break;
2798 : case H_DCT:
2799 0 : load_buffer_8x4(input, in, stride, 0, 0);
2800 0 : fidtx4_sse2(in);
2801 0 : fidtx4_sse2(in + 4);
2802 0 : fdct8_sse2(in);
2803 0 : break;
2804 : case V_ADST:
2805 0 : load_buffer_8x4(input, in, stride, 0, 0);
2806 0 : fadst4_sse2(in);
2807 0 : fadst4_sse2(in + 4);
2808 0 : fidtx8_sse2(in);
2809 0 : break;
2810 : case H_ADST:
2811 0 : load_buffer_8x4(input, in, stride, 0, 0);
2812 0 : fidtx4_sse2(in);
2813 0 : fidtx4_sse2(in + 4);
2814 0 : fadst8_sse2(in);
2815 0 : break;
2816 : case V_FLIPADST:
2817 0 : load_buffer_8x4(input, in, stride, 1, 0);
2818 0 : fadst4_sse2(in);
2819 0 : fadst4_sse2(in + 4);
2820 0 : fidtx8_sse2(in);
2821 0 : break;
2822 : case H_FLIPADST:
2823 0 : load_buffer_8x4(input, in, stride, 0, 1);
2824 0 : fidtx4_sse2(in);
2825 0 : fidtx4_sse2(in + 4);
2826 0 : fadst8_sse2(in);
2827 0 : break;
2828 : #endif
2829 0 : default: assert(0); break;
2830 : }
2831 0 : write_buffer_8x4(output, in);
2832 0 : }
2833 :
2834 0 : static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
2835 : int stride, int flipud, int fliplr) {
2836 : // Load 2 8x8 blocks
2837 0 : const int16_t *t = input;
2838 0 : const int16_t *b = input + 8 * stride;
2839 :
2840 0 : if (flipud) {
2841 0 : const int16_t *const tmp = t;
2842 0 : t = b;
2843 0 : b = tmp;
2844 : }
2845 :
2846 0 : load_buffer_8x8(t, in, stride, flipud, fliplr);
2847 0 : scale_sqrt2_8x8(in);
2848 0 : load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
2849 0 : scale_sqrt2_8x8(in + 8);
2850 0 : }
2851 :
2852 0 : static INLINE void round_power_of_two_signed(__m128i *x, int n) {
2853 0 : const __m128i rounding = _mm_set1_epi16((1 << n) >> 1);
2854 0 : const __m128i sign = _mm_srai_epi16(*x, 15);
2855 0 : const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign);
2856 0 : *x = _mm_srai_epi16(res, n);
2857 0 : }
2858 :
// Applies the signed rounding shift to all 16 registers of an 8x16 block.
static void row_8x16_rounding(__m128i *in, int bits) {
  __m128i *const end = in + 16;
  while (in < end) {
    round_power_of_two_signed(in++, bits);
  }
}
2865 :
2866 0 : void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
2867 : int tx_type) {
2868 : __m128i in[16];
2869 :
2870 0 : __m128i *const t = in; // Alias to top 8x8 sub block
2871 0 : __m128i *const b = in + 8; // Alias to bottom 8x8 sub block
2872 :
2873 0 : switch (tx_type) {
2874 : case DCT_DCT:
2875 0 : load_buffer_8x16(input, in, stride, 0, 0);
2876 0 : array_transpose_8x8(t, t);
2877 0 : array_transpose_8x8(b, b);
2878 0 : fdct8_sse2(t);
2879 0 : fdct8_sse2(b);
2880 0 : row_8x16_rounding(in, 2);
2881 0 : fdct16_8col(in);
2882 0 : break;
2883 : case ADST_DCT:
2884 0 : load_buffer_8x16(input, in, stride, 0, 0);
2885 0 : array_transpose_8x8(t, t);
2886 0 : array_transpose_8x8(b, b);
2887 0 : fdct8_sse2(t);
2888 0 : fdct8_sse2(b);
2889 0 : row_8x16_rounding(in, 2);
2890 0 : fadst16_8col(in);
2891 0 : break;
2892 : case DCT_ADST:
2893 0 : load_buffer_8x16(input, in, stride, 0, 0);
2894 0 : array_transpose_8x8(t, t);
2895 0 : array_transpose_8x8(b, b);
2896 0 : fadst8_sse2(t);
2897 0 : fadst8_sse2(b);
2898 0 : row_8x16_rounding(in, 2);
2899 0 : fdct16_8col(in);
2900 0 : break;
2901 : case ADST_ADST:
2902 0 : load_buffer_8x16(input, in, stride, 0, 0);
2903 0 : array_transpose_8x8(t, t);
2904 0 : array_transpose_8x8(b, b);
2905 0 : fadst8_sse2(t);
2906 0 : fadst8_sse2(b);
2907 0 : row_8x16_rounding(in, 2);
2908 0 : fadst16_8col(in);
2909 0 : break;
2910 : #if CONFIG_EXT_TX
2911 : case FLIPADST_DCT:
2912 0 : load_buffer_8x16(input, in, stride, 1, 0);
2913 0 : array_transpose_8x8(t, t);
2914 0 : array_transpose_8x8(b, b);
2915 0 : fdct8_sse2(t);
2916 0 : fdct8_sse2(b);
2917 0 : row_8x16_rounding(in, 2);
2918 0 : fadst16_8col(in);
2919 0 : break;
2920 : case DCT_FLIPADST:
2921 0 : load_buffer_8x16(input, in, stride, 0, 1);
2922 0 : array_transpose_8x8(t, t);
2923 0 : array_transpose_8x8(b, b);
2924 0 : fadst8_sse2(t);
2925 0 : fadst8_sse2(b);
2926 0 : row_8x16_rounding(in, 2);
2927 0 : fdct16_8col(in);
2928 0 : break;
2929 : case FLIPADST_FLIPADST:
2930 0 : load_buffer_8x16(input, in, stride, 1, 1);
2931 0 : array_transpose_8x8(t, t);
2932 0 : array_transpose_8x8(b, b);
2933 0 : fadst8_sse2(t);
2934 0 : fadst8_sse2(b);
2935 0 : row_8x16_rounding(in, 2);
2936 0 : fadst16_8col(in);
2937 0 : break;
2938 : case ADST_FLIPADST:
2939 0 : load_buffer_8x16(input, in, stride, 0, 1);
2940 0 : array_transpose_8x8(t, t);
2941 0 : array_transpose_8x8(b, b);
2942 0 : fadst8_sse2(t);
2943 0 : fadst8_sse2(b);
2944 0 : row_8x16_rounding(in, 2);
2945 0 : fadst16_8col(in);
2946 0 : break;
2947 : case FLIPADST_ADST:
2948 0 : load_buffer_8x16(input, in, stride, 1, 0);
2949 0 : array_transpose_8x8(t, t);
2950 0 : array_transpose_8x8(b, b);
2951 0 : fadst8_sse2(t);
2952 0 : fadst8_sse2(b);
2953 0 : row_8x16_rounding(in, 2);
2954 0 : fadst16_8col(in);
2955 0 : break;
2956 : case IDTX:
2957 0 : load_buffer_8x16(input, in, stride, 0, 0);
2958 0 : array_transpose_8x8(t, t);
2959 0 : array_transpose_8x8(b, b);
2960 0 : fidtx8_sse2(t);
2961 0 : fidtx8_sse2(b);
2962 0 : row_8x16_rounding(in, 2);
2963 0 : idtx16_8col(in);
2964 0 : break;
2965 : case V_DCT:
2966 0 : load_buffer_8x16(input, in, stride, 0, 0);
2967 0 : array_transpose_8x8(t, t);
2968 0 : array_transpose_8x8(b, b);
2969 0 : fidtx8_sse2(t);
2970 0 : fidtx8_sse2(b);
2971 0 : row_8x16_rounding(in, 2);
2972 0 : fdct16_8col(in);
2973 0 : break;
2974 : case H_DCT:
2975 0 : load_buffer_8x16(input, in, stride, 0, 0);
2976 0 : array_transpose_8x8(t, t);
2977 0 : array_transpose_8x8(b, b);
2978 0 : fdct8_sse2(t);
2979 0 : fdct8_sse2(b);
2980 0 : row_8x16_rounding(in, 2);
2981 0 : idtx16_8col(in);
2982 0 : break;
2983 : case V_ADST:
2984 0 : load_buffer_8x16(input, in, stride, 0, 0);
2985 0 : array_transpose_8x8(t, t);
2986 0 : array_transpose_8x8(b, b);
2987 0 : fidtx8_sse2(t);
2988 0 : fidtx8_sse2(b);
2989 0 : row_8x16_rounding(in, 2);
2990 0 : fadst16_8col(in);
2991 0 : break;
2992 : case H_ADST:
2993 0 : load_buffer_8x16(input, in, stride, 0, 0);
2994 0 : array_transpose_8x8(t, t);
2995 0 : array_transpose_8x8(b, b);
2996 0 : fadst8_sse2(t);
2997 0 : fadst8_sse2(b);
2998 0 : row_8x16_rounding(in, 2);
2999 0 : idtx16_8col(in);
3000 0 : break;
3001 : case V_FLIPADST:
3002 0 : load_buffer_8x16(input, in, stride, 1, 0);
3003 0 : array_transpose_8x8(t, t);
3004 0 : array_transpose_8x8(b, b);
3005 0 : fidtx8_sse2(t);
3006 0 : fidtx8_sse2(b);
3007 0 : row_8x16_rounding(in, 2);
3008 0 : fadst16_8col(in);
3009 0 : break;
3010 : case H_FLIPADST:
3011 0 : load_buffer_8x16(input, in, stride, 0, 1);
3012 0 : array_transpose_8x8(t, t);
3013 0 : array_transpose_8x8(b, b);
3014 0 : fadst8_sse2(t);
3015 0 : fadst8_sse2(b);
3016 0 : row_8x16_rounding(in, 2);
3017 0 : idtx16_8col(in);
3018 0 : break;
3019 : #endif
3020 0 : default: assert(0); break;
3021 : }
3022 0 : write_buffer_8x8(output, t, 8);
3023 0 : write_buffer_8x8(output + 64, b, 8);
3024 0 : }
3025 :
3026 0 : static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
3027 : int stride, int flipud, int fliplr) {
3028 : // Load 2 8x8 blocks
3029 0 : const int16_t *l = input;
3030 0 : const int16_t *r = input + 8;
3031 :
3032 0 : if (fliplr) {
3033 0 : const int16_t *const tmp = l;
3034 0 : l = r;
3035 0 : r = tmp;
3036 : }
3037 :
3038 : // load first 8 columns
3039 0 : load_buffer_8x8(l, in, stride, flipud, fliplr);
3040 0 : scale_sqrt2_8x8(in);
3041 0 : load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
3042 0 : scale_sqrt2_8x8(in + 8);
3043 0 : }
3044 :
3045 : #define col_16x8_rounding row_8x16_rounding
3046 :
// Forward 16x8 hybrid transform (SSE2).
// The block is processed as two side-by-side 8x8 sub-blocks: each half
// gets an 8-point transform, the combined buffer is rounded, and then a
// single 16-point transform runs over all 16 columns. FLIPADST variants
// reuse the plain ADST kernels and instead flip the input on load
// (flipud flips rows, fliplr flips columns).
void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];

  __m128i *const l = in;      // Alias to left 8x8 sub block
  __m128i *const r = in + 8;  // Alias to right 8x8 sub block, which we store
                              // in the second half of the array

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x8(input, in, stride, 0, 0);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case ADST_DCT:
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case DCT_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case ADST_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
#if CONFIG_EXT_TX
    // Extended transform set: FLIPADST cases flip the input on load and
    // then run a regular ADST; V_*/H_* cases pair a real transform in one
    // direction with the identity (fidtx/idtx) in the other.
    case FLIPADST_DCT:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case DCT_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x8(input, in, stride, 1, 1);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case ADST_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case FLIPADST_ADST:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case IDTX:
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case V_DCT:
      load_buffer_16x8(input, in, stride, 0, 0);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case H_DCT:
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case V_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case H_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case V_FLIPADST:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case H_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
#endif
    default: assert(0); break;
  }
  // Transpose each 8x8 half back to row order and write the two halves
  // side by side with an output row stride of 16.
  array_transpose_8x8(l, l);
  array_transpose_8x8(r, r);
  write_buffer_8x8(output, l, 16);
  write_buffer_8x8(output + 8, r, 16);
}
3177 :
// Note: The 16-column 32-element transforms expect their input to be
// split up into a 2x2 grid of 8x16 blocks
//
// Runs the 32-element DCT on each 8-column group (top/bottom pairs are
// processed together by fdct32_8col), then transposes the two 16x16
// halves so the result is laid out for the following pass.
static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                __m128i *br) {
  fdct32_8col(tl, bl);
  fdct32_8col(tr, br);
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
}
3187 :
3188 : #if CONFIG_EXT_TX
static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                 __m128i *br) {
  // Identity transform over the 2x2 grid of 8x16 blocks: scale every
  // 16-bit lane by 4 (left shift by 2), then transpose the two 16x16
  // halves just like the other 32-element column transforms.
  __m128i *const blocks[4] = { tl, tr, bl, br };
  int b, i;
  for (b = 0; b < 4; ++b) {
    __m128i *const blk = blocks[b];
    for (i = 0; i < 16; ++i) blk[i] = _mm_slli_epi16(blk[i], 2);
  }
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
}
3201 : #endif
3202 :
3203 0 : static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
3204 : __m128i *intr, __m128i *inbl,
3205 : __m128i *inbr, int stride, int flipud,
3206 : int fliplr) {
3207 : int i;
3208 0 : if (flipud) {
3209 0 : input = input + 31 * stride;
3210 0 : stride = -stride;
3211 : }
3212 :
3213 0 : for (i = 0; i < 16; ++i) {
3214 0 : intl[i] = _mm_slli_epi16(
3215 0 : _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
3216 0 : intr[i] = _mm_slli_epi16(
3217 0 : _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
3218 0 : inbl[i] = _mm_slli_epi16(
3219 0 : _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
3220 0 : inbr[i] = _mm_slli_epi16(
3221 0 : _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
3222 : }
3223 :
3224 0 : if (fliplr) {
3225 : __m128i tmp;
3226 0 : for (i = 0; i < 16; ++i) {
3227 0 : tmp = intl[i];
3228 0 : intl[i] = mm_reverse_epi16(intr[i]);
3229 0 : intr[i] = mm_reverse_epi16(tmp);
3230 0 : tmp = inbl[i];
3231 0 : inbl[i] = mm_reverse_epi16(inbr[i]);
3232 0 : inbr[i] = mm_reverse_epi16(tmp);
3233 : }
3234 : }
3235 :
3236 0 : scale_sqrt2_8x16(intl);
3237 0 : scale_sqrt2_8x16(intr);
3238 0 : scale_sqrt2_8x16(inbl);
3239 0 : scale_sqrt2_8x16(inbr);
3240 0 : }
3241 :
3242 0 : static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
3243 : __m128i *restr, __m128i *resbl,
3244 : __m128i *resbr) {
3245 : int i;
3246 0 : for (i = 0; i < 16; ++i) {
3247 0 : store_output(&restl[i], output + i * 16 + 0);
3248 0 : store_output(&restr[i], output + i * 16 + 8);
3249 0 : store_output(&resbl[i], output + (i + 16) * 16 + 0);
3250 0 : store_output(&resbr[i], output + (i + 16) * 16 + 8);
3251 : }
3252 0 : }
3253 :
static INLINE void round_signed_8x8(__m128i *in, const int bit) {
  // Round each 16-bit lane to the nearest integer, ties away from zero:
  // add half (2^(bit-1)), add the sign mask (all-ones, i.e. -1, for
  // negative lanes), then arithmetic shift right by `bit`.
  const __m128i half = _mm_set1_epi16((1 << bit) >> 1);
  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i sign = _mm_srai_epi16(in[i], 15);
    const __m128i biased = _mm_add_epi16(_mm_add_epi16(in[i], half), sign);
    in[i] = _mm_srai_epi16(biased, bit);
  }
}
3283 :
static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
  // Apply a shift-by-4 signed rounding to all four 8x8 quadrants of the
  // 16x16 block (each 16-vector array holds two stacked 8x8 blocks).
  const int shift = 4;
  __m128i *const quads[4] = { in0, in0 + 8, in1, in1 + 8 };
  int q;
  for (q = 0; q < 4; ++q) round_signed_8x8(quads[q], shift);
}
3291 :
// Note:
// suffix "t" indicates the transpose operation comes first
//
// 16-point DCT on each 8-column half, after transposing the 16x16 block.
static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  fdct16_8col(in0);
  fdct16_8col(in1);
}
3299 :
// 16-point ADST on each 8-column half, after transposing the 16x16 block
// (transpose-first variant, see the "t"-suffix note above fdct16t_sse2).
static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  fadst16_8col(in0);
  fadst16_8col(in1);
}
3305 :
// Transpose-first variant of fdct32_16col: the two 16x16 halves are
// transposed before the 32-element DCTs run, instead of after.
static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                 __m128i *br) {
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
  fdct32_8col(tl, bl);
  fdct32_8col(tr, br);
}
3313 :
// Tells fhalfright32_16col whether its final 16-point DCT should transpose
// its input first (fdct16t_sse2) or not (fdct16_sse2).
typedef enum transpose_indicator_ {
  transpose,
  no_transpose,
} transpose_indicator;
3318 :
3319 0 : static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
3320 : __m128i *br, transpose_indicator t) {
3321 : __m128i tmpl[16], tmpr[16];
3322 : int i;
3323 :
3324 : // Copy the bottom half of the input to temporary storage
3325 0 : for (i = 0; i < 16; ++i) {
3326 0 : tmpl[i] = bl[i];
3327 0 : tmpr[i] = br[i];
3328 : }
3329 :
3330 : // Generate the bottom half of the output
3331 0 : for (i = 0; i < 16; ++i) {
3332 0 : bl[i] = _mm_slli_epi16(tl[i], 2);
3333 0 : br[i] = _mm_slli_epi16(tr[i], 2);
3334 : }
3335 0 : array_transpose_16x16(bl, br);
3336 :
3337 : // Copy the temporary storage back to the top half of the input
3338 0 : for (i = 0; i < 16; ++i) {
3339 0 : tl[i] = tmpl[i];
3340 0 : tr[i] = tmpr[i];
3341 : }
3342 :
3343 : // Generate the top half of the output
3344 0 : scale_sqrt2_8x16(tl);
3345 0 : scale_sqrt2_8x16(tr);
3346 0 : if (t == transpose)
3347 0 : fdct16t_sse2(tl, tr);
3348 : else
3349 0 : fdct16_sse2(tl, tr);
3350 0 : }
3351 :
3352 : // Note on data layout, for both this and the 32x16 transforms:
3353 : // So that we can reuse the 16-element transforms easily,
3354 : // we want to split the input into 8x16 blocks.
3355 : // For 16x32, this means the input is a 2x2 grid of such blocks.
3356 : // For 32x16, it means the input is a 4x1 grid.
// Forward 16x32 hybrid transform (SSE2).
// The input is split into a 2x2 grid of 8x16 blocks (see the layout note
// above). Every tx_type follows the same shape: load (with flips for the
// FLIPADST variants), a 16-point pass on each 16x16 half, intermediate
// signed rounding, then a 32-element pass over the whole block. The
// "t"-suffixed 16-point kernels transpose before transforming.
void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i intl[16], intr[16], inbl[16], inbr[16];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case ADST_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case DCT_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case ADST_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
#if CONFIG_EXT_TX
    // FLIPADST variants flip the input on load (flipud/fliplr) and run
    // the regular kernels; V_*/H_* pair a real transform in one direction
    // with the identity (fidtx) in the other.
    case FLIPADST_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case DCT_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case ADST_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case FLIPADST_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case IDTX:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
    case V_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case H_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
    case V_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case H_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
    case V_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case H_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
#endif
    default: assert(0); break;
  }
  write_buffer_16x32(output, intl, intr, inbl, inbr);
}
3496 :
3497 0 : static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
3498 : __m128i *in1, __m128i *in2, __m128i *in3,
3499 : int stride, int flipud, int fliplr) {
3500 : int i;
3501 0 : if (flipud) {
3502 0 : input += 15 * stride;
3503 0 : stride = -stride;
3504 : }
3505 :
3506 0 : for (i = 0; i < 16; ++i) {
3507 0 : in0[i] = _mm_slli_epi16(
3508 0 : _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
3509 0 : in1[i] = _mm_slli_epi16(
3510 0 : _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
3511 0 : in2[i] = _mm_slli_epi16(
3512 0 : _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
3513 0 : in3[i] = _mm_slli_epi16(
3514 0 : _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
3515 : }
3516 :
3517 0 : if (fliplr) {
3518 0 : for (i = 0; i < 16; ++i) {
3519 0 : __m128i tmp1 = in0[i];
3520 0 : __m128i tmp2 = in1[i];
3521 0 : in0[i] = mm_reverse_epi16(in3[i]);
3522 0 : in1[i] = mm_reverse_epi16(in2[i]);
3523 0 : in2[i] = mm_reverse_epi16(tmp2);
3524 0 : in3[i] = mm_reverse_epi16(tmp1);
3525 : }
3526 : }
3527 :
3528 0 : scale_sqrt2_8x16(in0);
3529 0 : scale_sqrt2_8x16(in1);
3530 0 : scale_sqrt2_8x16(in2);
3531 0 : scale_sqrt2_8x16(in3);
3532 0 : }
3533 :
3534 0 : static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
3535 : __m128i *res1, __m128i *res2,
3536 : __m128i *res3) {
3537 : int i;
3538 0 : for (i = 0; i < 16; ++i) {
3539 0 : store_output(&res0[i], output + i * 32 + 0);
3540 0 : store_output(&res1[i], output + i * 32 + 8);
3541 0 : store_output(&res2[i], output + i * 32 + 16);
3542 0 : store_output(&res3[i], output + i * 32 + 24);
3543 : }
3544 0 : }
3545 :
3546 0 : void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
3547 : int tx_type) {
3548 : __m128i in0[16], in1[16], in2[16], in3[16];
3549 :
3550 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
3551 0 : switch (tx_type) {
3552 : case DCT_DCT:
3553 0 : fdct16_sse2(in0, in1);
3554 0 : fdct16_sse2(in2, in3);
3555 0 : round_signed_16x16(in0, in1);
3556 0 : round_signed_16x16(in2, in3);
3557 0 : fdct32_16col(in0, in1, in2, in3);
3558 0 : break;
3559 : case ADST_DCT:
3560 0 : fadst16_sse2(in0, in1);
3561 0 : fadst16_sse2(in2, in3);
3562 0 : round_signed_16x16(in0, in1);
3563 0 : round_signed_16x16(in2, in3);
3564 0 : fdct32_16col(in0, in1, in2, in3);
3565 0 : break;
3566 : case DCT_ADST:
3567 0 : fdct16_sse2(in0, in1);
3568 0 : fdct16_sse2(in2, in3);
3569 0 : round_signed_16x16(in0, in1);
3570 0 : round_signed_16x16(in2, in3);
3571 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3572 0 : break;
3573 : case ADST_ADST:
3574 0 : fadst16_sse2(in0, in1);
3575 0 : fadst16_sse2(in2, in3);
3576 0 : round_signed_16x16(in0, in1);
3577 0 : round_signed_16x16(in2, in3);
3578 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3579 0 : break;
3580 : #if CONFIG_EXT_TX
3581 : case FLIPADST_DCT:
3582 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
3583 0 : fadst16_sse2(in0, in1);
3584 0 : fadst16_sse2(in2, in3);
3585 0 : round_signed_16x16(in0, in1);
3586 0 : round_signed_16x16(in2, in3);
3587 0 : fdct32_16col(in0, in1, in2, in3);
3588 0 : break;
3589 : case DCT_FLIPADST:
3590 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
3591 0 : fdct16_sse2(in0, in1);
3592 0 : fdct16_sse2(in2, in3);
3593 0 : round_signed_16x16(in0, in1);
3594 0 : round_signed_16x16(in2, in3);
3595 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3596 0 : break;
3597 : case FLIPADST_FLIPADST:
3598 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
3599 0 : fadst16_sse2(in0, in1);
3600 0 : fadst16_sse2(in2, in3);
3601 0 : round_signed_16x16(in0, in1);
3602 0 : round_signed_16x16(in2, in3);
3603 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3604 0 : break;
3605 : case ADST_FLIPADST:
3606 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
3607 0 : fadst16_sse2(in0, in1);
3608 0 : fadst16_sse2(in2, in3);
3609 0 : round_signed_16x16(in0, in1);
3610 0 : round_signed_16x16(in2, in3);
3611 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3612 0 : break;
3613 : case FLIPADST_ADST:
3614 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
3615 0 : fadst16_sse2(in0, in1);
3616 0 : fadst16_sse2(in2, in3);
3617 0 : round_signed_16x16(in0, in1);
3618 0 : round_signed_16x16(in2, in3);
3619 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3620 0 : break;
3621 : case IDTX:
3622 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
3623 0 : fidtx16_sse2(in0, in1);
3624 0 : fidtx16_sse2(in2, in3);
3625 0 : round_signed_16x16(in0, in1);
3626 0 : round_signed_16x16(in2, in3);
3627 0 : fidtx32_16col(in0, in1, in2, in3);
3628 0 : break;
3629 : case V_DCT:
3630 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
3631 0 : fdct16_sse2(in0, in1);
3632 0 : fdct16_sse2(in2, in3);
3633 0 : round_signed_16x16(in0, in1);
3634 0 : round_signed_16x16(in2, in3);
3635 0 : fidtx32_16col(in0, in1, in2, in3);
3636 0 : break;
3637 : case H_DCT:
3638 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
3639 0 : fidtx16_sse2(in0, in1);
3640 0 : fidtx16_sse2(in2, in3);
3641 0 : round_signed_16x16(in0, in1);
3642 0 : round_signed_16x16(in2, in3);
3643 0 : fdct32_16col(in0, in1, in2, in3);
3644 0 : break;
3645 : case V_ADST:
3646 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
3647 0 : fadst16_sse2(in0, in1);
3648 0 : fadst16_sse2(in2, in3);
3649 0 : round_signed_16x16(in0, in1);
3650 0 : round_signed_16x16(in2, in3);
3651 0 : fidtx32_16col(in0, in1, in2, in3);
3652 0 : break;
3653 : case H_ADST:
3654 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
3655 0 : fidtx16_sse2(in0, in1);
3656 0 : fidtx16_sse2(in2, in3);
3657 0 : round_signed_16x16(in0, in1);
3658 0 : round_signed_16x16(in2, in3);
3659 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3660 0 : break;
3661 : case V_FLIPADST:
3662 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
3663 0 : fadst16_sse2(in0, in1);
3664 0 : fadst16_sse2(in2, in3);
3665 0 : round_signed_16x16(in0, in1);
3666 0 : round_signed_16x16(in2, in3);
3667 0 : fidtx32_16col(in0, in1, in2, in3);
3668 0 : break;
3669 : case H_FLIPADST:
3670 0 : load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
3671 0 : fidtx16_sse2(in0, in1);
3672 0 : fidtx16_sse2(in2, in3);
3673 0 : round_signed_16x16(in0, in1);
3674 0 : round_signed_16x16(in2, in3);
3675 0 : fhalfright32_16col(in0, in1, in2, in3, no_transpose);
3676 0 : break;
3677 : #endif
3678 0 : default: assert(0); break;
3679 : }
3680 0 : write_buffer_32x16(output, in0, in1, in2, in3);
3681 0 : }
3682 :
3683 : // Note:
3684 : // 32x32 hybrid fwd txfm
3685 : // 4x2 grids of 8x16 block. Each block is represented by __m128i in[16]
3686 0 : static INLINE void load_buffer_32x32(const int16_t *input,
3687 : __m128i *in0 /*in0[32]*/,
3688 : __m128i *in1 /*in1[32]*/,
3689 : __m128i *in2 /*in2[32]*/,
3690 : __m128i *in3 /*in3[32]*/, int stride,
3691 : int flipud, int fliplr) {
3692 0 : if (flipud) {
3693 0 : input += 31 * stride;
3694 0 : stride = -stride;
3695 : }
3696 :
3697 : int i;
3698 0 : for (i = 0; i < 32; ++i) {
3699 0 : in0[i] = _mm_slli_epi16(
3700 0 : _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
3701 0 : in1[i] = _mm_slli_epi16(
3702 0 : _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
3703 0 : in2[i] = _mm_slli_epi16(
3704 0 : _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
3705 0 : in3[i] = _mm_slli_epi16(
3706 0 : _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
3707 : }
3708 :
3709 0 : if (fliplr) {
3710 0 : for (i = 0; i < 32; ++i) {
3711 0 : __m128i tmp1 = in0[i];
3712 0 : __m128i tmp2 = in1[i];
3713 0 : in0[i] = mm_reverse_epi16(in3[i]);
3714 0 : in1[i] = mm_reverse_epi16(in2[i]);
3715 0 : in2[i] = mm_reverse_epi16(tmp2);
3716 0 : in3[i] = mm_reverse_epi16(tmp1);
3717 : }
3718 : }
3719 0 : }
3720 :
static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
                              __m128i *b0r /*b0r[16]*/,
                              __m128i *b1l /*b1l[16]*/,
                              __m128i *b1r /*b1r[16]*/) {
  // Exchange the contents of two 16x16 blocks, each stored as a left and
  // a right column of 16 vectors.
  int k;
  for (k = 0; k < 16; ++k) {
    const __m128i left = b0l[k];
    const __m128i right = b0r[k];
    b0l[k] = b1l[k];
    b0r[k] = b1r[k];
    b1l[k] = left;
    b1r[k] = right;
  }
}
3735 :
// Full 32x32 forward DCT: runs the 32-element DCT on each 8-column group
// (each group's top and bottom 16 vectors are passed together), then
// transposes the four 16x16 tiles and swaps the two off-diagonal tiles —
// together a block-level transpose of the 32x32 result.
static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
                          __m128i *in3) {
  fdct32_8col(in0, &in0[16]);
  fdct32_8col(in1, &in1[16]);
  fdct32_8col(in2, &in2[16]);
  fdct32_8col(in3, &in3[16]);

  array_transpose_16x16(in0, in1);
  array_transpose_16x16(&in0[16], &in1[16]);
  array_transpose_16x16(in2, in3);
  array_transpose_16x16(&in2[16], &in3[16]);

  swap_16x16(&in0[16], &in1[16], in2, in3);
}
3750 :
// 32-element half-right transform applied to each 16-column half of the
// 32x32 block, followed by the same off-diagonal tile swap fdct32 uses to
// finish the block-level transpose.
static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
                                __m128i *in3) {
  fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose);
  fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose);
  swap_16x16(&in0[16], &in1[16], in2, in3);
}
3757 :
3758 : #if CONFIG_EXT_TX
// 32-element identity transform (scale + transpose inside fidtx32_16col)
// applied to each 16-column half, followed by the same off-diagonal tile
// swap fdct32 uses to finish the block-level transpose.
static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
                           __m128i *in3) {
  fidtx32_16col(in0, in1, &in0[16], &in1[16]);
  fidtx32_16col(in2, in3, &in2[16], &in3[16]);
  swap_16x16(&in0[16], &in1[16], in2, in3);
}
3765 : #endif
3766 :
static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
                                      __m128i *in3) {
  // Round all four 16x16 quadrants of the 32x32 block (each 32-vector
  // array holds a top and a bottom 16-vector half).
  __m128i *const lhs[4] = { in0, in0 + 16, in2, in2 + 16 };
  __m128i *const rhs[4] = { in1, in1 + 16, in3, in3 + 16 };
  int q;
  for (q = 0; q < 4; ++q) round_signed_16x16(lhs[q], rhs[q]);
}
3774 :
3775 0 : static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
3776 : __m128i *in3, tran_low_t *output) {
3777 : int i;
3778 0 : for (i = 0; i < 32; ++i) {
3779 0 : store_output(&in0[i], output + i * 32 + 0);
3780 0 : store_output(&in1[i], output + i * 32 + 8);
3781 0 : store_output(&in2[i], output + i * 32 + 16);
3782 0 : store_output(&in3[i], output + i * 32 + 24);
3783 : }
3784 0 : }
3785 :
// Forward 32x32 hybrid transform (SSE2).
// The input is held as a 4x2 grid of 8x16 blocks (see the note above
// load_buffer_32x32). Each tx_type runs one 32-element pass, rounds the
// intermediate, then runs the second 32-element pass. The buffer is
// loaded once with no flips; only the FLIPADST variants re-load, because
// their flipud/fliplr flags change what is read.
void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[32], in1[32], in2[32], in3[32];

  load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
  switch (tx_type) {
    case DCT_DCT:
      fdct32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fdct32(in0, in1, in2, in3);
      break;
    case ADST_DCT:
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fdct32(in0, in1, in2, in3);
      break;
    case DCT_ADST:
      fdct32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
    case ADST_ADST:
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
#if CONFIG_EXT_TX
    // FLIPADST variants re-load with flip flags, overwriting the
    // unflipped data loaded above.
    case FLIPADST_DCT:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fdct32(in0, in1, in2, in3);
      break;
    case DCT_FLIPADST:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
      fdct32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1);
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
    case ADST_FLIPADST:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
    case FLIPADST_ADST:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
    // Non-flipped EXT_TX cases use the data already loaded above.
    case IDTX:
      fidtx32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fidtx32(in0, in1, in2, in3);
      break;
    case V_DCT:
      fdct32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fidtx32(in0, in1, in2, in3);
      break;
    case H_DCT:
      fidtx32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fdct32(in0, in1, in2, in3);
      break;
    case V_ADST:
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fidtx32(in0, in1, in2, in3);
      break;
    case H_ADST:
      fidtx32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
    case V_FLIPADST:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
      fhalfright32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fidtx32(in0, in1, in2, in3);
      break;
    case H_FLIPADST:
      load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
      fidtx32(in0, in1, in2, in3);
      round_signed_32x32(in0, in1, in2, in3);
      fhalfright32(in0, in1, in2, in3);
      break;
#endif
    default: assert(0);
  }
  write_buffer_32x32(in0, in1, in2, in3, output);
}
|