Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 : #include <assert.h>
12 : #include <smmintrin.h> /* SSE4.1 */
13 :
14 : #include "./av1_rtcd.h"
15 : #include "./aom_config.h"
16 : #include "av1/common/av1_fwd_txfm1d_cfg.h"
17 : #include "av1/common/av1_txfm.h"
18 : #include "av1/common/x86/highbd_txfm_utility_sse4.h"
19 : #include "aom_dsp/txfm_common.h"
20 : #include "aom_dsp/x86/txfm_common_sse2.h"
21 : #include "aom_ports/mem.h"
22 :
23 0 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
24 : int stride, int flipud, int fliplr,
25 : int shift) {
26 0 : if (!flipud) {
27 0 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
28 0 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
29 0 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
30 0 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
31 : } else {
32 0 : in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
33 0 : in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
34 0 : in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
35 0 : in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
36 : }
37 :
38 0 : if (fliplr) {
39 0 : in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
40 0 : in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
41 0 : in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
42 0 : in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
43 : }
44 :
45 0 : in[0] = _mm_cvtepi16_epi32(in[0]);
46 0 : in[1] = _mm_cvtepi16_epi32(in[1]);
47 0 : in[2] = _mm_cvtepi16_epi32(in[2]);
48 0 : in[3] = _mm_cvtepi16_epi32(in[3]);
49 :
50 0 : in[0] = _mm_slli_epi32(in[0], shift);
51 0 : in[1] = _mm_slli_epi32(in[1], shift);
52 0 : in[2] = _mm_slli_epi32(in[2], shift);
53 0 : in[3] = _mm_slli_epi32(in[3], shift);
54 0 : }
55 :
// Note:
// Only the stage-2 cos_bit is used by this 4x4 transform;
// shift[0] is used in load_buffer_4x4()
// shift[1] is used in txfm_func_col()
// shift[2] is used in txfm_func_row()
static void fdct4x4_sse4_1(__m128i *in, int bit) {
  // Forward 4-point DCT over four vectors of 32-bit lanes, followed by
  // a 4x4 transpose so the same routine serves as both the column and
  // the row pass.  Every butterfly product is rounded with
  // 1 << (bit - 1) and arithmetic-shifted right by 'bit'.
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i s0, s1, s2, s3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  // Stage 1: butterflies of the input rows.
  s0 = _mm_add_epi32(in[0], in[3]);
  s1 = _mm_add_epi32(in[1], in[2]);
  s2 = _mm_sub_epi32(in[1], in[2]);
  s3 = _mm_sub_epi32(in[0], in[3]);

  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
  u0 = _mm_mullo_epi32(s0, cospi32);
  u1 = _mm_mullo_epi32(s1, cospi32);
  u2 = _mm_add_epi32(u0, u1);
  v0 = _mm_sub_epi32(u0, u1);

  u3 = _mm_add_epi32(u2, rnding);
  v1 = _mm_add_epi32(v0, rnding);

  // u0 = output 0 (DC), u2 = output 2.
  u0 = _mm_srai_epi32(u3, bit);
  u2 = _mm_srai_epi32(v1, bit);

  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
  v0 = _mm_mullo_epi32(s2, cospi48);
  v1 = _mm_mullo_epi32(s3, cospi16);
  v2 = _mm_add_epi32(v0, v1);

  v3 = _mm_add_epi32(v2, rnding);
  u1 = _mm_srai_epi32(v3, bit);

  v0 = _mm_mullo_epi32(s2, cospi16);
  v1 = _mm_mullo_epi32(s3, cospi48);
  v2 = _mm_sub_epi32(v1, v0);

  v3 = _mm_add_epi32(v2, rnding);
  u3 = _mm_srai_epi32(v3, bit);

  // Note: shift[1] and shift[2] are zeros

  // Transpose 4x4 32-bit
  v0 = _mm_unpacklo_epi32(u0, u1);
  v1 = _mm_unpackhi_epi32(u0, u1);
  v2 = _mm_unpacklo_epi32(u2, u3);
  v3 = _mm_unpackhi_epi32(u2, u3);

  in[0] = _mm_unpacklo_epi64(v0, v2);
  in[1] = _mm_unpackhi_epi64(v0, v2);
  in[2] = _mm_unpacklo_epi64(v1, v3);
  in[3] = _mm_unpackhi_epi64(v1, v3);
}
115 :
116 0 : static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
117 0 : _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
118 0 : _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
119 0 : _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
120 0 : _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
121 0 : }
122 :
// Note:
// The real implementation is av1_fwd_txfm2d_4x4_sse4_1().  This symbol
// is kept only because av1_highbd_fht4x4_c() has not been removed yet;
// it must never be reached at runtime, hence the assert(0).
void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
                              int stride, int tx_type) {
  // Silence unused-parameter warnings for the dead stub.
  (void)input;
  (void)output;
  (void)stride;
  (void)tx_type;
  assert(0);
}
134 :
static void fadst4x4_sse4_1(__m128i *in, int bit) {
  // Forward 4-point ADST over four vectors of 32-bit lanes with cos_bit
  // precision 'bit', transposed on output (mirrors fdct4x4_sse4_1 so it
  // can be used for either the column or the row pass).  Each butterfly
  // product is rounded by 1 << (bit - 1) and shifted right by 'bit'.
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  __m128i s0, s1, s2, s3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  // stage 0
  // stage 1
  // stage 2
  u0 = _mm_mullo_epi32(in[3], cospi8);
  u1 = _mm_mullo_epi32(in[0], cospi56);
  u2 = _mm_add_epi32(u0, u1);
  s0 = _mm_add_epi32(u2, rnding);
  s0 = _mm_srai_epi32(s0, bit);

  v0 = _mm_mullo_epi32(in[3], cospi56);
  v1 = _mm_mullo_epi32(in[0], cospi8);
  v2 = _mm_sub_epi32(v0, v1);
  s1 = _mm_add_epi32(v2, rnding);
  s1 = _mm_srai_epi32(s1, bit);

  u0 = _mm_mullo_epi32(in[1], cospi40);
  u1 = _mm_mullo_epi32(in[2], cospi24);
  u2 = _mm_add_epi32(u0, u1);
  s2 = _mm_add_epi32(u2, rnding);
  s2 = _mm_srai_epi32(s2, bit);

  v0 = _mm_mullo_epi32(in[1], cospi24);
  v1 = _mm_mullo_epi32(in[2], cospi40);
  v2 = _mm_sub_epi32(v0, v1);
  s3 = _mm_add_epi32(v2, rnding);
  s3 = _mm_srai_epi32(s3, bit);

  // stage 3
  u0 = _mm_add_epi32(s0, s2);
  u2 = _mm_sub_epi32(s0, s2);
  u1 = _mm_add_epi32(s1, s3);
  u3 = _mm_sub_epi32(s1, s3);

  // stage 4: rotate (u2, u3) by cospi32 with rounding.
  v0 = _mm_mullo_epi32(u2, cospi32);
  v1 = _mm_mullo_epi32(u3, cospi32);
  v2 = _mm_add_epi32(v0, v1);
  s2 = _mm_add_epi32(v2, rnding);
  u2 = _mm_srai_epi32(s2, bit);

  v2 = _mm_sub_epi32(v0, v1);
  s3 = _mm_add_epi32(v2, rnding);
  u3 = _mm_srai_epi32(s3, bit);

  // u0, u1, u2, u3
  // Negate u1 and u2 to realize the final ADST sign pattern.
  u2 = _mm_sub_epi32(kZero, u2);
  u1 = _mm_sub_epi32(kZero, u1);

  // Output order is u0, u2, u3, u1.
  // Transpose 4x4 32-bit
  v0 = _mm_unpacklo_epi32(u0, u2);
  v1 = _mm_unpackhi_epi32(u0, u2);
  v2 = _mm_unpacklo_epi32(u3, u1);
  v3 = _mm_unpackhi_epi32(u3, u1);

  in[0] = _mm_unpacklo_epi64(v0, v2);
  in[1] = _mm_unpackhi_epi64(v0, v2);
  in[2] = _mm_unpacklo_epi64(v1, v3);
  in[3] = _mm_unpackhi_epi64(v1, v3);
}
208 :
// 2-D forward transform of a 4x4 high-bitdepth residual block.
// For each tx_type: load (and possibly flip) the input scaled by
// row_cfg->shift[0], run the column 1-D transform, then the row 1-D
// transform (each 1-D routine transposes its result, so calling the
// two in sequence yields the row-major 2-D output), and store the
// 32-bit coefficients.  'bd' is unused: the integer pipeline here is
// the same for all supported bit depths.
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                               int input_stride, int tx_type, int bd) {
  __m128i in[4];
  const TXFM_1D_CFG *row_cfg = NULL;
  const TXFM_1D_CFG *col_cfg = NULL;

  switch (tx_type) {
    case DCT_DCT:
      row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
      col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
      fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_DCT:
      row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_ADST:
      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
      col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
      fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_ADST:
      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
#if CONFIG_EXT_TX
    // FLIPADST variants reuse the ADST kernel; the flip is realized by
    // the flipud/fliplr flags passed to load_buffer_4x4().
    case FLIPADST_DCT:
      row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
      load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_FLIPADST:
      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
      col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
      load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
      fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_FLIPADST:
      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
      load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]);
      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_FLIPADST:
      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
      load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_ADST:
      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
      load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
      write_buffer_4x4(in, coeff);
      break;
#endif
    // Unsupported tx_type for this kernel.
    default: assert(0);
  }
  (void)bd;
}
294 :
// Load an 8x8 block of 16-bit residuals, optionally flipped vertically
// (flipud) and/or horizontally (fliplr), then sign-extend every sample
// to 32 bits and pre-scale by 'shift'.  On return in[0..15] holds two
// 4-lane vectors per row, rows in order.
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
  __m128i u;
  if (!flipud) {
    in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
  } else {
    // Vertical flip: load the rows in reverse order.
    in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
    in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  }

  // Horizontal flip: reverse the eight 16-bit lanes of each row.
  if (fliplr) {
    in[0] = mm_reverse_epi16(in[0]);
    in[1] = mm_reverse_epi16(in[1]);
    in[2] = mm_reverse_epi16(in[2]);
    in[3] = mm_reverse_epi16(in[3]);
    in[4] = mm_reverse_epi16(in[4]);
    in[5] = mm_reverse_epi16(in[5]);
    in[6] = mm_reverse_epi16(in[6]);
    in[7] = mm_reverse_epi16(in[7]);
  }

  // Widen 16 -> 32 bits in place.  Rows 4-7 are expanded first into the
  // upper half in[8..15], then rows 3..0 are expanded downward, so no
  // source vector is overwritten before it has been read.
  u = _mm_unpackhi_epi64(in[4], in[4]);
  in[8] = _mm_cvtepi16_epi32(in[4]);
  in[9] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[5], in[5]);
  in[10] = _mm_cvtepi16_epi32(in[5]);
  in[11] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[6], in[6]);
  in[12] = _mm_cvtepi16_epi32(in[6]);
  in[13] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[7], in[7]);
  in[14] = _mm_cvtepi16_epi32(in[7]);
  in[15] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[3], in[3]);
  in[6] = _mm_cvtepi16_epi32(in[3]);
  in[7] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[2], in[2]);
  in[4] = _mm_cvtepi16_epi32(in[2]);
  in[5] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[1], in[1]);
  in[2] = _mm_cvtepi16_epi32(in[1]);
  in[3] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[0], in[0]);
  in[0] = _mm_cvtepi16_epi32(in[0]);
  in[1] = _mm_cvtepi16_epi32(u);

  // Apply the forward-transform input shift to every vector.
  in[0] = _mm_slli_epi32(in[0], shift);
  in[1] = _mm_slli_epi32(in[1], shift);
  in[2] = _mm_slli_epi32(in[2], shift);
  in[3] = _mm_slli_epi32(in[3], shift);
  in[4] = _mm_slli_epi32(in[4], shift);
  in[5] = _mm_slli_epi32(in[5], shift);
  in[6] = _mm_slli_epi32(in[6], shift);
  in[7] = _mm_slli_epi32(in[7], shift);

  in[8] = _mm_slli_epi32(in[8], shift);
  in[9] = _mm_slli_epi32(in[9], shift);
  in[10] = _mm_slli_epi32(in[10], shift);
  in[11] = _mm_slli_epi32(in[11], shift);
  in[12] = _mm_slli_epi32(in[12], shift);
  in[13] = _mm_slli_epi32(in[13], shift);
  in[14] = _mm_slli_epi32(in[14], shift);
  in[15] = _mm_slli_epi32(in[15], shift);
}
380 :
381 0 : static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
382 0 : const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
383 :
384 0 : in[0] = _mm_add_epi32(in[0], rounding);
385 0 : in[1] = _mm_add_epi32(in[1], rounding);
386 0 : in[2] = _mm_add_epi32(in[2], rounding);
387 0 : in[3] = _mm_add_epi32(in[3], rounding);
388 0 : in[4] = _mm_add_epi32(in[4], rounding);
389 0 : in[5] = _mm_add_epi32(in[5], rounding);
390 0 : in[6] = _mm_add_epi32(in[6], rounding);
391 0 : in[7] = _mm_add_epi32(in[7], rounding);
392 0 : in[8] = _mm_add_epi32(in[8], rounding);
393 0 : in[9] = _mm_add_epi32(in[9], rounding);
394 0 : in[10] = _mm_add_epi32(in[10], rounding);
395 0 : in[11] = _mm_add_epi32(in[11], rounding);
396 0 : in[12] = _mm_add_epi32(in[12], rounding);
397 0 : in[13] = _mm_add_epi32(in[13], rounding);
398 0 : in[14] = _mm_add_epi32(in[14], rounding);
399 0 : in[15] = _mm_add_epi32(in[15], rounding);
400 :
401 0 : in[0] = _mm_srai_epi32(in[0], shift);
402 0 : in[1] = _mm_srai_epi32(in[1], shift);
403 0 : in[2] = _mm_srai_epi32(in[2], shift);
404 0 : in[3] = _mm_srai_epi32(in[3], shift);
405 0 : in[4] = _mm_srai_epi32(in[4], shift);
406 0 : in[5] = _mm_srai_epi32(in[5], shift);
407 0 : in[6] = _mm_srai_epi32(in[6], shift);
408 0 : in[7] = _mm_srai_epi32(in[7], shift);
409 0 : in[8] = _mm_srai_epi32(in[8], shift);
410 0 : in[9] = _mm_srai_epi32(in[9], shift);
411 0 : in[10] = _mm_srai_epi32(in[10], shift);
412 0 : in[11] = _mm_srai_epi32(in[11], shift);
413 0 : in[12] = _mm_srai_epi32(in[12], shift);
414 0 : in[13] = _mm_srai_epi32(in[13], shift);
415 0 : in[14] = _mm_srai_epi32(in[14], shift);
416 0 : in[15] = _mm_srai_epi32(in[15], shift);
417 0 : }
418 :
419 0 : static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
420 0 : _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
421 0 : _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
422 0 : _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
423 0 : _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
424 :
425 0 : _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
426 0 : _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
427 0 : _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
428 0 : _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
429 :
430 0 : _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
431 0 : _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
432 0 : _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
433 0 : _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
434 :
435 0 : _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
436 0 : _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
437 0 : _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
438 0 : _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
439 0 : }
440 :
// One 8-point forward DCT pass over an 8x8 block held in 16 vectors of
// 32-bit lanes.  in[] stores two 4-lane halves per row: even indices
// hold lanes 0-3, odd indices hold lanes 4-7; the two halves are run
// through identical butterflies.  Every rotation result is rounded by
// 1 << (bit - 1) and shifted right by 'bit' (the cos_bit precision).
// NOTE(review): outputs are written at a stride of 2 within each half
// (e.g. buf0[k] -> out[2k] / out[2k+1]) — presumably the layout the
// 8x8 caller's transpose expects; confirm against the caller, which is
// not visible in this chunk.
static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i u[8], v[8];

  // Even 8 points 0, 2, ..., 14
  // stage 0
  // stage 1
  u[0] = _mm_add_epi32(in[0], in[14]);
  v[7] = _mm_sub_epi32(in[0], in[14]);  // v[7]
  u[1] = _mm_add_epi32(in[2], in[12]);
  u[6] = _mm_sub_epi32(in[2], in[12]);
  u[2] = _mm_add_epi32(in[4], in[10]);
  u[5] = _mm_sub_epi32(in[4], in[10]);
  u[3] = _mm_add_epi32(in[6], in[8]);
  v[4] = _mm_sub_epi32(in[6], in[8]);  // v[4]

  // stage 2
  v[0] = _mm_add_epi32(u[0], u[3]);
  v[3] = _mm_sub_epi32(u[0], u[3]);
  v[1] = _mm_add_epi32(u[1], u[2]);
  v[2] = _mm_sub_epi32(u[1], u[2]);

  // Rotate (u[5], u[6]) by +/-cospi32 with rounding.
  v[5] = _mm_mullo_epi32(u[5], cospim32);
  v[6] = _mm_mullo_epi32(u[6], cospi32);
  v[5] = _mm_add_epi32(v[5], v[6]);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  u[0] = _mm_mullo_epi32(u[5], cospi32);
  v[6] = _mm_mullo_epi32(u[6], cospim32);
  v[6] = _mm_sub_epi32(u[0], v[6]);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  // stage 3
  // type 0
  v[0] = _mm_mullo_epi32(v[0], cospi32);
  v[1] = _mm_mullo_epi32(v[1], cospi32);
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_sub_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // type 1
  v[0] = _mm_mullo_epi32(v[2], cospi48);
  v[1] = _mm_mullo_epi32(v[3], cospi16);
  u[2] = _mm_add_epi32(v[0], v[1]);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  v[0] = _mm_mullo_epi32(v[2], cospi16);
  v[1] = _mm_mullo_epi32(v[3], cospi48);
  u[3] = _mm_sub_epi32(v[1], v[0]);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  u[4] = _mm_add_epi32(v[4], v[5]);
  u[5] = _mm_sub_epi32(v[4], v[5]);
  u[6] = _mm_sub_epi32(v[7], v[6]);
  u[7] = _mm_add_epi32(v[7], v[6]);

  // stage 4
  // stage 5
  v[0] = _mm_mullo_epi32(u[4], cospi56);
  v[1] = _mm_mullo_epi32(u[7], cospi8);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[2] = _mm_srai_epi32(v[0], bit);  // buf0[4]

  v[0] = _mm_mullo_epi32(u[4], cospi8);
  v[1] = _mm_mullo_epi32(u[7], cospi56);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[14] = _mm_srai_epi32(v[0], bit);  // buf0[7]

  v[0] = _mm_mullo_epi32(u[5], cospi24);
  v[1] = _mm_mullo_epi32(u[6], cospi40);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[10] = _mm_srai_epi32(v[0], bit);  // buf0[5]

  v[0] = _mm_mullo_epi32(u[5], cospi40);
  v[1] = _mm_mullo_epi32(u[6], cospi24);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[6] = _mm_srai_epi32(v[0], bit);  // buf0[6]

  out[0] = u[0];   // buf0[0]
  out[8] = u[1];   // buf0[1]
  out[4] = u[2];   // buf0[2]
  out[12] = u[3];  // buf0[3]

  // Odd 8 points: 1, 3, ..., 15
  // (Identical butterfly network to the even half above.)
  // stage 0
  // stage 1
  u[0] = _mm_add_epi32(in[1], in[15]);
  v[7] = _mm_sub_epi32(in[1], in[15]);  // v[7]
  u[1] = _mm_add_epi32(in[3], in[13]);
  u[6] = _mm_sub_epi32(in[3], in[13]);
  u[2] = _mm_add_epi32(in[5], in[11]);
  u[5] = _mm_sub_epi32(in[5], in[11]);
  u[3] = _mm_add_epi32(in[7], in[9]);
  v[4] = _mm_sub_epi32(in[7], in[9]);  // v[4]

  // stage 2
  v[0] = _mm_add_epi32(u[0], u[3]);
  v[3] = _mm_sub_epi32(u[0], u[3]);
  v[1] = _mm_add_epi32(u[1], u[2]);
  v[2] = _mm_sub_epi32(u[1], u[2]);

  v[5] = _mm_mullo_epi32(u[5], cospim32);
  v[6] = _mm_mullo_epi32(u[6], cospi32);
  v[5] = _mm_add_epi32(v[5], v[6]);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  u[0] = _mm_mullo_epi32(u[5], cospi32);
  v[6] = _mm_mullo_epi32(u[6], cospim32);
  v[6] = _mm_sub_epi32(u[0], v[6]);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  // stage 3
  // type 0
  v[0] = _mm_mullo_epi32(v[0], cospi32);
  v[1] = _mm_mullo_epi32(v[1], cospi32);
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_sub_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // type 1
  v[0] = _mm_mullo_epi32(v[2], cospi48);
  v[1] = _mm_mullo_epi32(v[3], cospi16);
  u[2] = _mm_add_epi32(v[0], v[1]);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  v[0] = _mm_mullo_epi32(v[2], cospi16);
  v[1] = _mm_mullo_epi32(v[3], cospi48);
  u[3] = _mm_sub_epi32(v[1], v[0]);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  u[4] = _mm_add_epi32(v[4], v[5]);
  u[5] = _mm_sub_epi32(v[4], v[5]);
  u[6] = _mm_sub_epi32(v[7], v[6]);
  u[7] = _mm_add_epi32(v[7], v[6]);

  // stage 4
  // stage 5
  v[0] = _mm_mullo_epi32(u[4], cospi56);
  v[1] = _mm_mullo_epi32(u[7], cospi8);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[3] = _mm_srai_epi32(v[0], bit);  // buf0[4]

  v[0] = _mm_mullo_epi32(u[4], cospi8);
  v[1] = _mm_mullo_epi32(u[7], cospi56);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[15] = _mm_srai_epi32(v[0], bit);  // buf0[7]

  v[0] = _mm_mullo_epi32(u[5], cospi24);
  v[1] = _mm_mullo_epi32(u[6], cospi40);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[11] = _mm_srai_epi32(v[0], bit);  // buf0[5]

  v[0] = _mm_mullo_epi32(u[5], cospi40);
  v[1] = _mm_mullo_epi32(u[6], cospi24);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[7] = _mm_srai_epi32(v[0], bit);  // buf0[6]

  out[1] = u[0];   // buf0[0]
  out[9] = u[1];   // buf0[1]
  out[5] = u[2];   // buf0[2]
  out[13] = u[3];  // buf0[3]
}
636 :
637 0 : static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
638 0 : const int32_t *cospi = cospi_arr(bit);
639 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
640 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
641 0 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
642 0 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
643 0 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
644 0 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
645 0 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
646 0 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
647 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
648 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
649 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
650 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
651 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
652 0 : const __m128i kZero = _mm_setzero_si128();
653 : __m128i u[8], v[8], x;
654 :
655 : // Even 8 points: 0, 2, ..., 14
656 : // stage 0
657 : // stage 1
658 : // stage 2
659 : // (1)
660 0 : u[0] = _mm_mullo_epi32(in[14], cospi4);
661 0 : x = _mm_mullo_epi32(in[0], cospi60);
662 0 : u[0] = _mm_add_epi32(u[0], x);
663 0 : u[0] = _mm_add_epi32(u[0], rnding);
664 0 : u[0] = _mm_srai_epi32(u[0], bit);
665 :
666 0 : u[1] = _mm_mullo_epi32(in[14], cospi60);
667 0 : x = _mm_mullo_epi32(in[0], cospi4);
668 0 : u[1] = _mm_sub_epi32(u[1], x);
669 0 : u[1] = _mm_add_epi32(u[1], rnding);
670 0 : u[1] = _mm_srai_epi32(u[1], bit);
671 :
672 : // (2)
673 0 : u[2] = _mm_mullo_epi32(in[10], cospi20);
674 0 : x = _mm_mullo_epi32(in[4], cospi44);
675 0 : u[2] = _mm_add_epi32(u[2], x);
676 0 : u[2] = _mm_add_epi32(u[2], rnding);
677 0 : u[2] = _mm_srai_epi32(u[2], bit);
678 :
679 0 : u[3] = _mm_mullo_epi32(in[10], cospi44);
680 0 : x = _mm_mullo_epi32(in[4], cospi20);
681 0 : u[3] = _mm_sub_epi32(u[3], x);
682 0 : u[3] = _mm_add_epi32(u[3], rnding);
683 0 : u[3] = _mm_srai_epi32(u[3], bit);
684 :
685 : // (3)
686 0 : u[4] = _mm_mullo_epi32(in[6], cospi36);
687 0 : x = _mm_mullo_epi32(in[8], cospi28);
688 0 : u[4] = _mm_add_epi32(u[4], x);
689 0 : u[4] = _mm_add_epi32(u[4], rnding);
690 0 : u[4] = _mm_srai_epi32(u[4], bit);
691 :
692 0 : u[5] = _mm_mullo_epi32(in[6], cospi28);
693 0 : x = _mm_mullo_epi32(in[8], cospi36);
694 0 : u[5] = _mm_sub_epi32(u[5], x);
695 0 : u[5] = _mm_add_epi32(u[5], rnding);
696 0 : u[5] = _mm_srai_epi32(u[5], bit);
697 :
698 : // (4)
699 0 : u[6] = _mm_mullo_epi32(in[2], cospi52);
700 0 : x = _mm_mullo_epi32(in[12], cospi12);
701 0 : u[6] = _mm_add_epi32(u[6], x);
702 0 : u[6] = _mm_add_epi32(u[6], rnding);
703 0 : u[6] = _mm_srai_epi32(u[6], bit);
704 :
705 0 : u[7] = _mm_mullo_epi32(in[2], cospi12);
706 0 : x = _mm_mullo_epi32(in[12], cospi52);
707 0 : u[7] = _mm_sub_epi32(u[7], x);
708 0 : u[7] = _mm_add_epi32(u[7], rnding);
709 0 : u[7] = _mm_srai_epi32(u[7], bit);
710 :
711 : // stage 3
712 0 : v[0] = _mm_add_epi32(u[0], u[4]);
713 0 : v[4] = _mm_sub_epi32(u[0], u[4]);
714 0 : v[1] = _mm_add_epi32(u[1], u[5]);
715 0 : v[5] = _mm_sub_epi32(u[1], u[5]);
716 0 : v[2] = _mm_add_epi32(u[2], u[6]);
717 0 : v[6] = _mm_sub_epi32(u[2], u[6]);
718 0 : v[3] = _mm_add_epi32(u[3], u[7]);
719 0 : v[7] = _mm_sub_epi32(u[3], u[7]);
720 :
721 : // stage 4
722 0 : u[0] = v[0];
723 0 : u[1] = v[1];
724 0 : u[2] = v[2];
725 0 : u[3] = v[3];
726 :
727 0 : u[4] = _mm_mullo_epi32(v[4], cospi16);
728 0 : x = _mm_mullo_epi32(v[5], cospi48);
729 0 : u[4] = _mm_add_epi32(u[4], x);
730 0 : u[4] = _mm_add_epi32(u[4], rnding);
731 0 : u[4] = _mm_srai_epi32(u[4], bit);
732 :
733 0 : u[5] = _mm_mullo_epi32(v[4], cospi48);
734 0 : x = _mm_mullo_epi32(v[5], cospi16);
735 0 : u[5] = _mm_sub_epi32(u[5], x);
736 0 : u[5] = _mm_add_epi32(u[5], rnding);
737 0 : u[5] = _mm_srai_epi32(u[5], bit);
738 :
739 0 : u[6] = _mm_mullo_epi32(v[6], cospim48);
740 0 : x = _mm_mullo_epi32(v[7], cospi16);
741 0 : u[6] = _mm_add_epi32(u[6], x);
742 0 : u[6] = _mm_add_epi32(u[6], rnding);
743 0 : u[6] = _mm_srai_epi32(u[6], bit);
744 :
745 0 : u[7] = _mm_mullo_epi32(v[6], cospi16);
746 0 : x = _mm_mullo_epi32(v[7], cospim48);
747 0 : u[7] = _mm_sub_epi32(u[7], x);
748 0 : u[7] = _mm_add_epi32(u[7], rnding);
749 0 : u[7] = _mm_srai_epi32(u[7], bit);
750 :
751 : // stage 5
752 0 : v[0] = _mm_add_epi32(u[0], u[2]);
753 0 : v[2] = _mm_sub_epi32(u[0], u[2]);
754 0 : v[1] = _mm_add_epi32(u[1], u[3]);
755 0 : v[3] = _mm_sub_epi32(u[1], u[3]);
756 0 : v[4] = _mm_add_epi32(u[4], u[6]);
757 0 : v[6] = _mm_sub_epi32(u[4], u[6]);
758 0 : v[5] = _mm_add_epi32(u[5], u[7]);
759 0 : v[7] = _mm_sub_epi32(u[5], u[7]);
760 :
761 : // stage 6
762 0 : u[0] = v[0];
763 0 : u[1] = v[1];
764 0 : u[4] = v[4];
765 0 : u[5] = v[5];
766 :
767 0 : v[0] = _mm_mullo_epi32(v[2], cospi32);
768 0 : x = _mm_mullo_epi32(v[3], cospi32);
769 0 : u[2] = _mm_add_epi32(v[0], x);
770 0 : u[2] = _mm_add_epi32(u[2], rnding);
771 0 : u[2] = _mm_srai_epi32(u[2], bit);
772 :
773 0 : u[3] = _mm_sub_epi32(v[0], x);
774 0 : u[3] = _mm_add_epi32(u[3], rnding);
775 0 : u[3] = _mm_srai_epi32(u[3], bit);
776 :
777 0 : v[0] = _mm_mullo_epi32(v[6], cospi32);
778 0 : x = _mm_mullo_epi32(v[7], cospi32);
779 0 : u[6] = _mm_add_epi32(v[0], x);
780 0 : u[6] = _mm_add_epi32(u[6], rnding);
781 0 : u[6] = _mm_srai_epi32(u[6], bit);
782 :
783 0 : u[7] = _mm_sub_epi32(v[0], x);
784 0 : u[7] = _mm_add_epi32(u[7], rnding);
785 0 : u[7] = _mm_srai_epi32(u[7], bit);
786 :
787 : // stage 7
788 0 : out[0] = u[0];
789 0 : out[2] = _mm_sub_epi32(kZero, u[4]);
790 0 : out[4] = u[6];
791 0 : out[6] = _mm_sub_epi32(kZero, u[2]);
792 0 : out[8] = u[3];
793 0 : out[10] = _mm_sub_epi32(kZero, u[7]);
794 0 : out[12] = u[5];
795 0 : out[14] = _mm_sub_epi32(kZero, u[1]);
796 :
797 : // Odd 8 points: 1, 3, ..., 15
798 : // stage 0
799 : // stage 1
800 : // stage 2
801 : // (1)
802 0 : u[0] = _mm_mullo_epi32(in[15], cospi4);
803 0 : x = _mm_mullo_epi32(in[1], cospi60);
804 0 : u[0] = _mm_add_epi32(u[0], x);
805 0 : u[0] = _mm_add_epi32(u[0], rnding);
806 0 : u[0] = _mm_srai_epi32(u[0], bit);
807 :
808 0 : u[1] = _mm_mullo_epi32(in[15], cospi60);
809 0 : x = _mm_mullo_epi32(in[1], cospi4);
810 0 : u[1] = _mm_sub_epi32(u[1], x);
811 0 : u[1] = _mm_add_epi32(u[1], rnding);
812 0 : u[1] = _mm_srai_epi32(u[1], bit);
813 :
814 : // (2)
815 0 : u[2] = _mm_mullo_epi32(in[11], cospi20);
816 0 : x = _mm_mullo_epi32(in[5], cospi44);
817 0 : u[2] = _mm_add_epi32(u[2], x);
818 0 : u[2] = _mm_add_epi32(u[2], rnding);
819 0 : u[2] = _mm_srai_epi32(u[2], bit);
820 :
821 0 : u[3] = _mm_mullo_epi32(in[11], cospi44);
822 0 : x = _mm_mullo_epi32(in[5], cospi20);
823 0 : u[3] = _mm_sub_epi32(u[3], x);
824 0 : u[3] = _mm_add_epi32(u[3], rnding);
825 0 : u[3] = _mm_srai_epi32(u[3], bit);
826 :
827 : // (3)
828 0 : u[4] = _mm_mullo_epi32(in[7], cospi36);
829 0 : x = _mm_mullo_epi32(in[9], cospi28);
830 0 : u[4] = _mm_add_epi32(u[4], x);
831 0 : u[4] = _mm_add_epi32(u[4], rnding);
832 0 : u[4] = _mm_srai_epi32(u[4], bit);
833 :
834 0 : u[5] = _mm_mullo_epi32(in[7], cospi28);
835 0 : x = _mm_mullo_epi32(in[9], cospi36);
836 0 : u[5] = _mm_sub_epi32(u[5], x);
837 0 : u[5] = _mm_add_epi32(u[5], rnding);
838 0 : u[5] = _mm_srai_epi32(u[5], bit);
839 :
840 : // (4)
841 0 : u[6] = _mm_mullo_epi32(in[3], cospi52);
842 0 : x = _mm_mullo_epi32(in[13], cospi12);
843 0 : u[6] = _mm_add_epi32(u[6], x);
844 0 : u[6] = _mm_add_epi32(u[6], rnding);
845 0 : u[6] = _mm_srai_epi32(u[6], bit);
846 :
847 0 : u[7] = _mm_mullo_epi32(in[3], cospi12);
848 0 : x = _mm_mullo_epi32(in[13], cospi52);
849 0 : u[7] = _mm_sub_epi32(u[7], x);
850 0 : u[7] = _mm_add_epi32(u[7], rnding);
851 0 : u[7] = _mm_srai_epi32(u[7], bit);
852 :
853 : // stage 3
854 0 : v[0] = _mm_add_epi32(u[0], u[4]);
855 0 : v[4] = _mm_sub_epi32(u[0], u[4]);
856 0 : v[1] = _mm_add_epi32(u[1], u[5]);
857 0 : v[5] = _mm_sub_epi32(u[1], u[5]);
858 0 : v[2] = _mm_add_epi32(u[2], u[6]);
859 0 : v[6] = _mm_sub_epi32(u[2], u[6]);
860 0 : v[3] = _mm_add_epi32(u[3], u[7]);
861 0 : v[7] = _mm_sub_epi32(u[3], u[7]);
862 :
863 : // stage 4
864 0 : u[0] = v[0];
865 0 : u[1] = v[1];
866 0 : u[2] = v[2];
867 0 : u[3] = v[3];
868 :
869 0 : u[4] = _mm_mullo_epi32(v[4], cospi16);
870 0 : x = _mm_mullo_epi32(v[5], cospi48);
871 0 : u[4] = _mm_add_epi32(u[4], x);
872 0 : u[4] = _mm_add_epi32(u[4], rnding);
873 0 : u[4] = _mm_srai_epi32(u[4], bit);
874 :
875 0 : u[5] = _mm_mullo_epi32(v[4], cospi48);
876 0 : x = _mm_mullo_epi32(v[5], cospi16);
877 0 : u[5] = _mm_sub_epi32(u[5], x);
878 0 : u[5] = _mm_add_epi32(u[5], rnding);
879 0 : u[5] = _mm_srai_epi32(u[5], bit);
880 :
881 0 : u[6] = _mm_mullo_epi32(v[6], cospim48);
882 0 : x = _mm_mullo_epi32(v[7], cospi16);
883 0 : u[6] = _mm_add_epi32(u[6], x);
884 0 : u[6] = _mm_add_epi32(u[6], rnding);
885 0 : u[6] = _mm_srai_epi32(u[6], bit);
886 :
887 0 : u[7] = _mm_mullo_epi32(v[6], cospi16);
888 0 : x = _mm_mullo_epi32(v[7], cospim48);
889 0 : u[7] = _mm_sub_epi32(u[7], x);
890 0 : u[7] = _mm_add_epi32(u[7], rnding);
891 0 : u[7] = _mm_srai_epi32(u[7], bit);
892 :
893 : // stage 5
894 0 : v[0] = _mm_add_epi32(u[0], u[2]);
895 0 : v[2] = _mm_sub_epi32(u[0], u[2]);
896 0 : v[1] = _mm_add_epi32(u[1], u[3]);
897 0 : v[3] = _mm_sub_epi32(u[1], u[3]);
898 0 : v[4] = _mm_add_epi32(u[4], u[6]);
899 0 : v[6] = _mm_sub_epi32(u[4], u[6]);
900 0 : v[5] = _mm_add_epi32(u[5], u[7]);
901 0 : v[7] = _mm_sub_epi32(u[5], u[7]);
902 :
903 : // stage 6
904 0 : u[0] = v[0];
905 0 : u[1] = v[1];
906 0 : u[4] = v[4];
907 0 : u[5] = v[5];
908 :
909 0 : v[0] = _mm_mullo_epi32(v[2], cospi32);
910 0 : x = _mm_mullo_epi32(v[3], cospi32);
911 0 : u[2] = _mm_add_epi32(v[0], x);
912 0 : u[2] = _mm_add_epi32(u[2], rnding);
913 0 : u[2] = _mm_srai_epi32(u[2], bit);
914 :
915 0 : u[3] = _mm_sub_epi32(v[0], x);
916 0 : u[3] = _mm_add_epi32(u[3], rnding);
917 0 : u[3] = _mm_srai_epi32(u[3], bit);
918 :
919 0 : v[0] = _mm_mullo_epi32(v[6], cospi32);
920 0 : x = _mm_mullo_epi32(v[7], cospi32);
921 0 : u[6] = _mm_add_epi32(v[0], x);
922 0 : u[6] = _mm_add_epi32(u[6], rnding);
923 0 : u[6] = _mm_srai_epi32(u[6], bit);
924 :
925 0 : u[7] = _mm_sub_epi32(v[0], x);
926 0 : u[7] = _mm_add_epi32(u[7], rnding);
927 0 : u[7] = _mm_srai_epi32(u[7], bit);
928 :
929 : // stage 7
930 0 : out[1] = u[0];
931 0 : out[3] = _mm_sub_epi32(kZero, u[4]);
932 0 : out[5] = u[6];
933 0 : out[7] = _mm_sub_epi32(kZero, u[2]);
934 0 : out[9] = u[3];
935 0 : out[11] = _mm_sub_epi32(kZero, u[7]);
936 0 : out[13] = u[5];
937 0 : out[15] = _mm_sub_epi32(kZero, u[1]);
938 0 : }
939 :
940 0 : void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
941 : int tx_type, int bd) {
942 : __m128i in[16], out[16];
943 0 : const TXFM_1D_CFG *row_cfg = NULL;
944 0 : const TXFM_1D_CFG *col_cfg = NULL;
945 :
946 0 : switch (tx_type) {
947 : case DCT_DCT:
948 0 : row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
949 0 : col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
950 0 : load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
951 0 : fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
952 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
953 0 : transpose_8x8(out, in);
954 0 : fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
955 0 : transpose_8x8(out, in);
956 0 : write_buffer_8x8(in, coeff);
957 0 : break;
958 : case ADST_DCT:
959 0 : row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
960 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
961 0 : load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
962 0 : fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
963 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
964 0 : transpose_8x8(out, in);
965 0 : fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
966 0 : transpose_8x8(out, in);
967 0 : write_buffer_8x8(in, coeff);
968 0 : break;
969 : case DCT_ADST:
970 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
971 0 : col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
972 0 : load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
973 0 : fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
974 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
975 0 : transpose_8x8(out, in);
976 0 : fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
977 0 : transpose_8x8(out, in);
978 0 : write_buffer_8x8(in, coeff);
979 0 : break;
980 : case ADST_ADST:
981 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
982 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
983 0 : load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
984 0 : fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
985 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
986 0 : transpose_8x8(out, in);
987 0 : fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
988 0 : transpose_8x8(out, in);
989 0 : write_buffer_8x8(in, coeff);
990 0 : break;
991 : #if CONFIG_EXT_TX
992 : case FLIPADST_DCT:
993 0 : row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
994 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
995 0 : load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
996 0 : fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
997 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
998 0 : transpose_8x8(out, in);
999 0 : fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
1000 0 : transpose_8x8(out, in);
1001 0 : write_buffer_8x8(in, coeff);
1002 0 : break;
1003 : case DCT_FLIPADST:
1004 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
1005 0 : col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
1006 0 : load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
1007 0 : fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
1008 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
1009 0 : transpose_8x8(out, in);
1010 0 : fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
1011 0 : transpose_8x8(out, in);
1012 0 : write_buffer_8x8(in, coeff);
1013 0 : break;
1014 : case FLIPADST_FLIPADST:
1015 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
1016 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
1017 0 : load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]);
1018 0 : fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
1019 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
1020 0 : transpose_8x8(out, in);
1021 0 : fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
1022 0 : transpose_8x8(out, in);
1023 0 : write_buffer_8x8(in, coeff);
1024 0 : break;
1025 : case ADST_FLIPADST:
1026 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
1027 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
1028 0 : load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
1029 0 : fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
1030 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
1031 0 : transpose_8x8(out, in);
1032 0 : fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
1033 0 : transpose_8x8(out, in);
1034 0 : write_buffer_8x8(in, coeff);
1035 0 : break;
1036 : case FLIPADST_ADST:
1037 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
1038 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
1039 0 : load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
1040 0 : fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
1041 0 : col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
1042 0 : transpose_8x8(out, in);
1043 0 : fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
1044 0 : transpose_8x8(out, in);
1045 0 : write_buffer_8x8(in, coeff);
1046 0 : break;
1047 : #endif // CONFIG_EXT_TX
1048 0 : default: assert(0);
1049 : }
1050 : (void)bd;
1051 0 : }
1052 :
1053 : // Hybrid Transform 16x16
1054 :
1055 0 : static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
1056 0 : int row_index = 0;
1057 0 : int dst_index = 0;
1058 0 : int src_index = 0;
1059 :
1060 : // row 0, 1, .., 7
1061 : do {
1062 0 : out[dst_index] = in[src_index];
1063 0 : out[dst_index + 1] = in[src_index + 1];
1064 0 : out[dst_index + 2] = in[src_index + 16];
1065 0 : out[dst_index + 3] = in[src_index + 17];
1066 0 : dst_index += 4;
1067 0 : src_index += 2;
1068 0 : row_index += 1;
1069 0 : } while (row_index < 8);
1070 :
1071 : // row 8, 9, ..., 15
1072 0 : src_index += 16;
1073 : do {
1074 0 : out[dst_index] = in[src_index];
1075 0 : out[dst_index + 1] = in[src_index + 1];
1076 0 : out[dst_index + 2] = in[src_index + 16];
1077 0 : out[dst_index + 3] = in[src_index + 17];
1078 0 : dst_index += 4;
1079 0 : src_index += 2;
1080 0 : row_index += 1;
1081 0 : } while (row_index < 16);
1082 0 : }
1083 :
1084 0 : static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
1085 : int stride, int flipud, int fliplr,
1086 : int shift) {
1087 : __m128i in[64];
1088 : // Load 4 8x8 blocks
1089 0 : const int16_t *topL = input;
1090 0 : const int16_t *topR = input + 8;
1091 0 : const int16_t *botL = input + 8 * stride;
1092 0 : const int16_t *botR = input + 8 * stride + 8;
1093 :
1094 : const int16_t *tmp;
1095 :
1096 0 : if (flipud) {
1097 : // Swap left columns
1098 0 : tmp = topL;
1099 0 : topL = botL;
1100 0 : botL = tmp;
1101 : // Swap right columns
1102 0 : tmp = topR;
1103 0 : topR = botR;
1104 0 : botR = tmp;
1105 : }
1106 :
1107 0 : if (fliplr) {
1108 : // Swap top rows
1109 0 : tmp = topL;
1110 0 : topL = topR;
1111 0 : topR = tmp;
1112 : // Swap bottom rows
1113 0 : tmp = botL;
1114 0 : botL = botR;
1115 0 : botR = tmp;
1116 : }
1117 :
1118 : // load first 8 columns
1119 0 : load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
1120 0 : load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
1121 :
1122 : // load second 8 columns
1123 0 : load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
1124 0 : load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
1125 :
1126 0 : convert_8x8_to_16x16(in, out);
1127 0 : }
1128 :
// Forward 16x16 DCT on 32-bit lanes (SSE4.1).
// 'in' and 'out' hold the 16x16 block as 64 __m128i vectors: each of the 16
// rows is col_num (= 4) vectors of four 32-bit coefficients, so element
// (r, c) lives in lane (c % 4) of in[r * 4 + c / 4].
// 'bit' selects the cosine-table precision via cospi_arr(); every butterfly
// product is rounded by adding 1 << (bit - 1) and arithmetic-shifted right
// by 'bit'.
static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  // Broadcast the cosine constants used by the rotations below
  // (cospimNN = -cospi[NN]).
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  // Rounding offset applied before each right shift by 'bit'.
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i u[16], v[16], x;
  const int col_num = 4;
  int col;

  // Calculate the column 0, 1, 2, 3
  for (col = 0; col < col_num; ++col) {
    // stage 0
    // stage 1
    // Butterfly: sums of mirrored rows go to u[0..7], differences to
    // u[15..8].
    u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);

    // stage 2
    // Butterfly u[0..7]; rotate u[10..13] by +/-cospi32; u[8], u[9],
    // u[14], u[15] pass through.
    v[0] = _mm_add_epi32(u[0], u[7]);
    v[7] = _mm_sub_epi32(u[0], u[7]);
    v[1] = _mm_add_epi32(u[1], u[6]);
    v[6] = _mm_sub_epi32(u[1], u[6]);
    v[2] = _mm_add_epi32(u[2], u[5]);
    v[5] = _mm_sub_epi32(u[2], u[5]);
    v[3] = _mm_add_epi32(u[3], u[4]);
    v[4] = _mm_sub_epi32(u[3], u[4]);
    v[8] = u[8];
    v[9] = u[9];

    v[10] = _mm_mullo_epi32(u[10], cospim32);
    x = _mm_mullo_epi32(u[13], cospi32);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_mullo_epi32(u[10], cospi32);
    x = _mm_mullo_epi32(u[13], cospim32);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[11] = _mm_mullo_epi32(u[11], cospim32);
    x = _mm_mullo_epi32(u[12], cospi32);
    v[11] = _mm_add_epi32(v[11], x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_mullo_epi32(u[11], cospi32);
    x = _mm_mullo_epi32(u[12], cospim32);
    v[12] = _mm_sub_epi32(v[12], x);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);
    v[14] = u[14];
    v[15] = u[15];

    // stage 3
    u[0] = _mm_add_epi32(v[0], v[3]);
    u[3] = _mm_sub_epi32(v[0], v[3]);
    u[1] = _mm_add_epi32(v[1], v[2]);
    u[2] = _mm_sub_epi32(v[1], v[2]);
    u[4] = v[4];

    u[5] = _mm_mullo_epi32(v[5], cospim32);
    x = _mm_mullo_epi32(v[6], cospi32);
    u[5] = _mm_add_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_mullo_epi32(v[5], cospi32);
    x = _mm_mullo_epi32(v[6], cospim32);
    u[6] = _mm_sub_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = v[7];
    u[8] = _mm_add_epi32(v[8], v[11]);
    u[11] = _mm_sub_epi32(v[8], v[11]);
    u[9] = _mm_add_epi32(v[9], v[10]);
    u[10] = _mm_sub_epi32(v[9], v[10]);
    u[12] = _mm_sub_epi32(v[15], v[12]);
    u[15] = _mm_add_epi32(v[15], v[12]);
    u[13] = _mm_sub_epi32(v[14], v[13]);
    u[14] = _mm_add_epi32(v[14], v[13]);

    // stage 4
    u[0] = _mm_mullo_epi32(u[0], cospi32);
    u[1] = _mm_mullo_epi32(u[1], cospi32);
    v[0] = _mm_add_epi32(u[0], u[1]);
    v[0] = _mm_add_epi32(v[0], rnding);
    v[0] = _mm_srai_epi32(v[0], bit);

    v[1] = _mm_sub_epi32(u[0], u[1]);
    v[1] = _mm_add_epi32(v[1], rnding);
    v[1] = _mm_srai_epi32(v[1], bit);

    v[2] = _mm_mullo_epi32(u[2], cospi48);
    x = _mm_mullo_epi32(u[3], cospi16);
    v[2] = _mm_add_epi32(v[2], x);
    v[2] = _mm_add_epi32(v[2], rnding);
    v[2] = _mm_srai_epi32(v[2], bit);

    // Note the operand order: v[3] = u[3]*cospi48 - u[2]*cospi16.
    v[3] = _mm_mullo_epi32(u[2], cospi16);
    x = _mm_mullo_epi32(u[3], cospi48);
    v[3] = _mm_sub_epi32(x, v[3]);
    v[3] = _mm_add_epi32(v[3], rnding);
    v[3] = _mm_srai_epi32(v[3], bit);

    v[4] = _mm_add_epi32(u[4], u[5]);
    v[5] = _mm_sub_epi32(u[4], u[5]);
    v[6] = _mm_sub_epi32(u[7], u[6]);
    v[7] = _mm_add_epi32(u[7], u[6]);
    v[8] = u[8];

    v[9] = _mm_mullo_epi32(u[9], cospim16);
    x = _mm_mullo_epi32(u[14], cospi48);
    v[9] = _mm_add_epi32(v[9], x);
    v[9] = _mm_add_epi32(v[9], rnding);
    v[9] = _mm_srai_epi32(v[9], bit);

    v[14] = _mm_mullo_epi32(u[9], cospi48);
    x = _mm_mullo_epi32(u[14], cospim16);
    v[14] = _mm_sub_epi32(v[14], x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[10] = _mm_mullo_epi32(u[10], cospim48);
    x = _mm_mullo_epi32(u[13], cospim16);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_mullo_epi32(u[10], cospim16);
    x = _mm_mullo_epi32(u[13], cospim48);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[11] = u[11];
    v[12] = u[12];
    v[15] = u[15];

    // stage 5
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];

    u[4] = _mm_mullo_epi32(v[4], cospi56);
    x = _mm_mullo_epi32(v[7], cospi8);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    // u[7] = v[7]*cospi56 - v[4]*cospi8 (operands swapped in the sub).
    u[7] = _mm_mullo_epi32(v[4], cospi8);
    x = _mm_mullo_epi32(v[7], cospi56);
    u[7] = _mm_sub_epi32(x, u[7]);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    u[5] = _mm_mullo_epi32(v[5], cospi24);
    x = _mm_mullo_epi32(v[6], cospi40);
    u[5] = _mm_add_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_mullo_epi32(v[5], cospi40);
    x = _mm_mullo_epi32(v[6], cospi24);
    u[6] = _mm_sub_epi32(x, u[6]);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[8] = _mm_add_epi32(v[8], v[9]);
    u[9] = _mm_sub_epi32(v[8], v[9]);
    u[10] = _mm_sub_epi32(v[11], v[10]);
    u[11] = _mm_add_epi32(v[11], v[10]);
    u[12] = _mm_add_epi32(v[12], v[13]);
    u[13] = _mm_sub_epi32(v[12], v[13]);
    u[14] = _mm_sub_epi32(v[15], v[14]);
    u[15] = _mm_add_epi32(v[15], v[14]);

    // stage 6
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = _mm_mullo_epi32(u[8], cospi60);
    x = _mm_mullo_epi32(u[15], cospi4);
    v[8] = _mm_add_epi32(v[8], x);
    v[8] = _mm_add_epi32(v[8], rnding);
    v[8] = _mm_srai_epi32(v[8], bit);

    v[15] = _mm_mullo_epi32(u[8], cospi4);
    x = _mm_mullo_epi32(u[15], cospi60);
    v[15] = _mm_sub_epi32(x, v[15]);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    v[9] = _mm_mullo_epi32(u[9], cospi28);
    x = _mm_mullo_epi32(u[14], cospi36);
    v[9] = _mm_add_epi32(v[9], x);
    v[9] = _mm_add_epi32(v[9], rnding);
    v[9] = _mm_srai_epi32(v[9], bit);

    v[14] = _mm_mullo_epi32(u[9], cospi36);
    x = _mm_mullo_epi32(u[14], cospi28);
    v[14] = _mm_sub_epi32(x, v[14]);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[10] = _mm_mullo_epi32(u[10], cospi44);
    x = _mm_mullo_epi32(u[13], cospi20);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_mullo_epi32(u[10], cospi20);
    x = _mm_mullo_epi32(u[13], cospi44);
    v[13] = _mm_sub_epi32(x, v[13]);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[11] = _mm_mullo_epi32(u[11], cospi12);
    x = _mm_mullo_epi32(u[12], cospi52);
    v[11] = _mm_add_epi32(v[11], x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_mullo_epi32(u[11], cospi52);
    x = _mm_mullo_epi32(u[12], cospi12);
    v[12] = _mm_sub_epi32(x, v[12]);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    // stage 7: store the outputs in 4-bit bit-reversed row order
    // (0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15).
    out[0 * col_num + col] = v[0];
    out[1 * col_num + col] = v[8];
    out[2 * col_num + col] = v[4];
    out[3 * col_num + col] = v[12];
    out[4 * col_num + col] = v[2];
    out[5 * col_num + col] = v[10];
    out[6 * col_num + col] = v[6];
    out[7 * col_num + col] = v[14];
    out[8 * col_num + col] = v[1];
    out[9 * col_num + col] = v[9];
    out[10 * col_num + col] = v[5];
    out[11 * col_num + col] = v[13];
    out[12 * col_num + col] = v[3];
    out[13 * col_num + col] = v[11];
    out[14 * col_num + col] = v[7];
    out[15 * col_num + col] = v[15];
  }
}
1414 :
1415 0 : static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
1416 0 : const int32_t *cospi = cospi_arr(bit);
1417 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1418 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1419 0 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1420 0 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1421 0 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1422 0 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1423 0 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1424 0 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1425 0 : const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
1426 0 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1427 0 : const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
1428 0 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1429 0 : const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
1430 0 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1431 0 : const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
1432 0 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1433 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1434 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1435 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1436 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1437 0 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
1438 0 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1439 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1440 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1441 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1442 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1443 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1444 : __m128i u[16], v[16], x, y;
1445 0 : const int col_num = 4;
1446 : int col;
1447 :
1448 : // Calculate the column 0, 1, 2, 3
1449 0 : for (col = 0; col < col_num; ++col) {
1450 : // stage 0
1451 : // stage 1
1452 : // stage 2
1453 0 : v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
1454 0 : x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
1455 0 : v[0] = _mm_add_epi32(v[0], x);
1456 0 : v[0] = _mm_add_epi32(v[0], rnding);
1457 0 : v[0] = _mm_srai_epi32(v[0], bit);
1458 :
1459 0 : v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
1460 0 : x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
1461 0 : v[1] = _mm_sub_epi32(v[1], x);
1462 0 : v[1] = _mm_add_epi32(v[1], rnding);
1463 0 : v[1] = _mm_srai_epi32(v[1], bit);
1464 :
1465 0 : v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
1466 0 : x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
1467 0 : v[2] = _mm_add_epi32(v[2], x);
1468 0 : v[2] = _mm_add_epi32(v[2], rnding);
1469 0 : v[2] = _mm_srai_epi32(v[2], bit);
1470 :
1471 0 : v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
1472 0 : x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
1473 0 : v[3] = _mm_sub_epi32(v[3], x);
1474 0 : v[3] = _mm_add_epi32(v[3], rnding);
1475 0 : v[3] = _mm_srai_epi32(v[3], bit);
1476 :
1477 0 : v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
1478 0 : x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
1479 0 : v[4] = _mm_add_epi32(v[4], x);
1480 0 : v[4] = _mm_add_epi32(v[4], rnding);
1481 0 : v[4] = _mm_srai_epi32(v[4], bit);
1482 :
1483 0 : v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
1484 0 : x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
1485 0 : v[5] = _mm_sub_epi32(v[5], x);
1486 0 : v[5] = _mm_add_epi32(v[5], rnding);
1487 0 : v[5] = _mm_srai_epi32(v[5], bit);
1488 :
1489 0 : v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
1490 0 : x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
1491 0 : v[6] = _mm_add_epi32(v[6], x);
1492 0 : v[6] = _mm_add_epi32(v[6], rnding);
1493 0 : v[6] = _mm_srai_epi32(v[6], bit);
1494 :
1495 0 : v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
1496 0 : x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
1497 0 : v[7] = _mm_sub_epi32(v[7], x);
1498 0 : v[7] = _mm_add_epi32(v[7], rnding);
1499 0 : v[7] = _mm_srai_epi32(v[7], bit);
1500 :
1501 0 : v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
1502 0 : x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
1503 0 : v[8] = _mm_add_epi32(v[8], x);
1504 0 : v[8] = _mm_add_epi32(v[8], rnding);
1505 0 : v[8] = _mm_srai_epi32(v[8], bit);
1506 :
1507 0 : v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
1508 0 : x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
1509 0 : v[9] = _mm_sub_epi32(v[9], x);
1510 0 : v[9] = _mm_add_epi32(v[9], rnding);
1511 0 : v[9] = _mm_srai_epi32(v[9], bit);
1512 :
1513 0 : v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
1514 0 : x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
1515 0 : v[10] = _mm_add_epi32(v[10], x);
1516 0 : v[10] = _mm_add_epi32(v[10], rnding);
1517 0 : v[10] = _mm_srai_epi32(v[10], bit);
1518 :
1519 0 : v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
1520 0 : x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
1521 0 : v[11] = _mm_sub_epi32(v[11], x);
1522 0 : v[11] = _mm_add_epi32(v[11], rnding);
1523 0 : v[11] = _mm_srai_epi32(v[11], bit);
1524 :
1525 0 : v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
1526 0 : x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
1527 0 : v[12] = _mm_add_epi32(v[12], x);
1528 0 : v[12] = _mm_add_epi32(v[12], rnding);
1529 0 : v[12] = _mm_srai_epi32(v[12], bit);
1530 :
1531 0 : v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
1532 0 : x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
1533 0 : v[13] = _mm_sub_epi32(v[13], x);
1534 0 : v[13] = _mm_add_epi32(v[13], rnding);
1535 0 : v[13] = _mm_srai_epi32(v[13], bit);
1536 :
1537 0 : v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
1538 0 : x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
1539 0 : v[14] = _mm_add_epi32(v[14], x);
1540 0 : v[14] = _mm_add_epi32(v[14], rnding);
1541 0 : v[14] = _mm_srai_epi32(v[14], bit);
1542 :
1543 0 : v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
1544 0 : x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
1545 0 : v[15] = _mm_sub_epi32(v[15], x);
1546 0 : v[15] = _mm_add_epi32(v[15], rnding);
1547 0 : v[15] = _mm_srai_epi32(v[15], bit);
1548 :
1549 : // stage 3
1550 0 : u[0] = _mm_add_epi32(v[0], v[8]);
1551 0 : u[8] = _mm_sub_epi32(v[0], v[8]);
1552 0 : u[1] = _mm_add_epi32(v[1], v[9]);
1553 0 : u[9] = _mm_sub_epi32(v[1], v[9]);
1554 0 : u[2] = _mm_add_epi32(v[2], v[10]);
1555 0 : u[10] = _mm_sub_epi32(v[2], v[10]);
1556 0 : u[3] = _mm_add_epi32(v[3], v[11]);
1557 0 : u[11] = _mm_sub_epi32(v[3], v[11]);
1558 0 : u[4] = _mm_add_epi32(v[4], v[12]);
1559 0 : u[12] = _mm_sub_epi32(v[4], v[12]);
1560 0 : u[5] = _mm_add_epi32(v[5], v[13]);
1561 0 : u[13] = _mm_sub_epi32(v[5], v[13]);
1562 0 : u[6] = _mm_add_epi32(v[6], v[14]);
1563 0 : u[14] = _mm_sub_epi32(v[6], v[14]);
1564 0 : u[7] = _mm_add_epi32(v[7], v[15]);
1565 0 : u[15] = _mm_sub_epi32(v[7], v[15]);
1566 :
1567 : // stage 4
1568 0 : v[0] = u[0];
1569 0 : v[1] = u[1];
1570 0 : v[2] = u[2];
1571 0 : v[3] = u[3];
1572 0 : v[4] = u[4];
1573 0 : v[5] = u[5];
1574 0 : v[6] = u[6];
1575 0 : v[7] = u[7];
1576 :
1577 0 : v[8] = _mm_mullo_epi32(u[8], cospi8);
1578 0 : x = _mm_mullo_epi32(u[9], cospi56);
1579 0 : v[8] = _mm_add_epi32(v[8], x);
1580 0 : v[8] = _mm_add_epi32(v[8], rnding);
1581 0 : v[8] = _mm_srai_epi32(v[8], bit);
1582 :
1583 0 : v[9] = _mm_mullo_epi32(u[8], cospi56);
1584 0 : x = _mm_mullo_epi32(u[9], cospi8);
1585 0 : v[9] = _mm_sub_epi32(v[9], x);
1586 0 : v[9] = _mm_add_epi32(v[9], rnding);
1587 0 : v[9] = _mm_srai_epi32(v[9], bit);
1588 :
1589 0 : v[10] = _mm_mullo_epi32(u[10], cospi40);
1590 0 : x = _mm_mullo_epi32(u[11], cospi24);
1591 0 : v[10] = _mm_add_epi32(v[10], x);
1592 0 : v[10] = _mm_add_epi32(v[10], rnding);
1593 0 : v[10] = _mm_srai_epi32(v[10], bit);
1594 :
1595 0 : v[11] = _mm_mullo_epi32(u[10], cospi24);
1596 0 : x = _mm_mullo_epi32(u[11], cospi40);
1597 0 : v[11] = _mm_sub_epi32(v[11], x);
1598 0 : v[11] = _mm_add_epi32(v[11], rnding);
1599 0 : v[11] = _mm_srai_epi32(v[11], bit);
1600 :
1601 0 : v[12] = _mm_mullo_epi32(u[12], cospim56);
1602 0 : x = _mm_mullo_epi32(u[13], cospi8);
1603 0 : v[12] = _mm_add_epi32(v[12], x);
1604 0 : v[12] = _mm_add_epi32(v[12], rnding);
1605 0 : v[12] = _mm_srai_epi32(v[12], bit);
1606 :
1607 0 : v[13] = _mm_mullo_epi32(u[12], cospi8);
1608 0 : x = _mm_mullo_epi32(u[13], cospim56);
1609 0 : v[13] = _mm_sub_epi32(v[13], x);
1610 0 : v[13] = _mm_add_epi32(v[13], rnding);
1611 0 : v[13] = _mm_srai_epi32(v[13], bit);
1612 :
1613 0 : v[14] = _mm_mullo_epi32(u[14], cospim24);
1614 0 : x = _mm_mullo_epi32(u[15], cospi40);
1615 0 : v[14] = _mm_add_epi32(v[14], x);
1616 0 : v[14] = _mm_add_epi32(v[14], rnding);
1617 0 : v[14] = _mm_srai_epi32(v[14], bit);
1618 :
1619 0 : v[15] = _mm_mullo_epi32(u[14], cospi40);
1620 0 : x = _mm_mullo_epi32(u[15], cospim24);
1621 0 : v[15] = _mm_sub_epi32(v[15], x);
1622 0 : v[15] = _mm_add_epi32(v[15], rnding);
1623 0 : v[15] = _mm_srai_epi32(v[15], bit);
1624 :
1625 : // stage 5
1626 0 : u[0] = _mm_add_epi32(v[0], v[4]);
1627 0 : u[4] = _mm_sub_epi32(v[0], v[4]);
1628 0 : u[1] = _mm_add_epi32(v[1], v[5]);
1629 0 : u[5] = _mm_sub_epi32(v[1], v[5]);
1630 0 : u[2] = _mm_add_epi32(v[2], v[6]);
1631 0 : u[6] = _mm_sub_epi32(v[2], v[6]);
1632 0 : u[3] = _mm_add_epi32(v[3], v[7]);
1633 0 : u[7] = _mm_sub_epi32(v[3], v[7]);
1634 0 : u[8] = _mm_add_epi32(v[8], v[12]);
1635 0 : u[12] = _mm_sub_epi32(v[8], v[12]);
1636 0 : u[9] = _mm_add_epi32(v[9], v[13]);
1637 0 : u[13] = _mm_sub_epi32(v[9], v[13]);
1638 0 : u[10] = _mm_add_epi32(v[10], v[14]);
1639 0 : u[14] = _mm_sub_epi32(v[10], v[14]);
1640 0 : u[11] = _mm_add_epi32(v[11], v[15]);
1641 0 : u[15] = _mm_sub_epi32(v[11], v[15]);
1642 :
1643 : // stage 6
1644 0 : v[0] = u[0];
1645 0 : v[1] = u[1];
1646 0 : v[2] = u[2];
1647 0 : v[3] = u[3];
1648 :
1649 0 : v[4] = _mm_mullo_epi32(u[4], cospi16);
1650 0 : x = _mm_mullo_epi32(u[5], cospi48);
1651 0 : v[4] = _mm_add_epi32(v[4], x);
1652 0 : v[4] = _mm_add_epi32(v[4], rnding);
1653 0 : v[4] = _mm_srai_epi32(v[4], bit);
1654 :
1655 0 : v[5] = _mm_mullo_epi32(u[4], cospi48);
1656 0 : x = _mm_mullo_epi32(u[5], cospi16);
1657 0 : v[5] = _mm_sub_epi32(v[5], x);
1658 0 : v[5] = _mm_add_epi32(v[5], rnding);
1659 0 : v[5] = _mm_srai_epi32(v[5], bit);
1660 :
1661 0 : v[6] = _mm_mullo_epi32(u[6], cospim48);
1662 0 : x = _mm_mullo_epi32(u[7], cospi16);
1663 0 : v[6] = _mm_add_epi32(v[6], x);
1664 0 : v[6] = _mm_add_epi32(v[6], rnding);
1665 0 : v[6] = _mm_srai_epi32(v[6], bit);
1666 :
1667 0 : v[7] = _mm_mullo_epi32(u[6], cospi16);
1668 0 : x = _mm_mullo_epi32(u[7], cospim48);
1669 0 : v[7] = _mm_sub_epi32(v[7], x);
1670 0 : v[7] = _mm_add_epi32(v[7], rnding);
1671 0 : v[7] = _mm_srai_epi32(v[7], bit);
1672 :
1673 0 : v[8] = u[8];
1674 0 : v[9] = u[9];
1675 0 : v[10] = u[10];
1676 0 : v[11] = u[11];
1677 :
1678 0 : v[12] = _mm_mullo_epi32(u[12], cospi16);
1679 0 : x = _mm_mullo_epi32(u[13], cospi48);
1680 0 : v[12] = _mm_add_epi32(v[12], x);
1681 0 : v[12] = _mm_add_epi32(v[12], rnding);
1682 0 : v[12] = _mm_srai_epi32(v[12], bit);
1683 :
1684 0 : v[13] = _mm_mullo_epi32(u[12], cospi48);
1685 0 : x = _mm_mullo_epi32(u[13], cospi16);
1686 0 : v[13] = _mm_sub_epi32(v[13], x);
1687 0 : v[13] = _mm_add_epi32(v[13], rnding);
1688 0 : v[13] = _mm_srai_epi32(v[13], bit);
1689 :
1690 0 : v[14] = _mm_mullo_epi32(u[14], cospim48);
1691 0 : x = _mm_mullo_epi32(u[15], cospi16);
1692 0 : v[14] = _mm_add_epi32(v[14], x);
1693 0 : v[14] = _mm_add_epi32(v[14], rnding);
1694 0 : v[14] = _mm_srai_epi32(v[14], bit);
1695 :
1696 0 : v[15] = _mm_mullo_epi32(u[14], cospi16);
1697 0 : x = _mm_mullo_epi32(u[15], cospim48);
1698 0 : v[15] = _mm_sub_epi32(v[15], x);
1699 0 : v[15] = _mm_add_epi32(v[15], rnding);
1700 0 : v[15] = _mm_srai_epi32(v[15], bit);
1701 :
1702 : // stage 7
1703 0 : u[0] = _mm_add_epi32(v[0], v[2]);
1704 0 : u[2] = _mm_sub_epi32(v[0], v[2]);
1705 0 : u[1] = _mm_add_epi32(v[1], v[3]);
1706 0 : u[3] = _mm_sub_epi32(v[1], v[3]);
1707 0 : u[4] = _mm_add_epi32(v[4], v[6]);
1708 0 : u[6] = _mm_sub_epi32(v[4], v[6]);
1709 0 : u[5] = _mm_add_epi32(v[5], v[7]);
1710 0 : u[7] = _mm_sub_epi32(v[5], v[7]);
1711 0 : u[8] = _mm_add_epi32(v[8], v[10]);
1712 0 : u[10] = _mm_sub_epi32(v[8], v[10]);
1713 0 : u[9] = _mm_add_epi32(v[9], v[11]);
1714 0 : u[11] = _mm_sub_epi32(v[9], v[11]);
1715 0 : u[12] = _mm_add_epi32(v[12], v[14]);
1716 0 : u[14] = _mm_sub_epi32(v[12], v[14]);
1717 0 : u[13] = _mm_add_epi32(v[13], v[15]);
1718 0 : u[15] = _mm_sub_epi32(v[13], v[15]);
1719 :
1720 : // stage 8
1721 0 : v[0] = u[0];
1722 0 : v[1] = u[1];
1723 :
1724 0 : y = _mm_mullo_epi32(u[2], cospi32);
1725 0 : x = _mm_mullo_epi32(u[3], cospi32);
1726 0 : v[2] = _mm_add_epi32(y, x);
1727 0 : v[2] = _mm_add_epi32(v[2], rnding);
1728 0 : v[2] = _mm_srai_epi32(v[2], bit);
1729 :
1730 0 : v[3] = _mm_sub_epi32(y, x);
1731 0 : v[3] = _mm_add_epi32(v[3], rnding);
1732 0 : v[3] = _mm_srai_epi32(v[3], bit);
1733 :
1734 0 : v[4] = u[4];
1735 0 : v[5] = u[5];
1736 :
1737 0 : y = _mm_mullo_epi32(u[6], cospi32);
1738 0 : x = _mm_mullo_epi32(u[7], cospi32);
1739 0 : v[6] = _mm_add_epi32(y, x);
1740 0 : v[6] = _mm_add_epi32(v[6], rnding);
1741 0 : v[6] = _mm_srai_epi32(v[6], bit);
1742 :
1743 0 : v[7] = _mm_sub_epi32(y, x);
1744 0 : v[7] = _mm_add_epi32(v[7], rnding);
1745 0 : v[7] = _mm_srai_epi32(v[7], bit);
1746 :
1747 0 : v[8] = u[8];
1748 0 : v[9] = u[9];
1749 :
1750 0 : y = _mm_mullo_epi32(u[10], cospi32);
1751 0 : x = _mm_mullo_epi32(u[11], cospi32);
1752 0 : v[10] = _mm_add_epi32(y, x);
1753 0 : v[10] = _mm_add_epi32(v[10], rnding);
1754 0 : v[10] = _mm_srai_epi32(v[10], bit);
1755 :
1756 0 : v[11] = _mm_sub_epi32(y, x);
1757 0 : v[11] = _mm_add_epi32(v[11], rnding);
1758 0 : v[11] = _mm_srai_epi32(v[11], bit);
1759 :
1760 0 : v[12] = u[12];
1761 0 : v[13] = u[13];
1762 :
1763 0 : y = _mm_mullo_epi32(u[14], cospi32);
1764 0 : x = _mm_mullo_epi32(u[15], cospi32);
1765 0 : v[14] = _mm_add_epi32(y, x);
1766 0 : v[14] = _mm_add_epi32(v[14], rnding);
1767 0 : v[14] = _mm_srai_epi32(v[14], bit);
1768 :
1769 0 : v[15] = _mm_sub_epi32(y, x);
1770 0 : v[15] = _mm_add_epi32(v[15], rnding);
1771 0 : v[15] = _mm_srai_epi32(v[15], bit);
1772 :
1773 : // stage 9
1774 0 : out[0 * col_num + col] = v[0];
1775 0 : out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
1776 0 : out[2 * col_num + col] = v[12];
1777 0 : out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
1778 0 : out[4 * col_num + col] = v[6];
1779 0 : out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
1780 0 : out[6 * col_num + col] = v[10];
1781 0 : out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
1782 0 : out[8 * col_num + col] = v[3];
1783 0 : out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
1784 0 : out[10 * col_num + col] = v[15];
1785 0 : out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
1786 0 : out[12 * col_num + col] = v[5];
1787 0 : out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
1788 0 : out[14 * col_num + col] = v[9];
1789 0 : out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
1790 : }
1791 0 : }
1792 :
static void col_txfm_16x16_rounding(__m128i *in, int shift) {
  // A 16x16 block occupies 64 __m128i registers. Round it as four
  // consecutive 8x8 sub-blocks (16 registers apiece) instead of walking
  // the four columns individually.
  int i;
  for (i = 0; i < 64; i += 16) {
    col_txfm_8x8_rounding(&in[i], shift);
  }
}
1802 :
1803 0 : static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
1804 0 : const int size_8x8 = 16 * 4;
1805 0 : write_buffer_8x8(&in[0], output);
1806 0 : output += size_8x8;
1807 0 : write_buffer_8x8(&in[16], output);
1808 0 : output += size_8x8;
1809 0 : write_buffer_8x8(&in[32], output);
1810 0 : output += size_8x8;
1811 0 : write_buffer_8x8(&in[48], output);
1812 0 : }
1813 :
1814 0 : void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
1815 : int stride, int tx_type, int bd) {
1816 : __m128i in[64], out[64];
1817 0 : const TXFM_1D_CFG *row_cfg = NULL;
1818 0 : const TXFM_1D_CFG *col_cfg = NULL;
1819 :
1820 0 : switch (tx_type) {
1821 : case DCT_DCT:
1822 0 : row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
1823 0 : col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
1824 0 : load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
1825 0 : fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1826 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1827 0 : transpose_16x16(out, in);
1828 0 : fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1829 0 : transpose_16x16(out, in);
1830 0 : write_buffer_16x16(in, coeff);
1831 0 : break;
1832 : case ADST_DCT:
1833 0 : row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
1834 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
1835 0 : load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
1836 0 : fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1837 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1838 0 : transpose_16x16(out, in);
1839 0 : fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1840 0 : transpose_16x16(out, in);
1841 0 : write_buffer_16x16(in, coeff);
1842 0 : break;
1843 : case DCT_ADST:
1844 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
1845 0 : col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
1846 0 : load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
1847 0 : fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1848 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1849 0 : transpose_16x16(out, in);
1850 0 : fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1851 0 : transpose_16x16(out, in);
1852 0 : write_buffer_16x16(in, coeff);
1853 0 : break;
1854 : case ADST_ADST:
1855 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
1856 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
1857 0 : load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
1858 0 : fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1859 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1860 0 : transpose_16x16(out, in);
1861 0 : fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1862 0 : transpose_16x16(out, in);
1863 0 : write_buffer_16x16(in, coeff);
1864 0 : break;
1865 : #if CONFIG_EXT_TX
1866 : case FLIPADST_DCT:
1867 0 : row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
1868 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
1869 0 : load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
1870 0 : fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1871 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1872 0 : transpose_16x16(out, in);
1873 0 : fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1874 0 : transpose_16x16(out, in);
1875 0 : write_buffer_16x16(in, coeff);
1876 0 : break;
1877 : case DCT_FLIPADST:
1878 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
1879 0 : col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
1880 0 : load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
1881 0 : fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1882 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1883 0 : transpose_16x16(out, in);
1884 0 : fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1885 0 : transpose_16x16(out, in);
1886 0 : write_buffer_16x16(in, coeff);
1887 0 : break;
1888 : case FLIPADST_FLIPADST:
1889 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
1890 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
1891 0 : load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]);
1892 0 : fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1893 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1894 0 : transpose_16x16(out, in);
1895 0 : fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1896 0 : transpose_16x16(out, in);
1897 0 : write_buffer_16x16(in, coeff);
1898 0 : break;
1899 : case ADST_FLIPADST:
1900 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
1901 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
1902 0 : load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
1903 0 : fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1904 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1905 0 : transpose_16x16(out, in);
1906 0 : fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1907 0 : transpose_16x16(out, in);
1908 0 : write_buffer_16x16(in, coeff);
1909 0 : break;
1910 : case FLIPADST_ADST:
1911 0 : row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
1912 0 : col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
1913 0 : load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
1914 0 : fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
1915 0 : col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
1916 0 : transpose_16x16(out, in);
1917 0 : fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
1918 0 : transpose_16x16(out, in);
1919 0 : write_buffer_16x16(in, coeff);
1920 0 : break;
1921 : #endif // CONFIG_EXT_TX
1922 0 : default: assert(0);
1923 : }
1924 : (void)bd;
1925 0 : }
|