Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 : #include <assert.h>
12 : #include <smmintrin.h> /* SSE4.1 */
13 :
14 : #include "./av1_rtcd.h"
15 : #include "./aom_config.h"
16 : #include "av1/common/av1_inv_txfm1d_cfg.h"
17 : #include "av1/common/x86/highbd_txfm_utility_sse4.h"
18 :
// Reads 16 consecutive 32-bit coefficients from coeff into in[0..3], four
// coefficients per vector. Uses aligned loads, so coeff must be 16-byte
// aligned.
static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  const __m128i *src = (const __m128i *)coeff;
  int k;

  for (k = 0; k < 4; ++k) {
    in[k] = _mm_load_si128(src + k);
  }
}
25 :
// In-place 4-point inverse DCT pass over a 4x4 block of 32-bit values held in
// in[0..3]. The block is transposed first, then the butterfly is applied to
// the four transposed vectors. Every cosine product pair is rounded with
// (1 << (bit - 1)) and arithmetically shifted right by bit. The cosine
// constants come from the table returned by cospi_arr(bit).
static void idct4x4_sse4_1(__m128i *in, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i c16 = _mm_set1_epi32(cospi[16]);
  const __m128i c16_neg = _mm_set1_epi32(-cospi[16]);
  const __m128i c32 = _mm_set1_epi32(cospi[32]);
  const __m128i c48 = _mm_set1_epi32(cospi[48]);
  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
  __m128i t0, t1, t2, t3;
  __m128i r0, r1, r2, r3;
  __m128i even0, even1, odd0, odd1;
  __m128i p, q;

  // 4x4 transpose of 32-bit lanes: interleave 32-bit, then 64-bit halves.
  t0 = _mm_unpacklo_epi32(in[0], in[1]);
  t1 = _mm_unpackhi_epi32(in[0], in[1]);
  t2 = _mm_unpacklo_epi32(in[2], in[3]);
  t3 = _mm_unpackhi_epi32(in[2], in[3]);

  r0 = _mm_unpacklo_epi64(t0, t2);
  r1 = _mm_unpackhi_epi64(t0, t2);
  r2 = _mm_unpacklo_epi64(t1, t3);
  r3 = _mm_unpackhi_epi64(t1, t3);

  // Even half: (r0 +/- r2) * cospi[32].
  p = _mm_mullo_epi32(r0, c32);
  q = _mm_mullo_epi32(r2, c32);
  even0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);
  even1 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p, q), half), bit);

  // Odd half: rotation of (r1, r3) by cospi[48] / cospi[16].
  p = _mm_mullo_epi32(r1, c48);
  q = _mm_mullo_epi32(r3, c16_neg);
  odd0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);

  p = _mm_mullo_epi32(r1, c16);
  q = _mm_mullo_epi32(r3, c48);
  odd1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);

  // Final butterfly.
  in[0] = _mm_add_epi32(even0, odd1);
  in[1] = _mm_add_epi32(even1, odd0);
  in[2] = _mm_sub_epi32(even1, odd0);
  in[3] = _mm_sub_epi32(even0, odd1);
}
73 :
// In-place 4-point inverse ADST pass over a 4x4 block of 32-bit values held
// in in[0..3]. The block is transposed first, then the staged butterfly is
// applied. Every cosine product pair is rounded with (1 << (bit - 1)) and
// arithmetically shifted right by bit. The cosine constants come from the
// table returned by cospi_arr(bit).
static void iadst4x4_sse4_1(__m128i *in, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i c8 = _mm_set1_epi32(cospi[8]);
  const __m128i c8_neg = _mm_set1_epi32(-cospi[8]);
  const __m128i c24 = _mm_set1_epi32(cospi[24]);
  const __m128i c32 = _mm_set1_epi32(cospi[32]);
  const __m128i c40 = _mm_set1_epi32(cospi[40]);
  const __m128i c40_neg = _mm_set1_epi32(-cospi[40]);
  const __m128i c56 = _mm_set1_epi32(cospi[56]);
  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i t0, t1, t2, t3;
  __m128i r0, r1, r2, r3;
  __m128i s2, s3;
  __m128i p, q;

  // 4x4 transpose of 32-bit lanes.
  t0 = _mm_unpacklo_epi32(in[0], in[1]);
  t1 = _mm_unpackhi_epi32(in[0], in[1]);
  t2 = _mm_unpacklo_epi32(in[2], in[3]);
  t3 = _mm_unpackhi_epi32(in[2], in[3]);

  r0 = _mm_unpacklo_epi64(t0, t2);
  r1 = _mm_unpackhi_epi64(t0, t2);
  r2 = _mm_unpacklo_epi64(t1, t3);
  r3 = _mm_unpackhi_epi64(t1, t3);

  // stage 1: negate the second and fourth transposed vectors.
  r1 = _mm_sub_epi32(zero, r1);
  r3 = _mm_sub_epi32(zero, r3);

  // stage 2: r0 and r3 pass through; (r1, r2) go through a cospi[32] pair.
  p = _mm_mullo_epi32(r1, c32);
  q = _mm_mullo_epi32(r2, c32);
  s2 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);
  s3 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p, q), half), bit);

  // stage 3: add/sub butterflies.
  t0 = _mm_add_epi32(r0, s2);
  t1 = _mm_add_epi32(r3, s3);
  t2 = _mm_sub_epi32(r0, s2);
  t3 = _mm_sub_epi32(r3, s3);

  // stage 4: final rotations, written straight to their output slots.
  p = _mm_mullo_epi32(t0, c8);
  q = _mm_mullo_epi32(t1, c56);
  in[3] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);

  p = _mm_mullo_epi32(t0, c56);
  q = _mm_mullo_epi32(t1, c8_neg);
  in[0] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);

  p = _mm_mullo_epi32(t2, c40);
  q = _mm_mullo_epi32(t3, c24);
  in[1] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);

  p = _mm_mullo_epi32(t2, c24);
  q = _mm_mullo_epi32(t3, c40_neg);
  in[2] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);
}
147 :
// Rounds and right-shifts the 16 signed 32-bit lanes held in in[0..3]:
// each lane becomes (lane + (1 << (shift - 1))) >> shift, using an arithmetic
// shift. The callers in this file pass a positive shift.
static INLINE void round_shift_4x4(__m128i *in, int shift) {
  const __m128i half = _mm_set1_epi32(1 << (shift - 1));
  int k;

  for (k = 0; k < 4; ++k) {
    in[k] = _mm_srai_epi32(_mm_add_epi32(in[k], half), shift);
  }
}
161 :
// Clamps eight 16-bit pixel values to the range [0, (1 << bd) - 1].
//
// The callers in this file pass the result of _mm_packus_epi32(), so each
// lane is an UNSIGNED 16-bit value in [0, 65535]. SSE2 only provides signed
// 16-bit compares, so a lane >= 0x8000 appears negative to _mm_cmpgt_epi16.
// Such a lane is a large positive overflow and must saturate to the maximum
// pixel value, not to zero. No lower clamp is needed because the unsigned
// pack has already saturated negative sums to 0.
//
// u:  eight unsigned 16-bit lanes.
// bd: bit depth; (1 << bd) - 1 must fit in a positive signed 16-bit value.
// Returns the clamped lanes.
static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  // Lanes above max in the signed view, plus lanes whose top bit is set
  // (unsigned value >= 0x8000), are all "too large".
  const __m128i too_big =
      _mm_or_si128(_mm_cmpgt_epi16(u, max), _mm_cmpgt_epi16(zero, u));

  // Select max where too_big, otherwise keep the original lane.
  return _mm_or_si128(_mm_and_si128(too_big, max),
                      _mm_andnot_si128(too_big, u));
}
177 :
178 0 : static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
179 : int fliplr, int flipud, int shift, int bd) {
180 0 : const __m128i zero = _mm_setzero_si128();
181 : __m128i u0, u1, u2, u3;
182 : __m128i v0, v1, v2, v3;
183 :
184 0 : round_shift_4x4(in, shift);
185 :
186 0 : v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
187 0 : v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
188 0 : v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
189 0 : v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
190 :
191 0 : v0 = _mm_unpacklo_epi16(v0, zero);
192 0 : v1 = _mm_unpacklo_epi16(v1, zero);
193 0 : v2 = _mm_unpacklo_epi16(v2, zero);
194 0 : v3 = _mm_unpacklo_epi16(v3, zero);
195 :
196 0 : if (fliplr) {
197 0 : in[0] = _mm_shuffle_epi32(in[0], 0x1B);
198 0 : in[1] = _mm_shuffle_epi32(in[1], 0x1B);
199 0 : in[2] = _mm_shuffle_epi32(in[2], 0x1B);
200 0 : in[3] = _mm_shuffle_epi32(in[3], 0x1B);
201 : }
202 :
203 0 : if (flipud) {
204 0 : u0 = _mm_add_epi32(in[3], v0);
205 0 : u1 = _mm_add_epi32(in[2], v1);
206 0 : u2 = _mm_add_epi32(in[1], v2);
207 0 : u3 = _mm_add_epi32(in[0], v3);
208 : } else {
209 0 : u0 = _mm_add_epi32(in[0], v0);
210 0 : u1 = _mm_add_epi32(in[1], v1);
211 0 : u2 = _mm_add_epi32(in[2], v2);
212 0 : u3 = _mm_add_epi32(in[3], v3);
213 : }
214 :
215 0 : v0 = _mm_packus_epi32(u0, u1);
216 0 : v2 = _mm_packus_epi32(u2, u3);
217 :
218 0 : u0 = highbd_clamp_epi16(v0, bd);
219 0 : u2 = highbd_clamp_epi16(v2, bd);
220 :
221 0 : v0 = _mm_unpacklo_epi64(u0, u0);
222 0 : v1 = _mm_unpackhi_epi64(u0, u0);
223 0 : v2 = _mm_unpacklo_epi64(u2, u2);
224 0 : v3 = _mm_unpackhi_epi64(u2, u2);
225 :
226 : _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
227 0 : _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
228 0 : _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
229 0 : _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
230 0 : }
231 :
232 0 : void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
233 : int stride, int tx_type, int bd) {
234 : __m128i in[4];
235 0 : const TXFM_1D_CFG *row_cfg = NULL;
236 0 : const TXFM_1D_CFG *col_cfg = NULL;
237 :
238 0 : switch (tx_type) {
239 : case DCT_DCT:
240 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_4;
241 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_4;
242 0 : load_buffer_4x4(coeff, in);
243 0 : idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
244 0 : idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
245 0 : write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
246 0 : break;
247 : case ADST_DCT:
248 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_4;
249 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_4;
250 0 : load_buffer_4x4(coeff, in);
251 0 : idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
252 0 : iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
253 0 : write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
254 0 : break;
255 : case DCT_ADST:
256 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_4;
257 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_4;
258 0 : load_buffer_4x4(coeff, in);
259 0 : iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
260 0 : idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
261 0 : write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
262 0 : break;
263 : case ADST_ADST:
264 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_4;
265 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_4;
266 0 : load_buffer_4x4(coeff, in);
267 0 : iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
268 0 : iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
269 0 : write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
270 0 : break;
271 : #if CONFIG_EXT_TX
272 : case FLIPADST_DCT:
273 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_4;
274 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_4;
275 0 : load_buffer_4x4(coeff, in);
276 0 : idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
277 0 : iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
278 0 : write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
279 0 : break;
280 : case DCT_FLIPADST:
281 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_4;
282 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_4;
283 0 : load_buffer_4x4(coeff, in);
284 0 : iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
285 0 : idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
286 0 : write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
287 0 : break;
288 : case FLIPADST_FLIPADST:
289 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_4;
290 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_4;
291 0 : load_buffer_4x4(coeff, in);
292 0 : iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
293 0 : iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
294 0 : write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
295 0 : break;
296 : case ADST_FLIPADST:
297 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_4;
298 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_4;
299 0 : load_buffer_4x4(coeff, in);
300 0 : iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
301 0 : iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
302 0 : write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
303 0 : break;
304 : case FLIPADST_ADST:
305 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_4;
306 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_4;
307 0 : load_buffer_4x4(coeff, in);
308 0 : iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
309 0 : iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
310 0 : write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
311 0 : break;
312 : #endif // CONFIG_EXT_TX
313 0 : default: assert(0);
314 : }
315 0 : }
316 :
317 : // 8x8
// Reads 64 consecutive 32-bit coefficients from coeff into in[0..15], four
// coefficients per vector. Uses aligned loads, so coeff must be 16-byte
// aligned.
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  const __m128i *src = (const __m128i *)coeff;
  int k;

  for (k = 0; k < 16; ++k) {
    in[k] = _mm_load_si128(src + k);
  }
}
336 :
// 8-point inverse DCT pass over an 8x8 block of 32-bit values.
//
// Layout: vector 2 * k + h holds four lanes of input k, with h = 0 and
// h = 1 covering the two 4-lane halves. Each half is processed
// independently, so one call runs eight 8-point transforms.
//
// in:  16 input vectors (read only).
// out: 16 output vectors.
// bit: precision of the cosine table returned by cospi_arr(bit); every
//      product pair is rounded with (1 << (bit - 1)) and shifted right by bit.
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i c8 = _mm_set1_epi32(cospi[8]);
  const __m128i c8_neg = _mm_set1_epi32(-cospi[8]);
  const __m128i c16 = _mm_set1_epi32(cospi[16]);
  const __m128i c16_neg = _mm_set1_epi32(-cospi[16]);
  const __m128i c24 = _mm_set1_epi32(cospi[24]);
  const __m128i c32 = _mm_set1_epi32(cospi[32]);
  const __m128i c40 = _mm_set1_epi32(cospi[40]);
  const __m128i c40_neg = _mm_set1_epi32(-cospi[40]);
  const __m128i c48 = _mm_set1_epi32(cospi[48]);
  const __m128i c56 = _mm_set1_epi32(cospi[56]);
  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
  __m128i o4, o5, o6, o7;  // odd-input rotations (stage 2)
  __m128i e0, e1, e2, e3;  // even-input rotations (stage 3)
  __m128i p4, p5, p6, p7;  // odd butterflies (stage 3)
  __m128i q0, q1, q2, q3, q5, q6;  // stage 4
  __m128i a, b;
  int h;

  for (h = 0; h < 2; ++h) {
    // stage 2: rotate the odd inputs (1, 7) and (5, 3).
    a = _mm_mullo_epi32(in[2 * 1 + h], c56);
    b = _mm_mullo_epi32(in[2 * 7 + h], c8_neg);
    o4 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(in[2 * 1 + h], c8);
    b = _mm_mullo_epi32(in[2 * 7 + h], c56);
    o7 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(in[2 * 5 + h], c24);
    b = _mm_mullo_epi32(in[2 * 3 + h], c40_neg);
    o5 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(in[2 * 5 + h], c40);
    b = _mm_mullo_epi32(in[2 * 3 + h], c24);
    o6 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    // stage 3: rotate the even inputs (0, 4) and (2, 6).
    a = _mm_mullo_epi32(in[2 * 0 + h], c32);
    b = _mm_mullo_epi32(in[2 * 4 + h], c32);
    e0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);
    e1 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(in[2 * 2 + h], c48);
    b = _mm_mullo_epi32(in[2 * 6 + h], c16_neg);
    e2 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(in[2 * 2 + h], c16);
    b = _mm_mullo_epi32(in[2 * 6 + h], c48);
    e3 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    p4 = _mm_add_epi32(o4, o5);
    p5 = _mm_sub_epi32(o4, o5);
    p6 = _mm_sub_epi32(o7, o6);
    p7 = _mm_add_epi32(o6, o7);

    // stage 4: even butterflies; p4 and p7 pass through unchanged.
    q0 = _mm_add_epi32(e0, e3);
    q1 = _mm_add_epi32(e1, e2);
    q2 = _mm_sub_epi32(e1, e2);
    q3 = _mm_sub_epi32(e0, e3);

    a = _mm_mullo_epi32(p5, c32);
    b = _mm_mullo_epi32(p6, c32);
    q6 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(b, a), half), bit);
    q5 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(b, a), half), bit);

    // stage 5: final butterflies.
    out[2 * 0 + h] = _mm_add_epi32(q0, p7);
    out[2 * 1 + h] = _mm_add_epi32(q1, q6);
    out[2 * 2 + h] = _mm_add_epi32(q2, q5);
    out[2 * 3 + h] = _mm_add_epi32(q3, p4);
    out[2 * 4 + h] = _mm_sub_epi32(q3, p4);
    out[2 * 5 + h] = _mm_sub_epi32(q2, q5);
    out[2 * 6 + h] = _mm_sub_epi32(q1, q6);
    out[2 * 7 + h] = _mm_sub_epi32(q0, p7);
  }
}
450 :
// 8-point inverse ADST pass over an 8x8 block of 32-bit values.
//
// Layout: vector 2 * k + h holds four lanes of input k, with h = 0 and
// h = 1 covering the two 4-lane halves. Each half is processed
// independently, so one call runs eight 8-point transforms.
//
// in:  16 input vectors (read only).
// out: 16 output vectors.
// bit: precision of the cosine table returned by cospi_arr(bit); every
//      product pair is rounded with (1 << (bit - 1)) and shifted right by bit.
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i c4 = _mm_set1_epi32(cospi[4]);
  const __m128i c4_neg = _mm_set1_epi32(-cospi[4]);
  const __m128i c12 = _mm_set1_epi32(cospi[12]);
  const __m128i c16 = _mm_set1_epi32(cospi[16]);
  const __m128i c16_neg = _mm_set1_epi32(-cospi[16]);
  const __m128i c20 = _mm_set1_epi32(cospi[20]);
  const __m128i c20_neg = _mm_set1_epi32(-cospi[20]);
  const __m128i c28 = _mm_set1_epi32(cospi[28]);
  const __m128i c32 = _mm_set1_epi32(cospi[32]);
  const __m128i c36 = _mm_set1_epi32(cospi[36]);
  const __m128i c36_neg = _mm_set1_epi32(-cospi[36]);
  const __m128i c44 = _mm_set1_epi32(cospi[44]);
  const __m128i c48 = _mm_set1_epi32(cospi[48]);
  const __m128i c48_neg = _mm_set1_epi32(-cospi[48]);
  const __m128i c52 = _mm_set1_epi32(cospi[52]);
  const __m128i c52_neg = _mm_set1_epi32(-cospi[52]);
  const __m128i c60 = _mm_set1_epi32(cospi[60]);
  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  __m128i a, b;
  int h;

  for (h = 0; h < 2; ++h) {
    // stage 1: permute the inputs and negate inputs 7, 3, 1 and 5.
    s0 = in[2 * 0 + h];
    s1 = _mm_sub_epi32(zero, in[2 * 7 + h]);
    s2 = _mm_sub_epi32(zero, in[2 * 3 + h]);
    s3 = in[2 * 4 + h];
    s4 = _mm_sub_epi32(zero, in[2 * 1 + h]);
    s5 = in[2 * 6 + h];
    s6 = in[2 * 2 + h];
    s7 = _mm_sub_epi32(zero, in[2 * 5 + h]);

    // stage 2: cospi[32] pairs on (s2, s3) and (s6, s7); the rest pass through.
    a = _mm_mullo_epi32(s2, c32);
    b = _mm_mullo_epi32(s3, c32);
    t2 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);
    t3 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s6, c32);
    b = _mm_mullo_epi32(s7, c32);
    t6 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);
    t7 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(a, b), half), bit);

    t0 = s0;
    t1 = s1;
    t4 = s4;
    t5 = s5;

    // stage 3: add/sub butterflies within each group of four.
    s0 = _mm_add_epi32(t0, t2);
    s1 = _mm_add_epi32(t1, t3);
    s2 = _mm_sub_epi32(t0, t2);
    s3 = _mm_sub_epi32(t1, t3);
    s4 = _mm_add_epi32(t4, t6);
    s5 = _mm_add_epi32(t5, t7);
    s6 = _mm_sub_epi32(t4, t6);
    s7 = _mm_sub_epi32(t5, t7);

    // stage 4: rotate the upper four by cospi[16] / cospi[48]; s0..s3 pass
    // through.
    a = _mm_mullo_epi32(s4, c16);
    b = _mm_mullo_epi32(s5, c48);
    t4 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s4, c48);
    b = _mm_mullo_epi32(s5, c16_neg);
    t5 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s6, c48_neg);
    b = _mm_mullo_epi32(s7, c16);
    t6 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s6, c16);
    b = _mm_mullo_epi32(s7, c48);
    t7 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    t0 = s0;
    t1 = s1;
    t2 = s2;
    t3 = s3;

    // stage 5: add/sub butterflies between the lower and upper four.
    s0 = _mm_add_epi32(t0, t4);
    s1 = _mm_add_epi32(t1, t5);
    s2 = _mm_add_epi32(t2, t6);
    s3 = _mm_add_epi32(t3, t7);
    s4 = _mm_sub_epi32(t0, t4);
    s5 = _mm_sub_epi32(t1, t5);
    s6 = _mm_sub_epi32(t2, t6);
    s7 = _mm_sub_epi32(t3, t7);

    // stage 6: final rotations on the pairs (0,1), (2,3), (4,5), (6,7).
    a = _mm_mullo_epi32(s0, c4);
    b = _mm_mullo_epi32(s1, c60);
    t0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s0, c60);
    b = _mm_mullo_epi32(s1, c4_neg);
    t1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s2, c20);
    b = _mm_mullo_epi32(s3, c44);
    t2 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s2, c44);
    b = _mm_mullo_epi32(s3, c20_neg);
    t3 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s4, c36);
    b = _mm_mullo_epi32(s5, c28);
    t4 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s4, c28);
    b = _mm_mullo_epi32(s5, c36_neg);
    t5 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s6, c52);
    b = _mm_mullo_epi32(s7, c12);
    t6 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    a = _mm_mullo_epi32(s6, c12);
    b = _mm_mullo_epi32(s7, c52_neg);
    t7 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(a, b), half), bit);

    // stage 7: output permutation.
    out[2 * 0 + h] = t1;
    out[2 * 1 + h] = t6;
    out[2 * 2 + h] = t3;
    out[2 * 3 + h] = t4;
    out[2 * 4 + h] = t5;
    out[2 * 5 + h] = t2;
    out[2 * 6 + h] = t7;
    out[2 * 7 + h] = t0;
  }
}
631 :
// Applies round_shift_4x4() to all 16 vectors of an 8x8 block, four vectors
// at a time.
static void round_shift_8x8(__m128i *in, int shift) {
  int group;

  for (group = 0; group < 16; group += 4) {
    round_shift_4x4(in + group, shift);
  }
}
638 :
639 0 : static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
640 : int fliplr, int bd) {
641 : __m128i x0, x1;
642 0 : const __m128i zero = _mm_setzero_si128();
643 :
644 0 : x0 = _mm_unpacklo_epi16(pred, zero);
645 0 : x1 = _mm_unpackhi_epi16(pred, zero);
646 :
647 0 : if (fliplr) {
648 0 : res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
649 0 : res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
650 0 : x0 = _mm_add_epi32(res_hi, x0);
651 0 : x1 = _mm_add_epi32(res_lo, x1);
652 :
653 : } else {
654 0 : x0 = _mm_add_epi32(res_lo, x0);
655 0 : x1 = _mm_add_epi32(res_hi, x1);
656 : }
657 :
658 0 : x0 = _mm_packus_epi32(x0, x1);
659 0 : return highbd_clamp_epi16(x0, bd);
660 : }
661 :
662 0 : static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
663 : int fliplr, int flipud, int shift, int bd) {
664 : __m128i u0, u1, u2, u3, u4, u5, u6, u7;
665 : __m128i v0, v1, v2, v3, v4, v5, v6, v7;
666 :
667 0 : round_shift_8x8(in, shift);
668 :
669 0 : v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
670 0 : v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
671 0 : v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
672 0 : v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
673 0 : v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
674 0 : v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
675 0 : v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
676 0 : v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
677 :
678 0 : if (flipud) {
679 0 : u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
680 0 : u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
681 0 : u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
682 0 : u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
683 0 : u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
684 0 : u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
685 0 : u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
686 0 : u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
687 : } else {
688 0 : u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
689 0 : u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
690 0 : u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
691 0 : u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
692 0 : u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
693 0 : u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
694 0 : u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
695 0 : u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
696 : }
697 :
698 : _mm_store_si128((__m128i *)(output + 0 * stride), u0);
699 0 : _mm_store_si128((__m128i *)(output + 1 * stride), u1);
700 0 : _mm_store_si128((__m128i *)(output + 2 * stride), u2);
701 0 : _mm_store_si128((__m128i *)(output + 3 * stride), u3);
702 0 : _mm_store_si128((__m128i *)(output + 4 * stride), u4);
703 0 : _mm_store_si128((__m128i *)(output + 5 * stride), u5);
704 0 : _mm_store_si128((__m128i *)(output + 6 * stride), u6);
705 0 : _mm_store_si128((__m128i *)(output + 7 * stride), u7);
706 0 : }
707 :
708 0 : void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
709 : int stride, int tx_type, int bd) {
710 : __m128i in[16], out[16];
711 0 : const TXFM_1D_CFG *row_cfg = NULL;
712 0 : const TXFM_1D_CFG *col_cfg = NULL;
713 :
714 0 : switch (tx_type) {
715 : case DCT_DCT:
716 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_8;
717 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_8;
718 0 : load_buffer_8x8(coeff, in);
719 0 : transpose_8x8(in, out);
720 0 : idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
721 0 : transpose_8x8(in, out);
722 0 : idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
723 0 : write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
724 0 : break;
725 : case DCT_ADST:
726 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_8;
727 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_8;
728 0 : load_buffer_8x8(coeff, in);
729 0 : transpose_8x8(in, out);
730 0 : iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
731 0 : transpose_8x8(in, out);
732 0 : idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
733 0 : write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
734 0 : break;
735 : case ADST_DCT:
736 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_8;
737 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_8;
738 0 : load_buffer_8x8(coeff, in);
739 0 : transpose_8x8(in, out);
740 0 : idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
741 0 : transpose_8x8(in, out);
742 0 : iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
743 0 : write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
744 0 : break;
745 : case ADST_ADST:
746 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_8;
747 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_8;
748 0 : load_buffer_8x8(coeff, in);
749 0 : transpose_8x8(in, out);
750 0 : iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
751 0 : transpose_8x8(in, out);
752 0 : iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
753 0 : write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
754 0 : break;
755 : #if CONFIG_EXT_TX
756 : case FLIPADST_DCT:
757 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_8;
758 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_8;
759 0 : load_buffer_8x8(coeff, in);
760 0 : transpose_8x8(in, out);
761 0 : idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
762 0 : transpose_8x8(in, out);
763 0 : iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
764 0 : write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
765 0 : break;
766 : case DCT_FLIPADST:
767 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_8;
768 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_8;
769 0 : load_buffer_8x8(coeff, in);
770 0 : transpose_8x8(in, out);
771 0 : iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
772 0 : transpose_8x8(in, out);
773 0 : idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
774 0 : write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
775 0 : break;
776 : case ADST_FLIPADST:
777 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_8;
778 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_8;
779 0 : load_buffer_8x8(coeff, in);
780 0 : transpose_8x8(in, out);
781 0 : iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
782 0 : transpose_8x8(in, out);
783 0 : iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
784 0 : write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
785 0 : break;
786 : case FLIPADST_FLIPADST:
787 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_8;
788 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_8;
789 0 : load_buffer_8x8(coeff, in);
790 0 : transpose_8x8(in, out);
791 0 : iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
792 0 : transpose_8x8(in, out);
793 0 : iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
794 0 : write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
795 0 : break;
796 : case FLIPADST_ADST:
797 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_8;
798 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_8;
799 0 : load_buffer_8x8(coeff, in);
800 0 : transpose_8x8(in, out);
801 0 : iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
802 0 : transpose_8x8(in, out);
803 0 : iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
804 0 : write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
805 0 : break;
806 : #endif // CONFIG_EXT_TX
807 0 : default: assert(0);
808 : }
809 0 : }
810 :
811 : // 16x16
// Reads 256 consecutive 32-bit coefficients from coeff into in[0..63], four
// coefficients per vector. Uses aligned loads, so coeff must be 16-byte
// aligned.
static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
  const __m128i *src = (const __m128i *)coeff;
  __m128i *dst = in;
  __m128i *const end = in + 64;

  while (dst != end) {
    *dst++ = _mm_load_si128(src++);
  }
}
818 :
// Copies one 8x8 quadrant out of a 16x16 block stored as four vectors per
// row. For each of eight rows, the two vectors at in[col + 4 * row] and
// in[col + 4 * row + 1] are copied to in8x8[2 * row] and in8x8[2 * row + 1].
// col is the index of the quadrant's first vector in `in`.
static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
                                        int col) {
  int row;

  for (row = 0; row < 8; ++row) {
    const __m128i *src = in + col + 4 * row;
    in8x8[2 * row] = src[0];
    in8x8[2 * row + 1] = src[1];
  }
}
828 :
// Exchanges the two pointers that output1 and output2 refer to.
static void swap_addr(uint16_t **output1, uint16_t **output2) {
  uint16_t *const saved = *output1;

  *output1 = *output2;
  *output2 = saved;
}
835 :
// Adds a 16x16 block of residuals to the pixels at output by handling it as
// four 8x8 quadrants, each written with write_buffer_8x8().
//
// in:     64 residual vectors, four per row.
// fliplr: non-zero swaps the left and right destination quadrants; the
//         mirroring inside each quadrant is done by write_buffer_8x8().
// flipud: non-zero swaps the top and bottom destination quadrants likewise.
static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
                               int fliplr, int flipud, int shift, int bd) {
  // Index of the first vector of each source quadrant in `in`, in the order
  // top-left, top-right, bottom-left, bottom-right.
  static const int first_vec[4] = { 0, 2, 32, 34 };
  __m128i quad[16];
  uint16_t *dst[4];
  int q;

  // Destination of each quadrant before any flipping, same order as above.
  dst[0] = output;
  dst[1] = output + 8;
  dst[2] = output + 8 * stride;
  dst[3] = output + 8 * stride + 8;

  if (fliplr) {
    swap_addr(&dst[0], &dst[1]);
    swap_addr(&dst[2], &dst[3]);
  }

  if (flipud) {
    swap_addr(&dst[0], &dst[2]);
    swap_addr(&dst[1], &dst[3]);
  }

  for (q = 0; q < 4; ++q) {
    assign_8x8_input_from_16x16(in, quad, first_vec[q]);
    write_buffer_8x8(quad, dst[q], stride, fliplr, flipud, shift, bd);
  }
}
870 :
871 0 : static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
872 0 : const int32_t *cospi = cospi_arr(bit);
873 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
874 0 : const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
875 0 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
876 0 : const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
877 0 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
878 0 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
879 0 : const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
880 0 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
881 0 : const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
882 0 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
883 0 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
884 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
885 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
886 0 : const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
887 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
888 0 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
889 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
890 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
891 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
892 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
893 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
894 0 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
895 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
896 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
897 : __m128i u[16], v[16], x, y;
898 : int col;
899 :
900 0 : for (col = 0; col < 4; ++col) {
901 : // stage 0
902 : // stage 1
903 0 : u[0] = in[0 * 4 + col];
904 0 : u[1] = in[8 * 4 + col];
905 0 : u[2] = in[4 * 4 + col];
906 0 : u[3] = in[12 * 4 + col];
907 0 : u[4] = in[2 * 4 + col];
908 0 : u[5] = in[10 * 4 + col];
909 0 : u[6] = in[6 * 4 + col];
910 0 : u[7] = in[14 * 4 + col];
911 0 : u[8] = in[1 * 4 + col];
912 0 : u[9] = in[9 * 4 + col];
913 0 : u[10] = in[5 * 4 + col];
914 0 : u[11] = in[13 * 4 + col];
915 0 : u[12] = in[3 * 4 + col];
916 0 : u[13] = in[11 * 4 + col];
917 0 : u[14] = in[7 * 4 + col];
918 0 : u[15] = in[15 * 4 + col];
919 :
920 : // stage 2
921 0 : v[0] = u[0];
922 0 : v[1] = u[1];
923 0 : v[2] = u[2];
924 0 : v[3] = u[3];
925 0 : v[4] = u[4];
926 0 : v[5] = u[5];
927 0 : v[6] = u[6];
928 0 : v[7] = u[7];
929 :
930 0 : v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
931 0 : v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
932 0 : v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
933 0 : v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
934 0 : v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
935 0 : v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
936 0 : v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
937 0 : v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
938 :
939 : // stage 3
940 0 : u[0] = v[0];
941 0 : u[1] = v[1];
942 0 : u[2] = v[2];
943 0 : u[3] = v[3];
944 0 : u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
945 0 : u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
946 0 : u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
947 0 : u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
948 0 : u[8] = _mm_add_epi32(v[8], v[9]);
949 0 : u[9] = _mm_sub_epi32(v[8], v[9]);
950 0 : u[10] = _mm_sub_epi32(v[11], v[10]);
951 0 : u[11] = _mm_add_epi32(v[10], v[11]);
952 0 : u[12] = _mm_add_epi32(v[12], v[13]);
953 0 : u[13] = _mm_sub_epi32(v[12], v[13]);
954 0 : u[14] = _mm_sub_epi32(v[15], v[14]);
955 0 : u[15] = _mm_add_epi32(v[14], v[15]);
956 :
957 : // stage 4
958 0 : x = _mm_mullo_epi32(u[0], cospi32);
959 0 : y = _mm_mullo_epi32(u[1], cospi32);
960 0 : v[0] = _mm_add_epi32(x, y);
961 0 : v[0] = _mm_add_epi32(v[0], rnding);
962 0 : v[0] = _mm_srai_epi32(v[0], bit);
963 :
964 0 : v[1] = _mm_sub_epi32(x, y);
965 0 : v[1] = _mm_add_epi32(v[1], rnding);
966 0 : v[1] = _mm_srai_epi32(v[1], bit);
967 :
968 0 : v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
969 0 : v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
970 0 : v[4] = _mm_add_epi32(u[4], u[5]);
971 0 : v[5] = _mm_sub_epi32(u[4], u[5]);
972 0 : v[6] = _mm_sub_epi32(u[7], u[6]);
973 0 : v[7] = _mm_add_epi32(u[6], u[7]);
974 0 : v[8] = u[8];
975 0 : v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
976 0 : v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
977 0 : v[11] = u[11];
978 0 : v[12] = u[12];
979 0 : v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
980 0 : v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
981 0 : v[15] = u[15];
982 :
983 : // stage 5
984 0 : u[0] = _mm_add_epi32(v[0], v[3]);
985 0 : u[1] = _mm_add_epi32(v[1], v[2]);
986 0 : u[2] = _mm_sub_epi32(v[1], v[2]);
987 0 : u[3] = _mm_sub_epi32(v[0], v[3]);
988 0 : u[4] = v[4];
989 :
990 0 : x = _mm_mullo_epi32(v[5], cospi32);
991 0 : y = _mm_mullo_epi32(v[6], cospi32);
992 0 : u[5] = _mm_sub_epi32(y, x);
993 0 : u[5] = _mm_add_epi32(u[5], rnding);
994 0 : u[5] = _mm_srai_epi32(u[5], bit);
995 :
996 0 : u[6] = _mm_add_epi32(y, x);
997 0 : u[6] = _mm_add_epi32(u[6], rnding);
998 0 : u[6] = _mm_srai_epi32(u[6], bit);
999 :
1000 0 : u[7] = v[7];
1001 0 : u[8] = _mm_add_epi32(v[8], v[11]);
1002 0 : u[9] = _mm_add_epi32(v[9], v[10]);
1003 0 : u[10] = _mm_sub_epi32(v[9], v[10]);
1004 0 : u[11] = _mm_sub_epi32(v[8], v[11]);
1005 0 : u[12] = _mm_sub_epi32(v[15], v[12]);
1006 0 : u[13] = _mm_sub_epi32(v[14], v[13]);
1007 0 : u[14] = _mm_add_epi32(v[13], v[14]);
1008 0 : u[15] = _mm_add_epi32(v[12], v[15]);
1009 :
1010 : // stage 6
1011 0 : v[0] = _mm_add_epi32(u[0], u[7]);
1012 0 : v[1] = _mm_add_epi32(u[1], u[6]);
1013 0 : v[2] = _mm_add_epi32(u[2], u[5]);
1014 0 : v[3] = _mm_add_epi32(u[3], u[4]);
1015 0 : v[4] = _mm_sub_epi32(u[3], u[4]);
1016 0 : v[5] = _mm_sub_epi32(u[2], u[5]);
1017 0 : v[6] = _mm_sub_epi32(u[1], u[6]);
1018 0 : v[7] = _mm_sub_epi32(u[0], u[7]);
1019 0 : v[8] = u[8];
1020 0 : v[9] = u[9];
1021 :
1022 0 : x = _mm_mullo_epi32(u[10], cospi32);
1023 0 : y = _mm_mullo_epi32(u[13], cospi32);
1024 0 : v[10] = _mm_sub_epi32(y, x);
1025 0 : v[10] = _mm_add_epi32(v[10], rnding);
1026 0 : v[10] = _mm_srai_epi32(v[10], bit);
1027 :
1028 0 : v[13] = _mm_add_epi32(x, y);
1029 0 : v[13] = _mm_add_epi32(v[13], rnding);
1030 0 : v[13] = _mm_srai_epi32(v[13], bit);
1031 :
1032 0 : x = _mm_mullo_epi32(u[11], cospi32);
1033 0 : y = _mm_mullo_epi32(u[12], cospi32);
1034 0 : v[11] = _mm_sub_epi32(y, x);
1035 0 : v[11] = _mm_add_epi32(v[11], rnding);
1036 0 : v[11] = _mm_srai_epi32(v[11], bit);
1037 :
1038 0 : v[12] = _mm_add_epi32(x, y);
1039 0 : v[12] = _mm_add_epi32(v[12], rnding);
1040 0 : v[12] = _mm_srai_epi32(v[12], bit);
1041 :
1042 0 : v[14] = u[14];
1043 0 : v[15] = u[15];
1044 :
1045 : // stage 7
1046 0 : out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
1047 0 : out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
1048 0 : out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
1049 0 : out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
1050 0 : out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
1051 0 : out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
1052 0 : out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
1053 0 : out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
1054 0 : out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
1055 0 : out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
1056 0 : out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
1057 0 : out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
1058 0 : out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
1059 0 : out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
1060 0 : out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
1061 0 : out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
1062 : }
1063 0 : }
1064 :
1065 0 : static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
1066 0 : const int32_t *cospi = cospi_arr(bit);
1067 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1068 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1069 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1070 0 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1071 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1072 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1073 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1074 0 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
1075 0 : const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1076 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1077 0 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1078 0 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1079 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1080 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1081 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1082 0 : const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
1083 0 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1084 0 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1085 0 : const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
1086 0 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1087 0 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1088 0 : const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
1089 0 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1090 0 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1091 0 : const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
1092 0 : const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
1093 0 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1094 0 : const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
1095 0 : const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
1096 0 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1097 0 : const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
1098 0 : const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
1099 0 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1100 0 : const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
1101 0 : const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
1102 0 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1103 0 : const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
1104 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1105 0 : const __m128i zero = _mm_setzero_si128();
1106 :
1107 : __m128i u[16], v[16], x, y;
1108 : int col;
1109 :
1110 0 : for (col = 0; col < 4; ++col) {
1111 : // stage 0
1112 : // stage 1
1113 0 : u[0] = in[0 * 4 + col];
1114 0 : u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
1115 0 : u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
1116 0 : u[3] = in[8 * 4 + col];
1117 0 : u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
1118 0 : u[5] = in[12 * 4 + col];
1119 0 : u[6] = in[4 * 4 + col];
1120 0 : u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
1121 0 : u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
1122 0 : u[9] = in[14 * 4 + col];
1123 0 : u[10] = in[6 * 4 + col];
1124 0 : u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
1125 0 : u[12] = in[2 * 4 + col];
1126 0 : u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
1127 0 : u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
1128 0 : u[15] = in[10 * 4 + col];
1129 :
1130 : // stage 2
1131 0 : v[0] = u[0];
1132 0 : v[1] = u[1];
1133 :
1134 0 : x = _mm_mullo_epi32(u[2], cospi32);
1135 0 : y = _mm_mullo_epi32(u[3], cospi32);
1136 0 : v[2] = _mm_add_epi32(x, y);
1137 0 : v[2] = _mm_add_epi32(v[2], rnding);
1138 0 : v[2] = _mm_srai_epi32(v[2], bit);
1139 :
1140 0 : v[3] = _mm_sub_epi32(x, y);
1141 0 : v[3] = _mm_add_epi32(v[3], rnding);
1142 0 : v[3] = _mm_srai_epi32(v[3], bit);
1143 :
1144 0 : v[4] = u[4];
1145 0 : v[5] = u[5];
1146 :
1147 0 : x = _mm_mullo_epi32(u[6], cospi32);
1148 0 : y = _mm_mullo_epi32(u[7], cospi32);
1149 0 : v[6] = _mm_add_epi32(x, y);
1150 0 : v[6] = _mm_add_epi32(v[6], rnding);
1151 0 : v[6] = _mm_srai_epi32(v[6], bit);
1152 :
1153 0 : v[7] = _mm_sub_epi32(x, y);
1154 0 : v[7] = _mm_add_epi32(v[7], rnding);
1155 0 : v[7] = _mm_srai_epi32(v[7], bit);
1156 :
1157 0 : v[8] = u[8];
1158 0 : v[9] = u[9];
1159 :
1160 0 : x = _mm_mullo_epi32(u[10], cospi32);
1161 0 : y = _mm_mullo_epi32(u[11], cospi32);
1162 0 : v[10] = _mm_add_epi32(x, y);
1163 0 : v[10] = _mm_add_epi32(v[10], rnding);
1164 0 : v[10] = _mm_srai_epi32(v[10], bit);
1165 :
1166 0 : v[11] = _mm_sub_epi32(x, y);
1167 0 : v[11] = _mm_add_epi32(v[11], rnding);
1168 0 : v[11] = _mm_srai_epi32(v[11], bit);
1169 :
1170 0 : v[12] = u[12];
1171 0 : v[13] = u[13];
1172 :
1173 0 : x = _mm_mullo_epi32(u[14], cospi32);
1174 0 : y = _mm_mullo_epi32(u[15], cospi32);
1175 0 : v[14] = _mm_add_epi32(x, y);
1176 0 : v[14] = _mm_add_epi32(v[14], rnding);
1177 0 : v[14] = _mm_srai_epi32(v[14], bit);
1178 :
1179 0 : v[15] = _mm_sub_epi32(x, y);
1180 0 : v[15] = _mm_add_epi32(v[15], rnding);
1181 0 : v[15] = _mm_srai_epi32(v[15], bit);
1182 :
1183 : // stage 3
1184 0 : u[0] = _mm_add_epi32(v[0], v[2]);
1185 0 : u[1] = _mm_add_epi32(v[1], v[3]);
1186 0 : u[2] = _mm_sub_epi32(v[0], v[2]);
1187 0 : u[3] = _mm_sub_epi32(v[1], v[3]);
1188 0 : u[4] = _mm_add_epi32(v[4], v[6]);
1189 0 : u[5] = _mm_add_epi32(v[5], v[7]);
1190 0 : u[6] = _mm_sub_epi32(v[4], v[6]);
1191 0 : u[7] = _mm_sub_epi32(v[5], v[7]);
1192 0 : u[8] = _mm_add_epi32(v[8], v[10]);
1193 0 : u[9] = _mm_add_epi32(v[9], v[11]);
1194 0 : u[10] = _mm_sub_epi32(v[8], v[10]);
1195 0 : u[11] = _mm_sub_epi32(v[9], v[11]);
1196 0 : u[12] = _mm_add_epi32(v[12], v[14]);
1197 0 : u[13] = _mm_add_epi32(v[13], v[15]);
1198 0 : u[14] = _mm_sub_epi32(v[12], v[14]);
1199 0 : u[15] = _mm_sub_epi32(v[13], v[15]);
1200 :
1201 : // stage 4
1202 0 : v[0] = u[0];
1203 0 : v[1] = u[1];
1204 0 : v[2] = u[2];
1205 0 : v[3] = u[3];
1206 0 : v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
1207 0 : v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
1208 0 : v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
1209 0 : v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
1210 0 : v[8] = u[8];
1211 0 : v[9] = u[9];
1212 0 : v[10] = u[10];
1213 0 : v[11] = u[11];
1214 0 : v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
1215 0 : v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
1216 0 : v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
1217 0 : v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
1218 :
1219 : // stage 5
1220 0 : u[0] = _mm_add_epi32(v[0], v[4]);
1221 0 : u[1] = _mm_add_epi32(v[1], v[5]);
1222 0 : u[2] = _mm_add_epi32(v[2], v[6]);
1223 0 : u[3] = _mm_add_epi32(v[3], v[7]);
1224 0 : u[4] = _mm_sub_epi32(v[0], v[4]);
1225 0 : u[5] = _mm_sub_epi32(v[1], v[5]);
1226 0 : u[6] = _mm_sub_epi32(v[2], v[6]);
1227 0 : u[7] = _mm_sub_epi32(v[3], v[7]);
1228 0 : u[8] = _mm_add_epi32(v[8], v[12]);
1229 0 : u[9] = _mm_add_epi32(v[9], v[13]);
1230 0 : u[10] = _mm_add_epi32(v[10], v[14]);
1231 0 : u[11] = _mm_add_epi32(v[11], v[15]);
1232 0 : u[12] = _mm_sub_epi32(v[8], v[12]);
1233 0 : u[13] = _mm_sub_epi32(v[9], v[13]);
1234 0 : u[14] = _mm_sub_epi32(v[10], v[14]);
1235 0 : u[15] = _mm_sub_epi32(v[11], v[15]);
1236 :
1237 : // stage 6
1238 0 : v[0] = u[0];
1239 0 : v[1] = u[1];
1240 0 : v[2] = u[2];
1241 0 : v[3] = u[3];
1242 0 : v[4] = u[4];
1243 0 : v[5] = u[5];
1244 0 : v[6] = u[6];
1245 0 : v[7] = u[7];
1246 0 : v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
1247 0 : v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
1248 0 : v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
1249 0 : v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
1250 0 : v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
1251 0 : v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
1252 0 : v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
1253 0 : v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
1254 :
1255 : // stage 7
1256 0 : u[0] = _mm_add_epi32(v[0], v[8]);
1257 0 : u[1] = _mm_add_epi32(v[1], v[9]);
1258 0 : u[2] = _mm_add_epi32(v[2], v[10]);
1259 0 : u[3] = _mm_add_epi32(v[3], v[11]);
1260 0 : u[4] = _mm_add_epi32(v[4], v[12]);
1261 0 : u[5] = _mm_add_epi32(v[5], v[13]);
1262 0 : u[6] = _mm_add_epi32(v[6], v[14]);
1263 0 : u[7] = _mm_add_epi32(v[7], v[15]);
1264 0 : u[8] = _mm_sub_epi32(v[0], v[8]);
1265 0 : u[9] = _mm_sub_epi32(v[1], v[9]);
1266 0 : u[10] = _mm_sub_epi32(v[2], v[10]);
1267 0 : u[11] = _mm_sub_epi32(v[3], v[11]);
1268 0 : u[12] = _mm_sub_epi32(v[4], v[12]);
1269 0 : u[13] = _mm_sub_epi32(v[5], v[13]);
1270 0 : u[14] = _mm_sub_epi32(v[6], v[14]);
1271 0 : u[15] = _mm_sub_epi32(v[7], v[15]);
1272 :
1273 : // stage 8
1274 0 : v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
1275 0 : v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
1276 0 : v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
1277 0 : v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
1278 0 : v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
1279 0 : v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
1280 0 : v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
1281 0 : v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
1282 0 : v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
1283 0 : v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
1284 0 : v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
1285 0 : v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
1286 0 : v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
1287 0 : v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
1288 0 : v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
1289 0 : v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
1290 :
1291 : // stage 9
1292 0 : out[0 * 4 + col] = v[1];
1293 0 : out[1 * 4 + col] = v[14];
1294 0 : out[2 * 4 + col] = v[3];
1295 0 : out[3 * 4 + col] = v[12];
1296 0 : out[4 * 4 + col] = v[5];
1297 0 : out[5 * 4 + col] = v[10];
1298 0 : out[6 * 4 + col] = v[7];
1299 0 : out[7 * 4 + col] = v[8];
1300 0 : out[8 * 4 + col] = v[9];
1301 0 : out[9 * 4 + col] = v[6];
1302 0 : out[10 * 4 + col] = v[11];
1303 0 : out[11 * 4 + col] = v[4];
1304 0 : out[12 * 4 + col] = v[13];
1305 0 : out[13 * 4 + col] = v[2];
1306 0 : out[14 * 4 + col] = v[15];
1307 0 : out[15 * 4 + col] = v[0];
1308 : }
1309 0 : }
1310 :
// Applies round_shift_8x8 (defined outside this block) with the given shift
// to each of the four consecutive 16-vector groups of a 64-vector buffer.
static void round_shift_16x16(__m128i *in, int shift) {
  int group;
  for (group = 0; group < 4; ++group) {
    round_shift_8x8(in + 16 * group, shift);
  }
}
1317 :
1318 0 : void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
1319 : int stride, int tx_type, int bd) {
1320 : __m128i in[64], out[64];
1321 0 : const TXFM_1D_CFG *row_cfg = NULL;
1322 0 : const TXFM_1D_CFG *col_cfg = NULL;
1323 :
1324 0 : switch (tx_type) {
1325 : case DCT_DCT:
1326 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_16;
1327 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_16;
1328 0 : load_buffer_16x16(coeff, in);
1329 0 : transpose_16x16(in, out);
1330 0 : idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1331 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1332 0 : transpose_16x16(in, out);
1333 0 : idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1334 0 : write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
1335 0 : break;
1336 : case DCT_ADST:
1337 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_16;
1338 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_16;
1339 0 : load_buffer_16x16(coeff, in);
1340 0 : transpose_16x16(in, out);
1341 0 : iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1342 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1343 0 : transpose_16x16(in, out);
1344 0 : idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1345 0 : write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
1346 0 : break;
1347 : case ADST_DCT:
1348 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_16;
1349 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_16;
1350 0 : load_buffer_16x16(coeff, in);
1351 0 : transpose_16x16(in, out);
1352 0 : idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1353 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1354 0 : transpose_16x16(in, out);
1355 0 : iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1356 0 : write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
1357 0 : break;
1358 : case ADST_ADST:
1359 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_16;
1360 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_16;
1361 0 : load_buffer_16x16(coeff, in);
1362 0 : transpose_16x16(in, out);
1363 0 : iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1364 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1365 0 : transpose_16x16(in, out);
1366 0 : iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1367 0 : write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
1368 0 : break;
1369 : #if CONFIG_EXT_TX
1370 : case FLIPADST_DCT:
1371 0 : row_cfg = &inv_txfm_1d_row_cfg_dct_16;
1372 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_16;
1373 0 : load_buffer_16x16(coeff, in);
1374 0 : transpose_16x16(in, out);
1375 0 : idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1376 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1377 0 : transpose_16x16(in, out);
1378 0 : iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1379 0 : write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
1380 0 : break;
1381 : case DCT_FLIPADST:
1382 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_16;
1383 0 : col_cfg = &inv_txfm_1d_col_cfg_dct_16;
1384 0 : load_buffer_16x16(coeff, in);
1385 0 : transpose_16x16(in, out);
1386 0 : iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1387 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1388 0 : transpose_16x16(in, out);
1389 0 : idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1390 0 : write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
1391 0 : break;
1392 : case ADST_FLIPADST:
1393 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_16;
1394 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_16;
1395 0 : load_buffer_16x16(coeff, in);
1396 0 : transpose_16x16(in, out);
1397 0 : iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1398 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1399 0 : transpose_16x16(in, out);
1400 0 : iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1401 0 : write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
1402 0 : break;
1403 : case FLIPADST_FLIPADST:
1404 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_16;
1405 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_16;
1406 0 : load_buffer_16x16(coeff, in);
1407 0 : transpose_16x16(in, out);
1408 0 : iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1409 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1410 0 : transpose_16x16(in, out);
1411 0 : iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1412 0 : write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
1413 0 : break;
1414 : case FLIPADST_ADST:
1415 0 : row_cfg = &inv_txfm_1d_row_cfg_adst_16;
1416 0 : col_cfg = &inv_txfm_1d_col_cfg_adst_16;
1417 0 : load_buffer_16x16(coeff, in);
1418 0 : transpose_16x16(in, out);
1419 0 : iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
1420 0 : round_shift_16x16(in, -row_cfg->shift[0]);
1421 0 : transpose_16x16(in, out);
1422 0 : iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
1423 0 : write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
1424 0 : break;
1425 : #endif
1426 0 : default: assert(0);
1427 : }
1428 0 : }
|