Line data Source code
1 : /*
2 : * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <tmmintrin.h>
12 :
13 : #include "./aom_dsp_rtcd.h"
14 : #include "aom_dsp/x86/inv_txfm_sse2.h"
15 : #include "aom_dsp/x86/txfm_common_sse2.h"
16 :
17 0 : void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
18 : int stride) {
19 0 : const __m128i zero = _mm_setzero_si128();
20 0 : const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
21 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 4);
22 0 : const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
23 0 : const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
24 0 : const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
25 0 : const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
26 0 : const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
27 0 : const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
28 0 : const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
29 0 : const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
30 :
31 : __m128i in0, in1, in2, in3, in4, in5, in6, in7;
32 : __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
33 : __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
34 : __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
35 : int i;
36 :
37 : // Load input data.
38 0 : in0 = load_input_data(input);
39 0 : in1 = load_input_data(input + 8 * 1);
40 0 : in2 = load_input_data(input + 8 * 2);
41 0 : in3 = load_input_data(input + 8 * 3);
42 0 : in4 = load_input_data(input + 8 * 4);
43 0 : in5 = load_input_data(input + 8 * 5);
44 0 : in6 = load_input_data(input + 8 * 6);
45 0 : in7 = load_input_data(input + 8 * 7);
46 :
47 : // 2-D
48 0 : for (i = 0; i < 2; i++) {
49 : // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
50 0 : TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
51 : in4, in5, in6, in7);
52 :
53 : // 4-stage 1D idct8x8
54 : {
55 : /* Stage1 */
56 : {
57 0 : const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
58 0 : const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
59 0 : const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
60 0 : const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
61 :
62 : {
63 0 : tmp0 = _mm_madd_epi16(lo_17, stg1_0);
64 0 : tmp1 = _mm_madd_epi16(hi_17, stg1_0);
65 0 : tmp2 = _mm_madd_epi16(lo_17, stg1_1);
66 0 : tmp3 = _mm_madd_epi16(hi_17, stg1_1);
67 0 : tmp4 = _mm_madd_epi16(lo_35, stg1_2);
68 0 : tmp5 = _mm_madd_epi16(hi_35, stg1_2);
69 0 : tmp6 = _mm_madd_epi16(lo_35, stg1_3);
70 0 : tmp7 = _mm_madd_epi16(hi_35, stg1_3);
71 :
72 0 : tmp0 = _mm_add_epi32(tmp0, rounding);
73 0 : tmp1 = _mm_add_epi32(tmp1, rounding);
74 0 : tmp2 = _mm_add_epi32(tmp2, rounding);
75 0 : tmp3 = _mm_add_epi32(tmp3, rounding);
76 0 : tmp4 = _mm_add_epi32(tmp4, rounding);
77 0 : tmp5 = _mm_add_epi32(tmp5, rounding);
78 0 : tmp6 = _mm_add_epi32(tmp6, rounding);
79 0 : tmp7 = _mm_add_epi32(tmp7, rounding);
80 :
81 0 : tmp0 = _mm_srai_epi32(tmp0, 14);
82 0 : tmp1 = _mm_srai_epi32(tmp1, 14);
83 0 : tmp2 = _mm_srai_epi32(tmp2, 14);
84 0 : tmp3 = _mm_srai_epi32(tmp3, 14);
85 0 : tmp4 = _mm_srai_epi32(tmp4, 14);
86 0 : tmp5 = _mm_srai_epi32(tmp5, 14);
87 0 : tmp6 = _mm_srai_epi32(tmp6, 14);
88 0 : tmp7 = _mm_srai_epi32(tmp7, 14);
89 :
90 0 : stp1_4 = _mm_packs_epi32(tmp0, tmp1);
91 0 : stp1_7 = _mm_packs_epi32(tmp2, tmp3);
92 0 : stp1_5 = _mm_packs_epi32(tmp4, tmp5);
93 0 : stp1_6 = _mm_packs_epi32(tmp6, tmp7);
94 : }
95 : }
96 :
97 : /* Stage2 */
98 : {
99 0 : const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
100 0 : const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
101 :
102 : {
103 0 : tmp0 = _mm_unpacklo_epi16(in0, in4);
104 0 : tmp1 = _mm_unpackhi_epi16(in0, in4);
105 :
106 0 : tmp2 = _mm_madd_epi16(tmp0, stk2_0);
107 0 : tmp3 = _mm_madd_epi16(tmp1, stk2_0);
108 0 : tmp4 = _mm_madd_epi16(tmp0, stk2_1);
109 0 : tmp5 = _mm_madd_epi16(tmp1, stk2_1);
110 :
111 0 : tmp2 = _mm_add_epi32(tmp2, rounding);
112 0 : tmp3 = _mm_add_epi32(tmp3, rounding);
113 0 : tmp4 = _mm_add_epi32(tmp4, rounding);
114 0 : tmp5 = _mm_add_epi32(tmp5, rounding);
115 :
116 0 : tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
117 0 : tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
118 0 : tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
119 0 : tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
120 :
121 0 : stp2_0 = _mm_packs_epi32(tmp2, tmp3);
122 0 : stp2_1 = _mm_packs_epi32(tmp4, tmp5);
123 :
124 0 : tmp0 = _mm_madd_epi16(lo_26, stg2_2);
125 0 : tmp1 = _mm_madd_epi16(hi_26, stg2_2);
126 0 : tmp2 = _mm_madd_epi16(lo_26, stg2_3);
127 0 : tmp3 = _mm_madd_epi16(hi_26, stg2_3);
128 :
129 0 : tmp0 = _mm_add_epi32(tmp0, rounding);
130 0 : tmp1 = _mm_add_epi32(tmp1, rounding);
131 0 : tmp2 = _mm_add_epi32(tmp2, rounding);
132 0 : tmp3 = _mm_add_epi32(tmp3, rounding);
133 :
134 0 : tmp0 = _mm_srai_epi32(tmp0, 14);
135 0 : tmp1 = _mm_srai_epi32(tmp1, 14);
136 0 : tmp2 = _mm_srai_epi32(tmp2, 14);
137 0 : tmp3 = _mm_srai_epi32(tmp3, 14);
138 :
139 0 : stp2_2 = _mm_packs_epi32(tmp0, tmp1);
140 0 : stp2_3 = _mm_packs_epi32(tmp2, tmp3);
141 : }
142 :
143 0 : stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
144 0 : stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
145 0 : stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
146 0 : stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
147 : }
148 :
149 : /* Stage3 */
150 : {
151 0 : stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
152 0 : stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
153 0 : stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
154 0 : stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
155 :
156 0 : tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
157 0 : tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
158 :
159 0 : tmp2 = _mm_madd_epi16(tmp0, stk2_1);
160 0 : tmp3 = _mm_madd_epi16(tmp1, stk2_1);
161 0 : tmp4 = _mm_madd_epi16(tmp0, stk2_0);
162 0 : tmp5 = _mm_madd_epi16(tmp1, stk2_0);
163 :
164 0 : tmp2 = _mm_add_epi32(tmp2, rounding);
165 0 : tmp3 = _mm_add_epi32(tmp3, rounding);
166 0 : tmp4 = _mm_add_epi32(tmp4, rounding);
167 0 : tmp5 = _mm_add_epi32(tmp5, rounding);
168 :
169 0 : tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
170 0 : tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
171 0 : tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
172 0 : tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
173 :
174 0 : stp1_5 = _mm_packs_epi32(tmp2, tmp3);
175 0 : stp1_6 = _mm_packs_epi32(tmp4, tmp5);
176 : }
177 :
178 : /* Stage4 */
179 0 : in0 = _mm_add_epi16(stp1_0, stp2_7);
180 0 : in1 = _mm_add_epi16(stp1_1, stp1_6);
181 0 : in2 = _mm_add_epi16(stp1_2, stp1_5);
182 0 : in3 = _mm_add_epi16(stp1_3, stp2_4);
183 0 : in4 = _mm_sub_epi16(stp1_3, stp2_4);
184 0 : in5 = _mm_sub_epi16(stp1_2, stp1_5);
185 0 : in6 = _mm_sub_epi16(stp1_1, stp1_6);
186 0 : in7 = _mm_sub_epi16(stp1_0, stp2_7);
187 : }
188 : }
189 :
190 : // Final rounding and shift
191 0 : in0 = _mm_adds_epi16(in0, final_rounding);
192 0 : in1 = _mm_adds_epi16(in1, final_rounding);
193 0 : in2 = _mm_adds_epi16(in2, final_rounding);
194 0 : in3 = _mm_adds_epi16(in3, final_rounding);
195 0 : in4 = _mm_adds_epi16(in4, final_rounding);
196 0 : in5 = _mm_adds_epi16(in5, final_rounding);
197 0 : in6 = _mm_adds_epi16(in6, final_rounding);
198 0 : in7 = _mm_adds_epi16(in7, final_rounding);
199 :
200 0 : in0 = _mm_srai_epi16(in0, 5);
201 0 : in1 = _mm_srai_epi16(in1, 5);
202 0 : in2 = _mm_srai_epi16(in2, 5);
203 0 : in3 = _mm_srai_epi16(in3, 5);
204 0 : in4 = _mm_srai_epi16(in4, 5);
205 0 : in5 = _mm_srai_epi16(in5, 5);
206 0 : in6 = _mm_srai_epi16(in6, 5);
207 0 : in7 = _mm_srai_epi16(in7, 5);
208 :
209 0 : RECON_AND_STORE(dest + 0 * stride, in0);
210 0 : RECON_AND_STORE(dest + 1 * stride, in1);
211 0 : RECON_AND_STORE(dest + 2 * stride, in2);
212 0 : RECON_AND_STORE(dest + 3 * stride, in3);
213 0 : RECON_AND_STORE(dest + 4 * stride, in4);
214 0 : RECON_AND_STORE(dest + 5 * stride, in5);
215 0 : RECON_AND_STORE(dest + 6 * stride, in6);
216 0 : RECON_AND_STORE(dest + 7 * stride, in7);
217 0 : }
218 :
// 8x8 inverse DCT for inputs whose non-zero coefficients lie in the first
// 12 scan positions (only input rows 0-3 are loaded).  The SSSE3
// _mm_mulhrs_epi16 path folds the DCT rounding into each
// constant-times-coefficient product: pmulhrsw computes
// (a * b + (1 << 14)) >> 15, so the doubled constants below reproduce
// (a * cospi + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.
// The result is added to |dest| (row pitch |stride|) with clamping to
// 8-bit pixels.
void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  const __m128i zero = _mm_setzero_si128();  // also consumed by RECON_AND_STORE
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Bias for the final >> 5 shift.
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  // Doubled (stg*/stk4-style) constants for the mulhrs path; undoubled
  // stk2_* pairs feed the _mm_madd_epi16 butterflies.
  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose; the surviving 4x4 coefficients end up packed into the
  // halves of in0/in1.
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);

  // Stage1 (row pass, working on packed half-registers)
  tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
  tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
  tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
  tmp3 = _mm_mulhrs_epi16(in1, stg1_3);

  stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
  stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);

  // Stage2
  tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
  stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);

  tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
  tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
  stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);

  tmp0 = _mm_add_epi16(stp1_4, stp1_5);
  tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

  stp2_4 = tmp0;
  stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
  stp2_6 = _mm_unpackhi_epi64(tmp1, zero);

  // 5/6 rotation via madd (exact rounding needed here).
  tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
  tmp1 = _mm_madd_epi16(tmp0, stg3_0);
  tmp2 = _mm_madd_epi16(tmp0, stk2_0);  // stg3_1 = stk2_0

  tmp1 = _mm_add_epi32(tmp1, rounding);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

  stp1_5 = _mm_packs_epi32(tmp1, tmp2);

  // Stage3
  tmp2 = _mm_add_epi16(stp2_0, stp2_2);
  tmp3 = _mm_sub_epi16(stp2_0, stp2_2);

  stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
  stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);

  // Stage4: row-pass outputs (still packed two rows per register).
  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

  // Transpose back to one column per register for the column pass.
  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  /* Stage1 (column pass) */
  stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
  stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
  stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
  stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);

  /* Stage2 */
  stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
  stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);

  stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
  stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);

  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);

  /* Stage3 */
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

  tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
  tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);

  tmp2 = _mm_madd_epi16(tmp0, stk2_0);
  tmp3 = _mm_madd_epi16(tmp1, stk2_0);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp3 = _mm_add_epi32(tmp3, rounding);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  stp1_6 = _mm_packs_epi32(tmp2, tmp3);

  tmp2 = _mm_madd_epi16(tmp0, stk2_1);
  tmp3 = _mm_madd_epi16(tmp1, stk2_1);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp3 = _mm_add_epi32(tmp3, rounding);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  stp1_5 = _mm_packs_epi32(tmp2, tmp3);

  /* Stage4 */
  in0 = _mm_add_epi16(stp1_0, stp2_7);
  in1 = _mm_add_epi16(stp1_1, stp1_6);
  in2 = _mm_add_epi16(stp1_2, stp1_5);
  in3 = _mm_add_epi16(stp1_3, stp2_4);
  in4 = _mm_sub_epi16(stp1_3, stp2_4);
  in5 = _mm_sub_epi16(stp1_2, stp1_5);
  in6 = _mm_sub_epi16(stp1_1, stp1_6);
  in7 = _mm_sub_epi16(stp1_0, stp2_7);

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
380 :
381 : // Only do addition and subtraction butterfly, size = 16, 32
382 0 : static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
383 : int size) {
384 0 : int i = 0;
385 0 : const int num = size >> 1;
386 0 : const int bound = size - 1;
387 0 : while (i < num) {
388 0 : out[i] = _mm_add_epi16(in[i], in[bound - i]);
389 0 : out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
390 0 : i++;
391 : }
392 0 : }
393 :
// Core of the butterfly rotation: multiply the interleaved 16-bit pairs in
// x0/x1 by the packed cosine constants co0/co1 with _mm_madd_epi16, then add
// DCT_CONST_ROUNDING and shift right by DCT_CONST_BITS.  The four 32-bit
// intermediate vectors are left in tmp0..tmp3, which — like |rounding| —
// must be declared by the enclosing function (see butterfly()/
// butterfly_self() below).
#define BUTTERFLY_PAIR(x0, x1, co0, co1) \
  do {                                   \
    tmp0 = _mm_madd_epi16(x0, co0);      \
    tmp1 = _mm_madd_epi16(x1, co0);      \
    tmp2 = _mm_madd_epi16(x0, co1);      \
    tmp3 = _mm_madd_epi16(x1, co1);      \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  } while (0)
409 :
// Butterfly rotation: per 16-bit lane,
//   *y0 = round((*x0) * c0.lo + (*x1) * c0.hi)
//   *y1 = round((*x0) * c1.lo + (*x1) * c1.hi)
// where round(v) = (v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS and the
// result is saturated back to 16 bits.
// tmp0..tmp3 and |rounding| are required by name inside BUTTERFLY_PAIR.
static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
                             const __m128i *c0, const __m128i *c1, __m128i *y0,
                             __m128i *y1) {
  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

  u0 = _mm_unpacklo_epi16(*x0, *x1);
  u1 = _mm_unpackhi_epi16(*x0, *x1);
  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
  *y0 = _mm_packs_epi32(tmp0, tmp1);
  *y1 = _mm_packs_epi32(tmp2, tmp3);
}
422 :
// In-place variant of butterfly(): overwrites *x0/*x1 with the rotated,
// rounded results.  See butterfly() for the exact per-lane formula.
static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
                                  const __m128i *c1) {
  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

  u0 = _mm_unpacklo_epi16(*x0, *x1);
  u1 = _mm_unpackhi_epi16(*x0, *x1);
  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
  *x0 = _mm_packs_epi32(tmp0, tmp1);
  *x1 = _mm_packs_epi32(tmp2, tmp3);
}
434 :
// First half of the 34-coefficient 8x32 idct column transform: consumes the
// 8 transposed input rows in |in| (only even indices 0, 2, 4, 6 are read
// here) and produces outputs stp1[0..15].  stk*_* constants are doubled for
// the _mm_mulhrs_epi16 rounding trick; stg*_* pairs feed the exact madd
// butterflies.
static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
  const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
  const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
  const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
  const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);

  const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i x0, x1, x4, x5, x6, x7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;

  // phase 1

  // 0, 15
  u2 = _mm_mulhrs_epi16(in[2], stk2_1);  // stp2_15
  u3 = _mm_mulhrs_epi16(in[6], stk2_7);  // stp2_12
  v15 = _mm_add_epi16(u2, u3);
  // in[0], in[4]
  x0 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
  x7 = _mm_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
  v0 = _mm_add_epi16(x0, x7);            // stp2_0
  stp1[0] = _mm_add_epi16(v0, v15);
  stp1[15] = _mm_sub_epi16(v0, v15);

  // in[2], in[6]
  u0 = _mm_mulhrs_epi16(in[2], stk2_0);            // stp2_8
  u1 = _mm_mulhrs_epi16(in[6], stk2_6);            // stp2_11
  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13

  v8 = _mm_add_epi16(u0, u1);
  v9 = _mm_add_epi16(u4, u6);
  v10 = _mm_sub_epi16(u4, u6);
  v11 = _mm_sub_epi16(u0, u1);
  v12 = _mm_sub_epi16(u2, u3);
  v13 = _mm_sub_epi16(u5, u7);
  v14 = _mm_add_epi16(u5, u7);

  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);

  // 1, 14
  x1 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
  // stp1[2] = stp1[0], stp1[3] = stp1[1]
  x4 = _mm_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
  v1 = _mm_add_epi16(x1, x6);  // stp2_1
  v2 = _mm_add_epi16(x0, x5);  // stp2_2
  stp1[1] = _mm_add_epi16(v1, v14);
  stp1[14] = _mm_sub_epi16(v1, v14);

  stp1[2] = _mm_add_epi16(v2, v13);
  stp1[13] = _mm_sub_epi16(v2, v13);

  v3 = _mm_add_epi16(x1, x4);  // stp2_3
  v4 = _mm_sub_epi16(x1, x4);  // stp2_4

  v5 = _mm_sub_epi16(x0, x5);  // stp2_5

  v6 = _mm_sub_epi16(x1, x6);  // stp2_6
  v7 = _mm_sub_epi16(x0, x7);  // stp2_7
  stp1[3] = _mm_add_epi16(v3, v12);
  stp1[12] = _mm_sub_epi16(v3, v12);

  stp1[6] = _mm_add_epi16(v6, v9);
  stp1[9] = _mm_sub_epi16(v6, v9);

  stp1[7] = _mm_add_epi16(v7, v8);
  stp1[8] = _mm_sub_epi16(v7, v8);

  stp1[4] = _mm_add_epi16(v4, v11);
  stp1[11] = _mm_sub_epi16(v4, v11);

  stp1[5] = _mm_add_epi16(v5, v10);
  stp1[10] = _mm_sub_epi16(v5, v10);
}
521 :
// Second half of the 34-coefficient 8x32 idct column transform: consumes the
// odd input rows (in[1], in[3], in[5], in[7]) and produces outputs
// stp1[16..31].  stk1_* constants are doubled for the _mm_mulhrs_epi16
// rounding trick; stg*_* pairs feed the exact madd butterflies.
static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
  const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
  const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
  const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
  const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
  const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
  const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
  const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
  const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  // Stage 1/2: one doubled-constant multiply per odd input.
  v16 = _mm_mulhrs_epi16(in[1], stk1_0);
  v31 = _mm_mulhrs_epi16(in[1], stk1_1);

  v19 = _mm_mulhrs_epi16(in[7], stk1_6);
  v28 = _mm_mulhrs_epi16(in[7], stk1_7);

  v20 = _mm_mulhrs_epi16(in[5], stk1_8);
  v27 = _mm_mulhrs_epi16(in[5], stk1_9);

  v23 = _mm_mulhrs_epi16(in[3], stk1_14);
  v24 = _mm_mulhrs_epi16(in[3], stk1_15);

  // Stage 3 rotations.
  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);

  // Stage 4 add/sub butterflies.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);
  u24 = _mm_add_epi16(v24, v27);
  u27 = _mm_sub_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u28 = _mm_sub_epi16(v31, v28);
  u31 = _mm_add_epi16(v28, v31);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);

  // Stage 5 rotations (in place).
  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);

  // Stage 6 add/sub butterflies into the output array.
  stp1[16] = _mm_add_epi16(u16, u23);
  stp1[23] = _mm_sub_epi16(u16, u23);

  stp1[17] = _mm_add_epi16(u17, u22);
  stp1[22] = _mm_sub_epi16(u17, u22);

  stp1[18] = _mm_add_epi16(u18, u21);
  stp1[21] = _mm_sub_epi16(u18, u21);

  stp1[19] = _mm_add_epi16(u19, u20);
  stp1[20] = _mm_sub_epi16(u19, u20);

  stp1[24] = _mm_sub_epi16(u31, u24);
  stp1[31] = _mm_add_epi16(u24, u31);

  stp1[25] = _mm_sub_epi16(u30, u25);
  stp1[30] = _mm_add_epi16(u25, u30);

  stp1[26] = _mm_sub_epi16(u29, u26);
  stp1[29] = _mm_add_epi16(u26, u29);

  stp1[27] = _mm_sub_epi16(u28, u27);
  stp1[28] = _mm_add_epi16(u27, u28);

  // Stage 7 rotations on the middle eight outputs.
  butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
  butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
  butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
  butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
}
617 :
// 32x32 inverse DCT where only the upper-left 8x8 block of coefficients is
// non-zero (at most 34 in scan order).  Row pass is done once on the 8x8
// block; the column pass runs four times, once per 8-column strip of the
// destination.  Results are rounded (>> 6) and added to |dest| with
// clamping to 8-bit pixels.
void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  const __m128i zero = _mm_setzero_si128();  // consumed by RECON_AND_STORE
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);  // bias for >> 6
  __m128i in[32], col[32];
  __m128i stp1[32];
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  // (Rows of the 32x32 coefficient array are 32 entries apart.)
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 32);
  in[2] = load_input_data(input + 64);
  in[3] = load_input_data(input + 96);
  in[4] = load_input_data(input + 128);
  in[5] = load_input_data(input + 160);
  in[6] = load_input_data(input + 192);
  in[7] = load_input_data(input + 224);

  // Row pass (in-place transpose, then the split 1-D idct32).
  array_transpose_8x8(in, in);
  idct32_34_first_half(in, stp1);
  idct32_34_second_half(in, stp1);

  // 1_D: Store 32 intermediate results for each 8x32 block.
  add_sub_butterfly(stp1, col, 32);
  for (i = 0; i < 4; i++) {
    int j;
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    idct32_34_first_half(in, stp1);
    idct32_34_second_half(in, stp1);

    // 2_D: Calculate the results and store them to destination.
    add_sub_butterfly(stp1, in, 32);
    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // advance to the next 8-column strip
  }
}
662 :
663 : // in0[16] represents the left 8x16 block
664 : // in1[16] represents the right 8x16 block
665 0 : static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
666 : __m128i *in1) {
667 : int i;
668 0 : for (i = 0; i < 16; i++) {
669 0 : in0[i] = load_input_data(input);
670 0 : in1[i] = load_input_data(input + 8);
671 0 : input += 32;
672 : }
673 0 : }
674 :
// Transpose a 16x16 block held as two 8x16 halves (|in0| left, |in1| right)
// into |out0| (top 8x16 rows of the transpose) and |out1| (bottom 8x16).
// NOTE(review): array_transpose_8x8 is used in-place elsewhere in this file,
// so keep this call order in case a caller passes aliasing buffers — verify
// before reordering.
static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
                                    __m128i *out1) {
  array_transpose_8x8(in0, out0);
  array_transpose_8x8(&in0[8], out1);
  array_transpose_8x8(in1, &out0[8]);
  array_transpose_8x8(&in1[8], &out1[8]);
}
682 :
683 : // Group the coefficient calculation into smaller functions
684 : // to prevent stack spillover:
685 : // quarter_1: 0-7
686 : // quarter_2: 8-15
687 : // quarter_3_4: 16-23, 24-31
// Quarter 1 of the 135-coefficient 8x32 idct: consumes even inputs
// in[0], in[4], in[8], in[12] and produces outputs out[0..7].
// Doubled stk*_* constants use the _mm_mulhrs_epi16 rounding trick.
static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
                                      __m128i *out /*out[8]*/) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  {
    const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
    const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
    const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
    u0 = _mm_mulhrs_epi16(in[0], stk4_0);
    u2 = _mm_mulhrs_epi16(in[8], stk4_2);
    u3 = _mm_mulhrs_epi16(in[8], stk4_3);
    u1 = u0;  // stk4_1 == stk4_0, so u1 duplicates u0
  }

  v0 = _mm_add_epi16(u0, u3);
  v1 = _mm_add_epi16(u1, u2);
  v2 = _mm_sub_epi16(u1, u2);
  v3 = _mm_sub_epi16(u0, u3);

  {
    const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
    const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
    const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
    const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
    u4 = _mm_mulhrs_epi16(in[4], stk3_0);
    u7 = _mm_mulhrs_epi16(in[4], stk3_1);
    u5 = _mm_mulhrs_epi16(in[12], stk3_2);
    u6 = _mm_mulhrs_epi16(in[12], stk3_3);
  }

  v4 = _mm_add_epi16(u4, u5);
  v5 = _mm_sub_epi16(u4, u5);
  v6 = _mm_sub_epi16(u7, u6);
  v7 = _mm_add_epi16(u7, u6);

  {
    // 5/6 rotation requires the exact madd/round path.
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
  }

  // Final add/sub butterfly of this quarter.
  out[0] = _mm_add_epi16(v0, v7);
  out[1] = _mm_add_epi16(v1, v6);
  out[2] = _mm_add_epi16(v2, v5);
  out[3] = _mm_add_epi16(v3, v4);
  out[4] = _mm_sub_epi16(v3, v4);
  out[5] = _mm_sub_epi16(v2, v5);
  out[6] = _mm_sub_epi16(v1, v6);
  out[7] = _mm_sub_epi16(v0, v7);
}
739 :
// Quarter 2 of the 135-coefficient 8x32 idct: consumes even inputs
// in[2], in[6], in[10], in[14] and produces outputs out[0..7]
// (the caller stores them at stp[8..15]).
static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
                                      __m128i *out /*out[8]*/) {
  __m128i u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v8, v9, v10, v11, v12, v13, v14, v15;

  {
    // Doubled constants for the _mm_mulhrs_epi16 rounding trick.
    const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
    const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
    const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
    const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
    const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
    const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
    const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
    const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
    u8 = _mm_mulhrs_epi16(in[2], stk2_0);
    u15 = _mm_mulhrs_epi16(in[2], stk2_1);
    u9 = _mm_mulhrs_epi16(in[14], stk2_2);
    u14 = _mm_mulhrs_epi16(in[14], stk2_3);
    u10 = _mm_mulhrs_epi16(in[10], stk2_4);
    u13 = _mm_mulhrs_epi16(in[10], stk2_5);
    u11 = _mm_mulhrs_epi16(in[6], stk2_6);
    u12 = _mm_mulhrs_epi16(in[6], stk2_7);
  }

  v8 = _mm_add_epi16(u8, u9);
  v9 = _mm_sub_epi16(u8, u9);
  v10 = _mm_sub_epi16(u11, u10);
  v11 = _mm_add_epi16(u11, u10);
  v12 = _mm_add_epi16(u12, u13);
  v13 = _mm_sub_epi16(u12, u13);
  v14 = _mm_sub_epi16(u15, u14);
  v15 = _mm_add_epi16(u15, u14);

  {
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  out[0] = _mm_add_epi16(v8, v11);
  out[1] = _mm_add_epi16(v9, v10);
  out[2] = _mm_sub_epi16(v9, v10);
  out[3] = _mm_sub_epi16(v8, v11);
  out[4] = _mm_sub_epi16(v15, v12);
  out[5] = _mm_sub_epi16(v14, v13);
  out[6] = _mm_add_epi16(v14, v13);
  out[7] = _mm_add_epi16(v15, v12);

  {
    // Final rotations on the middle outputs of this quarter.
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}
797 :
// Combine quarters 1 and 2: the even-indexed 8 inputs of in[16] produce
// the first 16 of out[32] via a final add/sub butterfly over the two
// 8-element intermediate halves.
static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
                                    __m128i *out /*out[32]*/) {
  __m128i half[16];
  idct32_8x32_135_quarter_1(in, half);
  idct32_8x32_135_quarter_2(in, half + 8);
  add_sub_butterfly(half, out, 16);
}
807 :
// 8x32 block odd indexed 8 inputs of in[16],
// output second half 16 to out[32]
static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
                                    __m128i *out /*out[32]*/) {
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  {
    // Stage 1: in the 135-coefficient case the partner input of each stage-1
    // butterfly is zero, so each output collapses to a single
    // _mm_mulhrs_epi16.  The 2*cospi constants compensate for pmulhrsw's
    // 15-bit rounding shift (vs. the 14-bit shift of the full butterfly).
    const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
    const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
    const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
    const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);

    const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
    const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
    const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
    const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
    const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
    const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
    const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
    const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);

    const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
    const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
    const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
    const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
    u16 = _mm_mulhrs_epi16(in[1], stk1_0);
    u31 = _mm_mulhrs_epi16(in[1], stk1_1);
    u17 = _mm_mulhrs_epi16(in[15], stk1_2);
    u30 = _mm_mulhrs_epi16(in[15], stk1_3);

    u18 = _mm_mulhrs_epi16(in[9], stk1_4);
    u29 = _mm_mulhrs_epi16(in[9], stk1_5);
    u19 = _mm_mulhrs_epi16(in[7], stk1_6);
    u28 = _mm_mulhrs_epi16(in[7], stk1_7);

    u20 = _mm_mulhrs_epi16(in[5], stk1_8);
    u27 = _mm_mulhrs_epi16(in[5], stk1_9);
    u21 = _mm_mulhrs_epi16(in[11], stk1_10);
    u26 = _mm_mulhrs_epi16(in[11], stk1_11);

    u22 = _mm_mulhrs_epi16(in[13], stk1_12);
    u25 = _mm_mulhrs_epi16(in[13], stk1_13);
    u23 = _mm_mulhrs_epi16(in[3], stk1_14);
    u24 = _mm_mulhrs_epi16(in[3], stk1_15);
  }

  // Stage 2: add/sub butterflies on adjacent pairs.
  v16 = _mm_add_epi16(u16, u17);
  v17 = _mm_sub_epi16(u16, u17);
  v18 = _mm_sub_epi16(u19, u18);
  v19 = _mm_add_epi16(u19, u18);

  v20 = _mm_add_epi16(u20, u21);
  v21 = _mm_sub_epi16(u20, u21);
  v22 = _mm_sub_epi16(u23, u22);
  v23 = _mm_add_epi16(u23, u22);

  v24 = _mm_add_epi16(u24, u25);
  v25 = _mm_sub_epi16(u24, u25);
  v26 = _mm_sub_epi16(u27, u26);
  v27 = _mm_add_epi16(u27, u26);

  v28 = _mm_add_epi16(u28, u29);
  v29 = _mm_sub_epi16(u28, u29);
  v30 = _mm_sub_epi16(u31, u30);
  v31 = _mm_add_epi16(u31, u30);

  {
    // Stage 3: in-place rotations of the interior pairs by +/-4 and +/-20.
    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  // Stage 4: add/sub butterflies across groups of four.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);

  u24 = _mm_add_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u27 = _mm_sub_epi16(v24, v27);
  u28 = _mm_sub_epi16(v31, v28);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);
  u31 = _mm_add_epi16(v28, v31);

  {
    // Stage 4 rotations by +/-8.
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  // Stage 5: final add/sub for out[0..3] and out[12..15]; the middle
  // differences are kept in v20..v27 for the stage-6 rotation below.
  out[0] = _mm_add_epi16(u16, u23);
  out[1] = _mm_add_epi16(u17, u22);
  out[2] = _mm_add_epi16(u18, u21);
  out[3] = _mm_add_epi16(u19, u20);
  v20 = _mm_sub_epi16(u19, u20);
  v21 = _mm_sub_epi16(u18, u21);
  v22 = _mm_sub_epi16(u17, u22);
  v23 = _mm_sub_epi16(u16, u23);

  v24 = _mm_sub_epi16(u31, u24);
  v25 = _mm_sub_epi16(u30, u25);
  v26 = _mm_sub_epi16(u29, u26);
  v27 = _mm_sub_epi16(u28, u27);
  out[12] = _mm_add_epi16(u27, u28);
  out[13] = _mm_add_epi16(u26, u29);
  out[14] = _mm_add_epi16(u25, u30);
  out[15] = _mm_add_epi16(u24, u31);

  {
    // Stage 6: rotate the eight middle values by 16 into out[4..11].
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
  }
}
946 :
// 8x16 block, input __m128i in[16], output __m128i in[32]
static void idct32_8x32_135(__m128i *in /*in[32]*/) {
  __m128i tmp[32];
  // Even-indexed inputs build the first 16 outputs, odd-indexed inputs the
  // last 16; the closing butterfly combines both halves back into 'in'.
  idct32_8x32_quarter_1_2(in, tmp);
  idct32_8x32_quarter_3_4(in, &tmp[16]);
  add_sub_butterfly(tmp, in, 32);
}
954 :
955 0 : static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
956 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 5);
957 0 : const __m128i zero = _mm_setzero_si128();
958 0 : int j = 0;
959 0 : while (j < 32) {
960 0 : in[j] = _mm_adds_epi16(in[j], final_rounding);
961 0 : in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
962 :
963 0 : in[j] = _mm_srai_epi16(in[j], 6);
964 0 : in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
965 :
966 0 : RECON_AND_STORE(dst, in[j]);
967 0 : dst += stride;
968 0 : RECON_AND_STORE(dst, in[j + 1]);
969 0 : dst += stride;
970 0 : j += 2;
971 : }
972 0 : }
973 :
974 0 : static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
975 : int stride) {
976 0 : store_buffer_8x32(in0, dest, stride);
977 0 : store_buffer_8x32(in1, dest + 8, stride);
978 0 : }
979 :
// Apply the 135-coefficient 8x32 IDCT to both column blocks, in place.
static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
  idct32_8x32_135(col0);
  idct32_8x32_135(col1);
}
984 :
985 : typedef enum { left_16, right_16 } ColsIndicator;
986 :
987 0 : static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
988 : ColsIndicator cols) {
989 0 : switch (cols) {
990 : case left_16: {
991 : int i;
992 0 : array_transpose_16x16(in0, in1);
993 0 : for (i = 0; i < 16; ++i) {
994 0 : store[i] = in0[16 + i];
995 0 : store[16 + i] = in1[16 + i];
996 : }
997 0 : break;
998 : }
999 : case right_16: {
1000 0 : array_transpose_16x16_2(store, &store[16], in0, in1);
1001 0 : break;
1002 : }
1003 0 : default: { assert(0); }
1004 : }
1005 0 : }
1006 :
// 32x32 inverse DCT plus reconstruction for the case where only the
// upper-left 16x16 block of coefficients is non-zero (<= 135 coeffs).
// The 2-D transform is performed as a column pass followed by a row pass,
// each done with the reduced-input idct32_135 kernel.
void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  // Each array represents an 8x32 block
  __m128i col0[32], col1[32];
  // This array represents a 16x16 block, used to stash the right-half rows
  // between the two row passes.
  __m128i temp[32];

  // Load input data. Only need to load the top left 16x16 block.
  load_buffer_16x16(input, col0, col1);

  // columns
  array_transpose_16x16(col0, col1);
  idct32_135(col0, col1);

  // rows: left 16 output columns first (upper halves saved into temp) ...
  transpose_and_copy_16x16(col0, col1, temp, left_16);
  idct32_135(col0, col1);
  recon_and_store(col0, col1, dest, stride);

  // ... then the right 16 output columns, restored from temp.
  transpose_and_copy_16x16(col0, col1, temp, right_16);
  idct32_135(col0, col1);
  recon_and_store(col0, col1, dest + 16, stride);
}
1031 :
// For each 8x32 block __m128i in[32],
// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
// output pixels: 8-15 in __m128i in[32]
static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
                                       __m128i *out /*out[16]*/) {
  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_

  {
    // Stage 2: rotations of the (2,30) and (18,14) input pairs.
    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
  }

  // Stage 3: add/sub of adjacent outputs.
  v8 = _mm_add_epi16(u8, u9);
  v9 = _mm_sub_epi16(u8, u9);
  v14 = _mm_sub_epi16(u15, u14);
  v15 = _mm_add_epi16(u15, u14);

  {
    // Stage 2 (continued): rotations of the (10,22) and (26,6) input pairs.
    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
  }

  v10 = _mm_sub_epi16(u11, u10);
  v11 = _mm_add_epi16(u11, u10);
  v12 = _mm_add_epi16(u12, u13);
  v13 = _mm_sub_epi16(u12, u13);

  {
    // Stage 4: in-place rotations by +/-8.
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  // Stage 5: final add/sub into the output slots.
  out[0] = _mm_add_epi16(v8, v11);
  out[1] = _mm_add_epi16(v9, v10);
  out[6] = _mm_add_epi16(v14, v13);
  out[7] = _mm_add_epi16(v15, v12);

  out[2] = _mm_sub_epi16(v9, v10);
  out[3] = _mm_sub_epi16(v8, v11);
  out[4] = _mm_sub_epi16(v15, v12);
  out[5] = _mm_sub_epi16(v14, v13);

  {
    // Stage 6: rotate the four middle outputs by 16, in place.
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}
1093 :
// For each 8x32 block __m128i in[32],
// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
// output pixels: 0-7 in __m128i in[32]
static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
                                       __m128i *out /*out[8]*/) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_

  {
    // Stage 3: rotations of the (4,28) and (20,12) input pairs.
    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
  }

  // Stage 4: add/sub of the odd-half outputs.
  v4 = _mm_add_epi16(u4, u5);
  v5 = _mm_sub_epi16(u4, u5);
  v6 = _mm_sub_epi16(u7, u6);
  v7 = _mm_add_epi16(u7, u6);

  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
    // Stage 5 rotation of the middle pair of the odd half.
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);

    // Stage 4: rotations of the (0,16) and (8,24) input pairs (even half).
    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
  }

  // Stage 5: add/sub of the even-half outputs.
  v0 = _mm_add_epi16(u0, u3);
  v1 = _mm_add_epi16(u1, u2);
  v2 = _mm_sub_epi16(u1, u2);
  v3 = _mm_sub_epi16(u0, u3);

  // Stage 6: combine even and odd halves into out[0..7].
  out[0] = _mm_add_epi16(v0, v7);
  out[1] = _mm_add_epi16(v1, v6);
  out[2] = _mm_add_epi16(v2, v5);
  out[3] = _mm_add_epi16(v3, v4);
  out[4] = _mm_sub_epi16(v3, v4);
  out[5] = _mm_sub_epi16(v2, v5);
  out[6] = _mm_sub_epi16(v1, v6);
  out[7] = _mm_sub_epi16(v0, v7);
}
1141 :
// For each 8x32 block __m128i in[32],
// Input with odd index,
// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
// output pixels: 16-23, 24-31 in __m128i in[32]
// We avoid hide an offset, 16, inside this function. So we output 0-15 into
// array out[16]
static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
                                         __m128i *out /*out[16]*/) {
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  {
    // Stage 1: full butterflies pairing each odd input with its mirror.
    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);

    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);

    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
  }

  // Stage 2: add/sub butterflies on adjacent pairs.
  v16 = _mm_add_epi16(u16, u17);
  v17 = _mm_sub_epi16(u16, u17);
  v18 = _mm_sub_epi16(u19, u18);
  v19 = _mm_add_epi16(u19, u18);

  v20 = _mm_add_epi16(u20, u21);
  v21 = _mm_sub_epi16(u20, u21);
  v22 = _mm_sub_epi16(u23, u22);
  v23 = _mm_add_epi16(u23, u22);

  v24 = _mm_add_epi16(u24, u25);
  v25 = _mm_sub_epi16(u24, u25);
  v26 = _mm_sub_epi16(u27, u26);
  v27 = _mm_add_epi16(u27, u26);

  v28 = _mm_add_epi16(u28, u29);
  v29 = _mm_sub_epi16(u28, u29);
  v30 = _mm_sub_epi16(u31, u30);
  v31 = _mm_add_epi16(u31, u30);

  {
    // Stage 3: in-place rotations of the interior pairs by +/-4 and +/-20.
    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  // Stage 4: add/sub butterflies across groups of four.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);

  u24 = _mm_add_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u27 = _mm_sub_epi16(v24, v27);

  u28 = _mm_sub_epi16(v31, v28);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);
  u31 = _mm_add_epi16(v28, v31);

  {
    // Stage 4 rotations by +/-8.
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  // Stage 5: final add/sub into the output slots.
  out[0] = _mm_add_epi16(u16, u23);
  out[1] = _mm_add_epi16(u17, u22);
  out[2] = _mm_add_epi16(u18, u21);
  out[3] = _mm_add_epi16(u19, u20);
  out[4] = _mm_sub_epi16(u19, u20);
  out[5] = _mm_sub_epi16(u18, u21);
  out[6] = _mm_sub_epi16(u17, u22);
  out[7] = _mm_sub_epi16(u16, u23);

  out[8] = _mm_sub_epi16(u31, u24);
  out[9] = _mm_sub_epi16(u30, u25);
  out[10] = _mm_sub_epi16(u29, u26);
  out[11] = _mm_sub_epi16(u28, u27);
  out[12] = _mm_add_epi16(u27, u28);
  out[13] = _mm_add_epi16(u26, u29);
  out[14] = _mm_add_epi16(u25, u30);
  out[15] = _mm_add_epi16(u24, u31);

  {
    // Stage 6: rotate the eight middle outputs by 16, in place.
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
  }
}
1273 :
// Build the first 16 outputs of the full 8x32 IDCT: quarter 1 fills rows
// 0-7, quarter 2 rows 8-15, then the add/sub butterfly merges them.
static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
                                         __m128i *out /*out[32]*/) {
  __m128i half[16];
  idct32_full_8x32_quarter_1(in, half);
  idct32_full_8x32_quarter_2(in, &half[8]);
  add_sub_butterfly(half, out, 16);
}
1281 :
// Full (all-coefficient) 8x32 inverse DCT: even-indexed inputs produce the
// first 16 intermediate rows, odd-indexed inputs the last 16, and the final
// butterfly writes the combined 32 rows to 'out'.
static void idct32_full_8x32(const __m128i *in /*in[32]*/,
                             __m128i *out /*out[32]*/) {
  __m128i scratch[32];
  idct32_full_8x32_quarter_1_2(in, scratch);
  idct32_full_8x32_quarter_3_4(in, &scratch[16]);
  add_sub_butterfly(scratch, out, 32);
}
1289 :
1290 0 : static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
1291 : int i;
1292 0 : for (i = 0; i < 8; ++i) {
1293 0 : in[i] = load_input_data(input);
1294 0 : in[i + 8] = load_input_data(input + 8);
1295 0 : in[i + 16] = load_input_data(input + 16);
1296 0 : in[i + 24] = load_input_data(input + 24);
1297 0 : input += 32;
1298 : }
1299 0 : }
1300 :
1301 0 : void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
1302 : int stride) {
1303 : __m128i col[128], in[32];
1304 : int i, j;
1305 :
1306 : // rows
1307 0 : for (i = 0; i < 4; ++i) {
1308 0 : load_buffer_8x32(input, in);
1309 0 : input += 32 << 3;
1310 :
1311 : // Transpose 32x8 block to 8x32 block
1312 0 : array_transpose_8x8(in, in);
1313 0 : array_transpose_8x8(in + 8, in + 8);
1314 0 : array_transpose_8x8(in + 16, in + 16);
1315 0 : array_transpose_8x8(in + 24, in + 24);
1316 :
1317 0 : idct32_full_8x32(in, col + (i << 5));
1318 : }
1319 :
1320 : // columns
1321 0 : for (i = 0; i < 4; ++i) {
1322 0 : j = i << 3;
1323 : // Transpose 32x8 block to 8x32 block
1324 0 : array_transpose_8x8(col + j, in);
1325 0 : array_transpose_8x8(col + j + 32, in + 8);
1326 0 : array_transpose_8x8(col + j + 64, in + 16);
1327 0 : array_transpose_8x8(col + j + 96, in + 24);
1328 :
1329 0 : idct32_full_8x32(in, in);
1330 0 : store_buffer_8x32(in, dest, stride);
1331 0 : dest += 8;
1332 : }
1333 0 : }
|