Line data Source code
1 : #include "av1/common/x86/av1_txfm1d_sse4.h"
2 :
// 32-point forward DCT butterfly network (per the function name) applied to
// vectors of packed 32-bit integers.
//
// The 32 x 8 array of __m128i in `input` is processed one column at a time
// (col_num = 32 / 4 = 8 columns). For each column the 32 vectors at
// input[j * col_num + col] are pushed through nine stages and the result is
// stored at output[j * col_num + col].
//
//   input       - source vectors, indexed as [row * 8 + col]
//   output      - destination vectors, same indexing as input
//   cos_bit     - per-stage value; entries 2..8 are read, each one selects the
//                 cosine table via cospi_arr() and is forwarded to the
//                 btf_32_sse4_1_type* butterfly macros
//   stage_range - unused, kept only so the signature matches its siblings
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range) {
  // Final output ordering: buf1[k] takes buf0[kFinalOrder[k]].
  static const int kFinalOrder[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
                                       2, 18, 10, 26, 6, 22, 14, 30,
                                       1, 17, 9,  25, 5, 21, 13, 29,
                                       3, 19, 11, 27, 7, 23, 15, 31 };
  const int txfm_size = 32;
  const int num_per_128 = 4;
  const int col_num = txfm_size / num_per_128;
  const int32_t *cospi;
  __m128i buf0[32];
  __m128i buf1[32];
  int bit;
  int col;
  (void)stage_range;

  for (col = 0; col < col_num; col++) {
    int lo;  // lower index of a mirrored pair
    int hi;  // upper index of a mirrored pair
    int k;   // plain running index

    // stage 0: gather the 32 vectors of this column
    for (k = 0; k < 32; ++k) {
      buf0[k] = input[k * col_num + col];
    }

    // stage 1: sum/difference of mirrored pairs over [0, 31]
    for (lo = 0, hi = 31; lo < hi; ++lo, --hi) {
      buf1[lo] = _mm_add_epi32(buf0[lo], buf0[hi]);
      buf1[hi] = _mm_sub_epi32(buf0[lo], buf0[hi]);
    }

    // stage 2
    bit = cos_bit[2];
    cospi = cospi_arr(bit);
    for (lo = 0, hi = 15; lo < hi; ++lo, --hi) {
      buf0[lo] = _mm_add_epi32(buf1[lo], buf1[hi]);
      buf0[hi] = _mm_sub_epi32(buf1[lo], buf1[hi]);
    }
    // 16..19 and 28..31 pass through unchanged
    for (k = 16; k < 20; ++k) {
      buf0[k] = buf1[k];
      buf0[k + 12] = buf1[k + 12];
    }
    // rotate the mirrored pairs 20/27, 21/26, 22/25, 23/24
    for (lo = 20, hi = 27; lo < hi; ++lo, --hi) {
      btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[lo], buf1[hi], buf0[lo],
                          buf0[hi], bit);
    }

    // stage 3
    bit = cos_bit[3];
    cospi = cospi_arr(bit);
    for (lo = 0, hi = 7; lo < hi; ++lo, --hi) {
      buf1[lo] = _mm_add_epi32(buf0[lo], buf0[hi]);
      buf1[hi] = _mm_sub_epi32(buf0[lo], buf0[hi]);
    }
    buf1[8] = buf0[8];
    buf1[9] = buf0[9];
    for (lo = 10, hi = 13; lo < hi; ++lo, --hi) {
      btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[lo], buf0[hi], buf1[lo],
                          buf1[hi], bit);
    }
    buf1[14] = buf0[14];
    buf1[15] = buf0[15];
    for (lo = 16, hi = 23; lo < hi; ++lo, --hi) {
      buf1[lo] = _mm_add_epi32(buf0[lo], buf0[hi]);
      buf1[hi] = _mm_sub_epi32(buf0[lo], buf0[hi]);
    }
    // upper half uses the opposite orientation: difference lands in `lo`
    for (lo = 24, hi = 31; lo < hi; ++lo, --hi) {
      buf1[lo] = _mm_sub_epi32(buf0[hi], buf0[lo]);
      buf1[hi] = _mm_add_epi32(buf0[hi], buf0[lo]);
    }

    // stage 4
    bit = cos_bit[4];
    cospi = cospi_arr(bit);
    for (lo = 0, hi = 3; lo < hi; ++lo, --hi) {
      buf0[lo] = _mm_add_epi32(buf1[lo], buf1[hi]);
      buf0[hi] = _mm_sub_epi32(buf1[lo], buf1[hi]);
    }
    buf0[4] = buf1[4];
    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
                        buf0[6], bit);
    buf0[7] = buf1[7];
    for (lo = 8, hi = 11; lo < hi; ++lo, --hi) {
      buf0[lo] = _mm_add_epi32(buf1[lo], buf1[hi]);
      buf0[hi] = _mm_sub_epi32(buf1[lo], buf1[hi]);
    }
    for (lo = 12, hi = 15; lo < hi; ++lo, --hi) {
      buf0[lo] = _mm_sub_epi32(buf1[hi], buf1[lo]);
      buf0[hi] = _mm_add_epi32(buf1[hi], buf1[lo]);
    }
    buf0[16] = buf1[16];
    buf0[17] = buf1[17];
    // pairs 18/29 and 19/28
    for (lo = 18, hi = 29; lo < 20; ++lo, --hi) {
      btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[lo], buf1[hi], buf0[lo],
                          buf0[hi], bit);
    }
    // pairs 20/27 and 21/26
    for (lo = 20, hi = 27; lo < 22; ++lo, --hi) {
      btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[lo], buf1[hi], buf0[lo],
                          buf0[hi], bit);
    }
    for (k = 22; k < 26; ++k) {
      buf0[k] = buf1[k];
    }
    buf0[30] = buf1[30];
    buf0[31] = buf1[31];

    // stage 5
    bit = cos_bit[5];
    cospi = cospi_arr(bit);
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
                        buf1[1], bit);
    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
                        buf1[3], bit);
    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
    buf1[8] = buf0[8];
    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
                        buf1[14], bit);
    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
                        buf1[13], bit);
    buf1[11] = buf0[11];
    buf1[12] = buf0[12];
    buf1[15] = buf0[15];
    // Two groups of eight (16..23, 24..31): the first four of each group
    // are sum-first, the last four are difference-first.
    for (k = 16; k < 32; k += 8) {
      for (lo = k, hi = k + 3; lo < hi; ++lo, --hi) {
        buf1[lo] = _mm_add_epi32(buf0[lo], buf0[hi]);
        buf1[hi] = _mm_sub_epi32(buf0[lo], buf0[hi]);
      }
      for (lo = k + 4, hi = k + 7; lo < hi; ++lo, --hi) {
        buf1[lo] = _mm_sub_epi32(buf0[hi], buf0[lo]);
        buf1[hi] = _mm_add_epi32(buf0[hi], buf0[lo]);
      }
    }

    // stage 6
    bit = cos_bit[6];
    cospi = cospi_arr(bit);
    for (k = 0; k < 4; ++k) {
      buf0[k] = buf1[k];
    }
    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4],
                        buf0[7], bit);
    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
                        buf0[6], bit);
    // Groups of four (8..11, 12..15): sum-first pair, then difference-first.
    for (k = 8; k < 16; k += 4) {
      buf0[k] = _mm_add_epi32(buf1[k], buf1[k + 1]);
      buf0[k + 1] = _mm_sub_epi32(buf1[k], buf1[k + 1]);
      buf0[k + 2] = _mm_sub_epi32(buf1[k + 3], buf1[k + 2]);
      buf0[k + 3] = _mm_add_epi32(buf1[k + 3], buf1[k + 2]);
    }
    buf0[16] = buf1[16];
    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
                        buf0[30], bit);
    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
                        buf0[29], bit);
    buf0[19] = buf1[19];
    buf0[20] = buf1[20];
    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
                        buf0[26], bit);
    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
                        buf0[25], bit);
    buf0[23] = buf1[23];
    buf0[24] = buf1[24];
    buf0[27] = buf1[27];
    buf0[28] = buf1[28];
    buf0[31] = buf1[31];

    // stage 7
    bit = cos_bit[7];
    cospi = cospi_arr(bit);
    for (k = 0; k < 8; ++k) {
      buf1[k] = buf0[k];
    }
    btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
                        buf1[15], bit);
    btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
                        buf1[14], bit);
    btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
                        buf1[13], bit);
    btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
                        buf1[12], bit);
    // Groups of four across 16..31, same shape as the 8..15 groups in stage 6.
    for (k = 16; k < 32; k += 4) {
      buf1[k] = _mm_add_epi32(buf0[k], buf0[k + 1]);
      buf1[k + 1] = _mm_sub_epi32(buf0[k], buf0[k + 1]);
      buf1[k + 2] = _mm_sub_epi32(buf0[k + 3], buf0[k + 2]);
      buf1[k + 3] = _mm_add_epi32(buf0[k + 3], buf0[k + 2]);
    }

    // stage 8
    bit = cos_bit[8];
    cospi = cospi_arr(bit);
    for (k = 0; k < 16; ++k) {
      buf0[k] = buf1[k];
    }
    btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
                        buf0[31], bit);
    btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
                        buf0[30], bit);
    btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
                        buf0[29], bit);
    btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
                        buf0[28], bit);
    btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
                        buf0[27], bit);
    btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
                        buf0[26], bit);
    btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
                        buf0[25], bit);
    btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
                        buf0[24], bit);

    // stage 9: reorder into the final output order
    for (k = 0; k < 32; ++k) {
      buf1[k] = buf0[kFinalOrder[k]];
    }

    // scatter the finished column
    for (k = 0; k < 32; ++k) {
      output[k * col_num + col] = buf1[k];
    }
  }
}
356 :
// 4-point forward ADST butterfly network (per the function name) applied to
// vectors of packed 32-bit integers.
//
// With txfm_size = 4 and four lanes per vector there is a single column
// (col_num = 1); the loop over columns is kept for symmetry with the larger
// transforms in this file.
//
//   input       - four source vectors, indexed as [row * col_num + col]
//   output      - four destination vectors, same indexing
//   cos_bit     - per-stage value; entries 2 and 4 are read, each one selects
//                 the cosine table via cospi_arr() and is forwarded to
//                 btf_32_sse4_1_type0
//   stage_range - unused, kept only for signature compatibility
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range) {
  const int txfm_size = 4;
  const int num_per_128 = 4;
  const int col_num = txfm_size / num_per_128;
  const int32_t *cospi;
  __m128i buf0[4];
  __m128i buf1[4];
  int bit;
  int col;
  (void)stage_range;

  for (col = 0; col < col_num; col++) {
    // Minuend for the lane-wise negations in the last stage.
    const __m128i zero = _mm_set1_epi32(0);
    int k;

    // stage 0: gather the column
    for (k = 0; k < 4; ++k) {
      buf0[k] = input[k * col_num + col];
    }

    // stage 1: rotate by one position (3, 0, 1, 2)
    for (k = 0; k < 4; ++k) {
      buf1[k] = buf0[(k + 3) & 3];
    }

    // stage 2
    bit = cos_bit[2];
    cospi = cospi_arr(bit);
    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0],
                        buf0[1], bit);
    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
                        buf0[3], bit);

    // stage 3: sum/difference at distance 2
    for (k = 0; k < 2; ++k) {
      buf1[k] = _mm_add_epi32(buf0[k], buf0[k + 2]);
      buf1[k + 2] = _mm_sub_epi32(buf0[k], buf0[k + 2]);
    }

    // stage 4
    bit = cos_bit[4];
    cospi = cospi_arr(bit);
    buf0[0] = buf1[0];
    buf0[1] = buf1[1];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
                        buf0[3], bit);

    // stage 5: final ordering (0, -2, 3, -1)
    buf1[0] = buf0[0];
    buf1[1] = _mm_sub_epi32(zero, buf0[2]);
    buf1[2] = buf0[3];
    buf1[3] = _mm_sub_epi32(zero, buf0[1]);

    // scatter the finished column
    for (k = 0; k < 4; ++k) {
      output[k * col_num + col] = buf1[k];
    }
  }
}
420 :
// 32-point forward ADST butterfly network (per the function name) applied to
// vectors of packed 32-bit integers.
//
// The 32 x 8 array of __m128i in `input` is processed one column at a time
// (col_num = 32 / 4 = 8 columns). For each column the 32 vectors at
// input[j * col_num + col] are pushed through eleven stages and the result is
// stored at output[j * col_num + col].
//
//   input       - source vectors, indexed as [row * 8 + col]
//   output      - destination vectors, same indexing as input
//   cos_bit     - per-stage value; entries 2, 4, 6, 8 and 10 are read, each
//                 one selects the cosine table via cospi_arr() and is
//                 forwarded to btf_32_sse4_1_type0
//   stage_range - unused, kept only for signature compatibility
void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range) {
  // Final ordering: buf1[k] takes buf0[kFinalOrder[k]], negated for odd k.
  static const int kFinalOrder[32] = { 0, 16, 24, 8,  12, 28, 20, 4,
                                       6, 22, 30, 14, 10, 26, 18, 2,
                                       3, 19, 27, 11, 15, 31, 23, 7,
                                       5, 21, 29, 13, 9,  25, 17, 1 };
  const int txfm_size = 32;
  const int num_per_128 = 4;
  const int col_num = txfm_size / num_per_128;
  const int32_t *cospi;
  __m128i buf0[32];
  __m128i buf1[32];
  int bit;
  int col;
  (void)stage_range;

  for (col = 0; col < col_num; col++) {
    // Minuend for the lane-wise negations in the last stage.
    const __m128i zero = _mm_set1_epi32(0);
    int base;  // first index of the group being processed
    int k;     // running index

    // stage 0: gather the 32 vectors of this column
    for (k = 0; k < 32; ++k) {
      buf0[k] = input[k * col_num + col];
    }

    // stage 1: interleave odd inputs taken from the top (31, 29, ..., 1)
    // with even inputs taken from the bottom (0, 2, ..., 30)
    for (k = 0; k < 16; ++k) {
      buf1[2 * k] = buf0[31 - 2 * k];
      buf1[2 * k + 1] = buf0[2 * k];
    }

    // stage 2: rotate each adjacent pair; weights run
    // (1,63), (5,59), ..., (61,3)
    bit = cos_bit[2];
    cospi = cospi_arr(bit);
    for (k = 0; k < 16; ++k) {
      btf_32_sse4_1_type0(cospi[4 * k + 1], cospi[63 - 4 * k], buf1[2 * k],
                          buf1[2 * k + 1], buf0[2 * k], buf0[2 * k + 1], bit);
    }

    // stage 3: sum/difference at distance 16
    for (k = 0; k < 16; ++k) {
      buf1[k] = _mm_add_epi32(buf0[k], buf0[k + 16]);
      buf1[k + 16] = _mm_sub_epi32(buf0[k], buf0[k + 16]);
    }

    // stage 4: lower half passes through, upper half is rotated pairwise
    bit = cos_bit[4];
    cospi = cospi_arr(bit);
    for (k = 0; k < 16; ++k) {
      buf0[k] = buf1[k];
    }
    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
                        buf0[17], bit);
    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
                        buf0[19], bit);
    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
                        buf0[21], bit);
    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
                        buf0[23], bit);
    btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
                        buf0[25], bit);
    btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
                        buf0[27], bit);
    btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
                        buf0[29], bit);
    btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
                        buf0[31], bit);

    // stage 5: sum/difference at distance 8 within each group of 16
    for (base = 0; base < 32; base += 16) {
      for (k = base; k < base + 8; ++k) {
        buf1[k] = _mm_add_epi32(buf0[k], buf0[k + 8]);
        buf1[k + 8] = _mm_sub_epi32(buf0[k], buf0[k + 8]);
      }
    }

    // stage 6: in each group of 16, the first 8 pass through and the last 8
    // are rotated pairwise
    bit = cos_bit[6];
    cospi = cospi_arr(bit);
    for (base = 0; base < 32; base += 16) {
      for (k = base; k < base + 8; ++k) {
        buf0[k] = buf1[k];
      }
      btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[base + 8], buf1[base + 9],
                          buf0[base + 8], buf0[base + 9], bit);
      btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[base + 10],
                          buf1[base + 11], buf0[base + 10], buf0[base + 11],
                          bit);
      btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[base + 12],
                          buf1[base + 13], buf0[base + 12], buf0[base + 13],
                          bit);
      btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[base + 14],
                          buf1[base + 15], buf0[base + 14], buf0[base + 15],
                          bit);
    }

    // stage 7: sum/difference at distance 4 within each group of 8
    for (base = 0; base < 32; base += 8) {
      for (k = base; k < base + 4; ++k) {
        buf1[k] = _mm_add_epi32(buf0[k], buf0[k + 4]);
        buf1[k + 4] = _mm_sub_epi32(buf0[k], buf0[k + 4]);
      }
    }

    // stage 8: in each group of 8, the first 4 pass through and the last 4
    // are rotated pairwise
    bit = cos_bit[8];
    cospi = cospi_arr(bit);
    for (base = 0; base < 32; base += 8) {
      for (k = base; k < base + 4; ++k) {
        buf0[k] = buf1[k];
      }
      btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[base + 4], buf1[base + 5],
                          buf0[base + 4], buf0[base + 5], bit);
      btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[base + 6],
                          buf1[base + 7], buf0[base + 6], buf0[base + 7], bit);
    }

    // stage 9: sum/difference at distance 2 within each group of 4
    for (base = 0; base < 32; base += 4) {
      for (k = base; k < base + 2; ++k) {
        buf1[k] = _mm_add_epi32(buf0[k], buf0[k + 2]);
        buf1[k + 2] = _mm_sub_epi32(buf0[k], buf0[k + 2]);
      }
    }

    // stage 10: in each group of 4, the first 2 pass through and the last 2
    // are rotated
    bit = cos_bit[10];
    cospi = cospi_arr(bit);
    for (base = 0; base < 32; base += 4) {
      buf0[base] = buf1[base];
      buf0[base + 1] = buf1[base + 1];
      btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[base + 2], buf1[base + 3],
                          buf0[base + 2], buf0[base + 3], bit);
    }

    // stage 11: final ordering; every odd output slot is negated
    for (k = 0; k < 32; k += 2) {
      buf1[k] = buf0[kFinalOrder[k]];
      buf1[k + 1] = _mm_sub_epi32(zero, buf0[kFinalOrder[k + 1]]);
    }

    // scatter the finished column
    for (k = 0; k < 32; ++k) {
      output[k * col_num + col] = buf1[k];
    }
  }
}
|