Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <assert.h>
13 : #include <math.h>
14 :
15 : #include "./aom_config.h"
16 : #include "./aom_dsp_rtcd.h"
17 : #include "./av1_rtcd.h"
18 : #include "aom_dsp/fwd_txfm.h"
19 : #include "aom_ports/mem.h"
20 : #include "av1/common/blockd.h"
21 : #include "av1/common/av1_fwd_txfm1d.h"
22 : #include "av1/common/av1_fwd_txfm1d_cfg.h"
23 : #include "av1/common/idct.h"
24 :
25 0 : static INLINE void range_check(const tran_low_t *input, const int size,
26 : const int bit) {
27 : #if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING
28 : // TODO(angiebird): the range_check is not used because the bit range
29 : // in fdct# is not correct. Since we are going to merge in a new version
30 : // of fdct# from nextgenv2, we won't fix the incorrect bit range now.
31 : int i;
32 : for (i = 0; i < size; ++i) {
33 : assert(abs(input[i]) < (1 << bit));
34 : }
35 : #else
36 : (void)input;
37 : (void)size;
38 : (void)bit;
39 : #endif
40 0 : }
41 :
42 0 : static void fdct4(const tran_low_t *input, tran_low_t *output) {
43 : tran_high_t temp;
44 : tran_low_t step[4];
45 :
46 : // stage 0
47 0 : range_check(input, 4, 14);
48 :
49 : // stage 1
50 0 : output[0] = input[0] + input[3];
51 0 : output[1] = input[1] + input[2];
52 0 : output[2] = input[1] - input[2];
53 0 : output[3] = input[0] - input[3];
54 :
55 0 : range_check(output, 4, 15);
56 :
57 : // stage 2
58 0 : temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
59 0 : step[0] = (tran_low_t)fdct_round_shift(temp);
60 0 : temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
61 0 : step[1] = (tran_low_t)fdct_round_shift(temp);
62 0 : temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
63 0 : step[2] = (tran_low_t)fdct_round_shift(temp);
64 0 : temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
65 0 : step[3] = (tran_low_t)fdct_round_shift(temp);
66 :
67 0 : range_check(step, 4, 16);
68 :
69 : // stage 3
70 0 : output[0] = step[0];
71 0 : output[1] = step[2];
72 0 : output[2] = step[1];
73 0 : output[3] = step[3];
74 :
75 0 : range_check(output, 4, 16);
76 0 : }
77 :
78 0 : static void fdct8(const tran_low_t *input, tran_low_t *output) {
79 : tran_high_t temp;
80 : tran_low_t step[8];
81 :
82 : // stage 0
83 0 : range_check(input, 8, 13);
84 :
85 : // stage 1
86 0 : output[0] = input[0] + input[7];
87 0 : output[1] = input[1] + input[6];
88 0 : output[2] = input[2] + input[5];
89 0 : output[3] = input[3] + input[4];
90 0 : output[4] = input[3] - input[4];
91 0 : output[5] = input[2] - input[5];
92 0 : output[6] = input[1] - input[6];
93 0 : output[7] = input[0] - input[7];
94 :
95 0 : range_check(output, 8, 14);
96 :
97 : // stage 2
98 0 : step[0] = output[0] + output[3];
99 0 : step[1] = output[1] + output[2];
100 0 : step[2] = output[1] - output[2];
101 0 : step[3] = output[0] - output[3];
102 0 : step[4] = output[4];
103 0 : temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
104 0 : step[5] = (tran_low_t)fdct_round_shift(temp);
105 0 : temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
106 0 : step[6] = (tran_low_t)fdct_round_shift(temp);
107 0 : step[7] = output[7];
108 :
109 0 : range_check(step, 8, 15);
110 :
111 : // stage 3
112 0 : temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
113 0 : output[0] = (tran_low_t)fdct_round_shift(temp);
114 0 : temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
115 0 : output[1] = (tran_low_t)fdct_round_shift(temp);
116 0 : temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
117 0 : output[2] = (tran_low_t)fdct_round_shift(temp);
118 0 : temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
119 0 : output[3] = (tran_low_t)fdct_round_shift(temp);
120 0 : output[4] = step[4] + step[5];
121 0 : output[5] = step[4] - step[5];
122 0 : output[6] = step[7] - step[6];
123 0 : output[7] = step[7] + step[6];
124 :
125 0 : range_check(output, 8, 16);
126 :
127 : // stage 4
128 0 : step[0] = output[0];
129 0 : step[1] = output[1];
130 0 : step[2] = output[2];
131 0 : step[3] = output[3];
132 0 : temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
133 0 : step[4] = (tran_low_t)fdct_round_shift(temp);
134 0 : temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
135 0 : step[5] = (tran_low_t)fdct_round_shift(temp);
136 0 : temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
137 0 : step[6] = (tran_low_t)fdct_round_shift(temp);
138 0 : temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
139 0 : step[7] = (tran_low_t)fdct_round_shift(temp);
140 :
141 0 : range_check(step, 8, 16);
142 :
143 : // stage 5
144 0 : output[0] = step[0];
145 0 : output[1] = step[4];
146 0 : output[2] = step[2];
147 0 : output[3] = step[6];
148 0 : output[4] = step[1];
149 0 : output[5] = step[5];
150 0 : output[6] = step[3];
151 0 : output[7] = step[7];
152 :
153 0 : range_check(output, 8, 16);
154 0 : }
155 :
/*
 * 16-point forward transform (a DCT, going by the function name).
 *
 * The work is split into seven stages that alternate between output[] and
 * the local step[] buffer: each stage reads one of them and writes the other.
 * input[] is read only in stages 0 and 1.  Products are accumulated in a
 * tran_high_t temporary and passed through fdct_round_shift() before being
 * stored back as tran_low_t.
 *
 * input  - 16 coefficients to transform.
 * output - receives the 16 results; also used as intermediate storage.
 *
 * NOTE: range_check() is compiled out in this file (#if 0), so the bit widths
 * passed to it are informational only.
 */
156 0 : static void fdct16(const tran_low_t *input, tran_low_t *output) {
157 : tran_high_t temp;
158 : tran_low_t step[16];
159 :
160 : // stage 0
161 0 : range_check(input, 16, 13);
162 :
163 : // stage 1: fold input[i] with input[15 - i] (sums in 0..7, differences in 8..15)
164 0 : output[0] = input[0] + input[15];
165 0 : output[1] = input[1] + input[14];
166 0 : output[2] = input[2] + input[13];
167 0 : output[3] = input[3] + input[12];
168 0 : output[4] = input[4] + input[11];
169 0 : output[5] = input[5] + input[10];
170 0 : output[6] = input[6] + input[9];
171 0 : output[7] = input[7] + input[8];
172 0 : output[8] = input[7] - input[8];
173 0 : output[9] = input[6] - input[9];
174 0 : output[10] = input[5] - input[10];
175 0 : output[11] = input[4] - input[11];
176 0 : output[12] = input[3] - input[12];
177 0 : output[13] = input[2] - input[13];
178 0 : output[14] = input[1] - input[14];
179 0 : output[15] = input[0] - input[15];
180 :
181 0 : range_check(output, 16, 14);
182 :
183 : // stage 2: fold 0..7; copy 8, 9, 14, 15; rotate 10..13 with cospi_16_64
184 0 : step[0] = output[0] + output[7];
185 0 : step[1] = output[1] + output[6];
186 0 : step[2] = output[2] + output[5];
187 0 : step[3] = output[3] + output[4];
188 0 : step[4] = output[3] - output[4];
189 0 : step[5] = output[2] - output[5];
190 0 : step[6] = output[1] - output[6];
191 0 : step[7] = output[0] - output[7];
192 0 : step[8] = output[8];
193 0 : step[9] = output[9];
194 0 : temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
195 0 : step[10] = (tran_low_t)fdct_round_shift(temp);
196 0 : temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
197 0 : step[11] = (tran_low_t)fdct_round_shift(temp);
198 0 : temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
199 0 : step[12] = (tran_low_t)fdct_round_shift(temp);
200 0 : temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
201 0 : step[13] = (tran_low_t)fdct_round_shift(temp);
202 0 : step[14] = output[14];
203 0 : step[15] = output[15];
204 :
205 0 : range_check(step, 16, 15);
206 :
207 : // stage 3: fold 0..3; copy 4 and 7; rotate 5 and 6; add/subtract within 8..15
208 0 : output[0] = step[0] + step[3];
209 0 : output[1] = step[1] + step[2];
210 0 : output[2] = step[1] - step[2];
211 0 : output[3] = step[0] - step[3];
212 0 : output[4] = step[4];
213 0 : temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
214 0 : output[5] = (tran_low_t)fdct_round_shift(temp);
215 0 : temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
216 0 : output[6] = (tran_low_t)fdct_round_shift(temp);
217 0 : output[7] = step[7];
218 0 : output[8] = step[8] + step[11];
219 0 : output[9] = step[9] + step[10];
220 0 : output[10] = step[9] - step[10];
221 0 : output[11] = step[8] - step[11];
222 0 : output[12] = step[15] - step[12];
223 0 : output[13] = step[14] - step[13];
224 0 : output[14] = step[14] + step[13];
225 0 : output[15] = step[15] + step[12];
226 :
227 0 : range_check(output, 16, 16);
228 :
229 : // stage 4: rotate 0..3; add/subtract 4..7; rotate 9, 10, 13, 14; copy 8, 11, 12, 15
230 0 : temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
231 0 : step[0] = (tran_low_t)fdct_round_shift(temp);
232 0 : temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
233 0 : step[1] = (tran_low_t)fdct_round_shift(temp);
234 0 : temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
235 0 : step[2] = (tran_low_t)fdct_round_shift(temp);
236 0 : temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
237 0 : step[3] = (tran_low_t)fdct_round_shift(temp);
238 0 : step[4] = output[4] + output[5];
239 0 : step[5] = output[4] - output[5];
240 0 : step[6] = output[7] - output[6];
241 0 : step[7] = output[7] + output[6];
242 0 : step[8] = output[8];
243 0 : temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
244 0 : step[9] = (tran_low_t)fdct_round_shift(temp);
245 0 : temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
246 0 : step[10] = (tran_low_t)fdct_round_shift(temp);
247 0 : step[11] = output[11];
248 0 : step[12] = output[12];
249 0 : temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
250 0 : step[13] = (tran_low_t)fdct_round_shift(temp);
251 0 : temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
252 0 : step[14] = (tran_low_t)fdct_round_shift(temp);
253 0 : step[15] = output[15];
254 :
255 0 : range_check(step, 16, 16);
256 :
257 : // stage 5: copy 0..3; rotate 4..7; add/subtract within 8..15
258 0 : output[0] = step[0];
259 0 : output[1] = step[1];
260 0 : output[2] = step[2];
261 0 : output[3] = step[3];
262 0 : temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
263 0 : output[4] = (tran_low_t)fdct_round_shift(temp);
264 0 : temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
265 0 : output[5] = (tran_low_t)fdct_round_shift(temp);
266 0 : temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
267 0 : output[6] = (tran_low_t)fdct_round_shift(temp);
268 0 : temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
269 0 : output[7] = (tran_low_t)fdct_round_shift(temp);
270 0 : output[8] = step[8] + step[9];
271 0 : output[9] = step[8] - step[9];
272 0 : output[10] = step[11] - step[10];
273 0 : output[11] = step[11] + step[10];
274 0 : output[12] = step[12] + step[13];
275 0 : output[13] = step[12] - step[13];
276 0 : output[14] = step[15] - step[14];
277 0 : output[15] = step[15] + step[14];
278 :
279 0 : range_check(output, 16, 16);
280 :
281 : // stage 6: copy 0..7; rotate 8..15 (pairs 8/15, 9/14, 10/13, 11/12)
282 0 : step[0] = output[0];
283 0 : step[1] = output[1];
284 0 : step[2] = output[2];
285 0 : step[3] = output[3];
286 0 : step[4] = output[4];
287 0 : step[5] = output[5];
288 0 : step[6] = output[6];
289 0 : step[7] = output[7];
290 0 : temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
291 0 : step[8] = (tran_low_t)fdct_round_shift(temp);
292 0 : temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
293 0 : step[9] = (tran_low_t)fdct_round_shift(temp);
294 0 : temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
295 0 : step[10] = (tran_low_t)fdct_round_shift(temp);
296 0 : temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
297 0 : step[11] = (tran_low_t)fdct_round_shift(temp);
298 0 : temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
299 0 : step[12] = (tran_low_t)fdct_round_shift(temp);
300 0 : temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
301 0 : step[13] = (tran_low_t)fdct_round_shift(temp);
302 0 : temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
303 0 : step[14] = (tran_low_t)fdct_round_shift(temp);
304 0 : temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
305 0 : step[15] = (tran_low_t)fdct_round_shift(temp);
306 :
307 0 : range_check(step, 16, 16);
308 :
309 : // stage 7: permute; output[i] takes step[] at the 4-bit bit-reversed index of i
310 0 : output[0] = step[0];
311 0 : output[1] = step[8];
312 0 : output[2] = step[4];
313 0 : output[3] = step[12];
314 0 : output[4] = step[2];
315 0 : output[5] = step[10];
316 0 : output[6] = step[6];
317 0 : output[7] = step[14];
318 0 : output[8] = step[1];
319 0 : output[9] = step[9];
320 0 : output[10] = step[5];
321 0 : output[11] = step[13];
322 0 : output[12] = step[3];
323 0 : output[13] = step[11];
324 0 : output[14] = step[7];
325 0 : output[15] = step[15];
326 :
327 0 : range_check(output, 16, 16);
328 0 : }
329 :
/*
 * 32-point forward transform (a DCT, going by the function name).
 *
 * The work is split into nine stages that alternate between output[] and
 * the local step[] buffer: each stage reads one of them and writes the other.
 * input[] is read only in stages 0 and 1.  Products are accumulated in a
 * tran_high_t temporary and passed through fdct_round_shift() before being
 * stored back as tran_low_t.
 *
 * input  - 32 coefficients to transform.
 * output - receives the 32 results; also used as intermediate storage.
 *
 * NOTE: range_check() is compiled out in this file (#if 0), so the bit widths
 * passed to it are informational only.
 */
330 0 : static void fdct32(const tran_low_t *input, tran_low_t *output) {
331 : tran_high_t temp;
332 : tran_low_t step[32];
333 :
334 : // stage 0
335 0 : range_check(input, 32, 14);
336 :
337 : // stage 1: fold input[i] with input[31 - i] (sums in 0..15, differences in 16..31)
338 0 : output[0] = input[0] + input[31];
339 0 : output[1] = input[1] + input[30];
340 0 : output[2] = input[2] + input[29];
341 0 : output[3] = input[3] + input[28];
342 0 : output[4] = input[4] + input[27];
343 0 : output[5] = input[5] + input[26];
344 0 : output[6] = input[6] + input[25];
345 0 : output[7] = input[7] + input[24];
346 0 : output[8] = input[8] + input[23];
347 0 : output[9] = input[9] + input[22];
348 0 : output[10] = input[10] + input[21];
349 0 : output[11] = input[11] + input[20];
350 0 : output[12] = input[12] + input[19];
351 0 : output[13] = input[13] + input[18];
352 0 : output[14] = input[14] + input[17];
353 0 : output[15] = input[15] + input[16];
354 0 : output[16] = input[15] - input[16];
355 0 : output[17] = input[14] - input[17];
356 0 : output[18] = input[13] - input[18];
357 0 : output[19] = input[12] - input[19];
358 0 : output[20] = input[11] - input[20];
359 0 : output[21] = input[10] - input[21];
360 0 : output[22] = input[9] - input[22];
361 0 : output[23] = input[8] - input[23];
362 0 : output[24] = input[7] - input[24];
363 0 : output[25] = input[6] - input[25];
364 0 : output[26] = input[5] - input[26];
365 0 : output[27] = input[4] - input[27];
366 0 : output[28] = input[3] - input[28];
367 0 : output[29] = input[2] - input[29];
368 0 : output[30] = input[1] - input[30];
369 0 : output[31] = input[0] - input[31];
370 :
371 0 : range_check(output, 32, 15);
372 :
373 : // stage 2: fold 0..15; copy 16..19 and 28..31; rotate 20..27 with cospi_16_64
374 0 : step[0] = output[0] + output[15];
375 0 : step[1] = output[1] + output[14];
376 0 : step[2] = output[2] + output[13];
377 0 : step[3] = output[3] + output[12];
378 0 : step[4] = output[4] + output[11];
379 0 : step[5] = output[5] + output[10];
380 0 : step[6] = output[6] + output[9];
381 0 : step[7] = output[7] + output[8];
382 0 : step[8] = output[7] - output[8];
383 0 : step[9] = output[6] - output[9];
384 0 : step[10] = output[5] - output[10];
385 0 : step[11] = output[4] - output[11];
386 0 : step[12] = output[3] - output[12];
387 0 : step[13] = output[2] - output[13];
388 0 : step[14] = output[1] - output[14];
389 0 : step[15] = output[0] - output[15];
390 0 : step[16] = output[16];
391 0 : step[17] = output[17];
392 0 : step[18] = output[18];
393 0 : step[19] = output[19];
394 0 : temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
395 0 : step[20] = (tran_low_t)fdct_round_shift(temp);
396 0 : temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
397 0 : step[21] = (tran_low_t)fdct_round_shift(temp);
398 0 : temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
399 0 : step[22] = (tran_low_t)fdct_round_shift(temp);
400 0 : temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
401 0 : step[23] = (tran_low_t)fdct_round_shift(temp);
402 0 : temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
403 0 : step[24] = (tran_low_t)fdct_round_shift(temp);
404 0 : temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
405 0 : step[25] = (tran_low_t)fdct_round_shift(temp);
406 0 : temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
407 0 : step[26] = (tran_low_t)fdct_round_shift(temp);
408 0 : temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
409 0 : step[27] = (tran_low_t)fdct_round_shift(temp);
410 0 : step[28] = output[28];
411 0 : step[29] = output[29];
412 0 : step[30] = output[30];
413 0 : step[31] = output[31];
414 :
415 0 : range_check(step, 32, 16);
416 :
417 : // stage 3: fold 0..7; copy 8, 9, 14, 15; rotate 10..13; add/subtract within 16..31
418 0 : output[0] = step[0] + step[7];
419 0 : output[1] = step[1] + step[6];
420 0 : output[2] = step[2] + step[5];
421 0 : output[3] = step[3] + step[4];
422 0 : output[4] = step[3] - step[4];
423 0 : output[5] = step[2] - step[5];
424 0 : output[6] = step[1] - step[6];
425 0 : output[7] = step[0] - step[7];
426 0 : output[8] = step[8];
427 0 : output[9] = step[9];
428 0 : temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
429 0 : output[10] = (tran_low_t)fdct_round_shift(temp);
430 0 : temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
431 0 : output[11] = (tran_low_t)fdct_round_shift(temp);
432 0 : temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
433 0 : output[12] = (tran_low_t)fdct_round_shift(temp);
434 0 : temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
435 0 : output[13] = (tran_low_t)fdct_round_shift(temp);
436 0 : output[14] = step[14];
437 0 : output[15] = step[15];
438 0 : output[16] = step[16] + step[23];
439 0 : output[17] = step[17] + step[22];
440 0 : output[18] = step[18] + step[21];
441 0 : output[19] = step[19] + step[20];
442 0 : output[20] = step[19] - step[20];
443 0 : output[21] = step[18] - step[21];
444 0 : output[22] = step[17] - step[22];
445 0 : output[23] = step[16] - step[23];
446 0 : output[24] = step[31] - step[24];
447 0 : output[25] = step[30] - step[25];
448 0 : output[26] = step[29] - step[26];
449 0 : output[27] = step[28] - step[27];
450 0 : output[28] = step[28] + step[27];
451 0 : output[29] = step[29] + step[26];
452 0 : output[30] = step[30] + step[25];
453 0 : output[31] = step[31] + step[24];
454 :
455 0 : range_check(output, 32, 17);
456 :
457 : // stage 4: fold 0..3; rotate 5 and 6; add/subtract within 8..15;
// rotate 18..21 and 26..29 with cospi_8_64 / cospi_24_64; copy the rest
458 0 : step[0] = output[0] + output[3];
459 0 : step[1] = output[1] + output[2];
460 0 : step[2] = output[1] - output[2];
461 0 : step[3] = output[0] - output[3];
462 0 : step[4] = output[4];
463 0 : temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
464 0 : step[5] = (tran_low_t)fdct_round_shift(temp);
465 0 : temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
466 0 : step[6] = (tran_low_t)fdct_round_shift(temp);
467 0 : step[7] = output[7];
468 0 : step[8] = output[8] + output[11];
469 0 : step[9] = output[9] + output[10];
470 0 : step[10] = output[9] - output[10];
471 0 : step[11] = output[8] - output[11];
472 0 : step[12] = output[15] - output[12];
473 0 : step[13] = output[14] - output[13];
474 0 : step[14] = output[14] + output[13];
475 0 : step[15] = output[15] + output[12];
476 0 : step[16] = output[16];
477 0 : step[17] = output[17];
478 0 : temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
479 0 : step[18] = (tran_low_t)fdct_round_shift(temp);
480 0 : temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
481 0 : step[19] = (tran_low_t)fdct_round_shift(temp);
482 0 : temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
483 0 : step[20] = (tran_low_t)fdct_round_shift(temp);
484 0 : temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
485 0 : step[21] = (tran_low_t)fdct_round_shift(temp);
486 0 : step[22] = output[22];
487 0 : step[23] = output[23];
488 0 : step[24] = output[24];
489 0 : step[25] = output[25];
490 0 : temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
491 0 : step[26] = (tran_low_t)fdct_round_shift(temp);
492 0 : temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
493 0 : step[27] = (tran_low_t)fdct_round_shift(temp);
494 0 : temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
495 0 : step[28] = (tran_low_t)fdct_round_shift(temp);
496 0 : temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
497 0 : step[29] = (tran_low_t)fdct_round_shift(temp);
498 0 : step[30] = output[30];
499 0 : step[31] = output[31];
500 :
501 0 : range_check(step, 32, 18);
502 :
503 : // stage 5: rotate 0..3; add/subtract 4..7; rotate 9, 10, 13, 14;
// copy 8, 11, 12, 15; add/subtract within 16..31
504 0 : temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
505 0 : output[0] = (tran_low_t)fdct_round_shift(temp);
506 0 : temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
507 0 : output[1] = (tran_low_t)fdct_round_shift(temp);
508 0 : temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
509 0 : output[2] = (tran_low_t)fdct_round_shift(temp);
510 0 : temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
511 0 : output[3] = (tran_low_t)fdct_round_shift(temp);
512 0 : output[4] = step[4] + step[5];
513 0 : output[5] = step[4] - step[5];
514 0 : output[6] = step[7] - step[6];
515 0 : output[7] = step[7] + step[6];
516 0 : output[8] = step[8];
517 0 : temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
518 0 : output[9] = (tran_low_t)fdct_round_shift(temp);
519 0 : temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
520 0 : output[10] = (tran_low_t)fdct_round_shift(temp);
521 0 : output[11] = step[11];
522 0 : output[12] = step[12];
523 0 : temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
524 0 : output[13] = (tran_low_t)fdct_round_shift(temp);
525 0 : temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
526 0 : output[14] = (tran_low_t)fdct_round_shift(temp);
527 0 : output[15] = step[15];
528 0 : output[16] = step[16] + step[19];
529 0 : output[17] = step[17] + step[18];
530 0 : output[18] = step[17] - step[18];
531 0 : output[19] = step[16] - step[19];
532 0 : output[20] = step[23] - step[20];
533 0 : output[21] = step[22] - step[21];
534 0 : output[22] = step[22] + step[21];
535 0 : output[23] = step[23] + step[20];
536 0 : output[24] = step[24] + step[27];
537 0 : output[25] = step[25] + step[26];
538 0 : output[26] = step[25] - step[26];
539 0 : output[27] = step[24] - step[27];
540 0 : output[28] = step[31] - step[28];
541 0 : output[29] = step[30] - step[29];
542 0 : output[30] = step[30] + step[29];
543 0 : output[31] = step[31] + step[28];
544 :
545 0 : range_check(output, 32, 18);
546 :
547 : // stage 6: copy 0..3; rotate 4..7; add/subtract within 8..15;
// rotate 17, 18, 21, 22, 25, 26, 29, 30; copy the rest of 16..31
548 0 : step[0] = output[0];
549 0 : step[1] = output[1];
550 0 : step[2] = output[2];
551 0 : step[3] = output[3];
552 0 : temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
553 0 : step[4] = (tran_low_t)fdct_round_shift(temp);
554 0 : temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
555 0 : step[5] = (tran_low_t)fdct_round_shift(temp);
556 0 : temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
557 0 : step[6] = (tran_low_t)fdct_round_shift(temp);
558 0 : temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
559 0 : step[7] = (tran_low_t)fdct_round_shift(temp);
560 0 : step[8] = output[8] + output[9];
561 0 : step[9] = output[8] - output[9];
562 0 : step[10] = output[11] - output[10];
563 0 : step[11] = output[11] + output[10];
564 0 : step[12] = output[12] + output[13];
565 0 : step[13] = output[12] - output[13];
566 0 : step[14] = output[15] - output[14];
567 0 : step[15] = output[15] + output[14];
568 0 : step[16] = output[16];
569 0 : temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
570 0 : step[17] = (tran_low_t)fdct_round_shift(temp);
571 0 : temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
572 0 : step[18] = (tran_low_t)fdct_round_shift(temp);
573 0 : step[19] = output[19];
574 0 : step[20] = output[20];
575 0 : temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
576 0 : step[21] = (tran_low_t)fdct_round_shift(temp);
577 0 : temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
578 0 : step[22] = (tran_low_t)fdct_round_shift(temp);
579 0 : step[23] = output[23];
580 0 : step[24] = output[24];
581 0 : temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
582 0 : step[25] = (tran_low_t)fdct_round_shift(temp);
583 0 : temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
584 0 : step[26] = (tran_low_t)fdct_round_shift(temp);
585 0 : step[27] = output[27];
586 0 : step[28] = output[28];
587 0 : temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
588 0 : step[29] = (tran_low_t)fdct_round_shift(temp);
589 0 : temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
590 0 : step[30] = (tran_low_t)fdct_round_shift(temp);
591 0 : step[31] = output[31];
592 :
593 0 : range_check(step, 32, 18);
594 :
595 : // stage 7: copy 0..7; rotate 8..15 (pairs 8/15, 9/14, 10/13, 11/12);
// add/subtract within 16..31
596 0 : output[0] = step[0];
597 0 : output[1] = step[1];
598 0 : output[2] = step[2];
599 0 : output[3] = step[3];
600 0 : output[4] = step[4];
601 0 : output[5] = step[5];
602 0 : output[6] = step[6];
603 0 : output[7] = step[7];
604 0 : temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
605 0 : output[8] = (tran_low_t)fdct_round_shift(temp);
606 0 : temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
607 0 : output[9] = (tran_low_t)fdct_round_shift(temp);
608 0 : temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
609 0 : output[10] = (tran_low_t)fdct_round_shift(temp);
610 0 : temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
611 0 : output[11] = (tran_low_t)fdct_round_shift(temp);
612 0 : temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
613 0 : output[12] = (tran_low_t)fdct_round_shift(temp);
614 0 : temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
615 0 : output[13] = (tran_low_t)fdct_round_shift(temp);
616 0 : temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
617 0 : output[14] = (tran_low_t)fdct_round_shift(temp);
618 0 : temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
619 0 : output[15] = (tran_low_t)fdct_round_shift(temp);
620 0 : output[16] = step[16] + step[17];
621 0 : output[17] = step[16] - step[17];
622 0 : output[18] = step[19] - step[18];
623 0 : output[19] = step[19] + step[18];
624 0 : output[20] = step[20] + step[21];
625 0 : output[21] = step[20] - step[21];
626 0 : output[22] = step[23] - step[22];
627 0 : output[23] = step[23] + step[22];
628 0 : output[24] = step[24] + step[25];
629 0 : output[25] = step[24] - step[25];
630 0 : output[26] = step[27] - step[26];
631 0 : output[27] = step[27] + step[26];
632 0 : output[28] = step[28] + step[29];
633 0 : output[29] = step[28] - step[29];
634 0 : output[30] = step[31] - step[30];
635 0 : output[31] = step[31] + step[30];
636 :
637 0 : range_check(output, 32, 18);
638 :
639 : // stage 8: copy 0..15; rotate 16..31, pairing output[k] with output[47 - k]
640 0 : step[0] = output[0];
641 0 : step[1] = output[1];
642 0 : step[2] = output[2];
643 0 : step[3] = output[3];
644 0 : step[4] = output[4];
645 0 : step[5] = output[5];
646 0 : step[6] = output[6];
647 0 : step[7] = output[7];
648 0 : step[8] = output[8];
649 0 : step[9] = output[9];
650 0 : step[10] = output[10];
651 0 : step[11] = output[11];
652 0 : step[12] = output[12];
653 0 : step[13] = output[13];
654 0 : step[14] = output[14];
655 0 : step[15] = output[15];
656 0 : temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
657 0 : step[16] = (tran_low_t)fdct_round_shift(temp);
658 0 : temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
659 0 : step[17] = (tran_low_t)fdct_round_shift(temp);
660 0 : temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
661 0 : step[18] = (tran_low_t)fdct_round_shift(temp);
662 0 : temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
663 0 : step[19] = (tran_low_t)fdct_round_shift(temp);
664 0 : temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
665 0 : step[20] = (tran_low_t)fdct_round_shift(temp);
666 0 : temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
667 0 : step[21] = (tran_low_t)fdct_round_shift(temp);
668 0 : temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
669 0 : step[22] = (tran_low_t)fdct_round_shift(temp);
670 0 : temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
671 0 : step[23] = (tran_low_t)fdct_round_shift(temp);
672 0 : temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
673 0 : step[24] = (tran_low_t)fdct_round_shift(temp);
674 0 : temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
675 0 : step[25] = (tran_low_t)fdct_round_shift(temp);
676 0 : temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
677 0 : step[26] = (tran_low_t)fdct_round_shift(temp);
678 0 : temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
679 0 : step[27] = (tran_low_t)fdct_round_shift(temp);
680 0 : temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
681 0 : step[28] = (tran_low_t)fdct_round_shift(temp);
682 0 : temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
683 0 : step[29] = (tran_low_t)fdct_round_shift(temp);
684 0 : temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
685 0 : step[30] = (tran_low_t)fdct_round_shift(temp);
686 0 : temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
687 0 : step[31] = (tran_low_t)fdct_round_shift(temp);
688 :
689 0 : range_check(step, 32, 18);
690 :
691 : // stage 9: permute; output[i] takes step[] at the 5-bit bit-reversed index of i
692 0 : output[0] = step[0];
693 0 : output[1] = step[16];
694 0 : output[2] = step[8];
695 0 : output[3] = step[24];
696 0 : output[4] = step[4];
697 0 : output[5] = step[20];
698 0 : output[6] = step[12];
699 0 : output[7] = step[28];
700 0 : output[8] = step[2];
701 0 : output[9] = step[18];
702 0 : output[10] = step[10];
703 0 : output[11] = step[26];
704 0 : output[12] = step[6];
705 0 : output[13] = step[22];
706 0 : output[14] = step[14];
707 0 : output[15] = step[30];
708 0 : output[16] = step[1];
709 0 : output[17] = step[17];
710 0 : output[18] = step[9];
711 0 : output[19] = step[25];
712 0 : output[20] = step[5];
713 0 : output[21] = step[21];
714 0 : output[22] = step[13];
715 0 : output[23] = step[29];
716 0 : output[24] = step[3];
717 0 : output[25] = step[19];
718 0 : output[26] = step[11];
719 0 : output[27] = step[27];
720 0 : output[28] = step[7];
721 0 : output[29] = step[23];
722 0 : output[30] = step[15];
723 0 : output[31] = step[31];
724 :
725 0 : range_check(output, 32, 18);
726 0 : }
727 :
728 : #ifndef AV1_DCT_GTEST
729 :
730 0 : static void fadst4(const tran_low_t *input, tran_low_t *output) {
731 : tran_high_t x0, x1, x2, x3;
732 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
733 :
734 0 : x0 = input[0];
735 0 : x1 = input[1];
736 0 : x2 = input[2];
737 0 : x3 = input[3];
738 :
739 0 : if (!(x0 | x1 | x2 | x3)) {
740 0 : output[0] = output[1] = output[2] = output[3] = 0;
741 0 : return;
742 : }
743 :
744 0 : s0 = sinpi_1_9 * x0;
745 0 : s1 = sinpi_4_9 * x0;
746 0 : s2 = sinpi_2_9 * x1;
747 0 : s3 = sinpi_1_9 * x1;
748 0 : s4 = sinpi_3_9 * x2;
749 0 : s5 = sinpi_4_9 * x3;
750 0 : s6 = sinpi_2_9 * x3;
751 0 : s7 = x0 + x1 - x3;
752 :
753 0 : x0 = s0 + s2 + s5;
754 0 : x1 = sinpi_3_9 * s7;
755 0 : x2 = s1 - s3 + s6;
756 0 : x3 = s4;
757 :
758 0 : s0 = x0 + x3;
759 0 : s1 = x1;
760 0 : s2 = x2 - x3;
761 0 : s3 = x2 - x0 + x3;
762 :
763 : // 1-D transform scaling factor is sqrt(2).
764 0 : output[0] = (tran_low_t)fdct_round_shift(s0);
765 0 : output[1] = (tran_low_t)fdct_round_shift(s1);
766 0 : output[2] = (tran_low_t)fdct_round_shift(s2);
767 0 : output[3] = (tran_low_t)fdct_round_shift(s3);
768 : }
769 :
770 0 : static void fadst8(const tran_low_t *input, tran_low_t *output) {
771 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
772 :
773 0 : tran_high_t x0 = input[7];
774 0 : tran_high_t x1 = input[0];
775 0 : tran_high_t x2 = input[5];
776 0 : tran_high_t x3 = input[2];
777 0 : tran_high_t x4 = input[3];
778 0 : tran_high_t x5 = input[4];
779 0 : tran_high_t x6 = input[1];
780 0 : tran_high_t x7 = input[6];
781 :
782 : // stage 1
783 0 : s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
784 0 : s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
785 0 : s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
786 0 : s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
787 0 : s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
788 0 : s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
789 0 : s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
790 0 : s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
791 :
792 0 : x0 = s0 + s4;
793 0 : x1 = s1 + s5;
794 0 : x2 = s2 + s6;
795 0 : x3 = s3 + s7;
796 0 : x4 = fdct_round_shift(s0 - s4);
797 0 : x5 = fdct_round_shift(s1 - s5);
798 0 : x6 = fdct_round_shift(s2 - s6);
799 0 : x7 = fdct_round_shift(s3 - s7);
800 :
801 : // stage 2
802 0 : s0 = x0;
803 0 : s1 = x1;
804 0 : s2 = x2;
805 0 : s3 = x3;
806 0 : s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
807 0 : s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
808 0 : s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
809 0 : s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
810 :
811 0 : x0 = fdct_round_shift(s0 + s2);
812 0 : x1 = fdct_round_shift(s1 + s3);
813 0 : x2 = fdct_round_shift(s0 - s2);
814 0 : x3 = fdct_round_shift(s1 - s3);
815 0 : x4 = fdct_round_shift(s4 + s6);
816 0 : x5 = fdct_round_shift(s5 + s7);
817 0 : x6 = fdct_round_shift(s4 - s6);
818 0 : x7 = fdct_round_shift(s5 - s7);
819 :
820 : // stage 3
821 0 : s2 = cospi_16_64 * (x2 + x3);
822 0 : s3 = cospi_16_64 * (x2 - x3);
823 0 : s6 = cospi_16_64 * (x6 + x7);
824 0 : s7 = cospi_16_64 * (x6 - x7);
825 :
826 0 : x2 = fdct_round_shift(s2);
827 0 : x3 = fdct_round_shift(s3);
828 0 : x6 = fdct_round_shift(s6);
829 0 : x7 = fdct_round_shift(s7);
830 :
831 0 : output[0] = (tran_low_t)x0;
832 0 : output[1] = (tran_low_t)-x4;
833 0 : output[2] = (tran_low_t)x6;
834 0 : output[3] = (tran_low_t)-x2;
835 0 : output[4] = (tran_low_t)x3;
836 0 : output[5] = (tran_low_t)-x7;
837 0 : output[6] = (tran_low_t)x5;
838 0 : output[7] = (tran_low_t)-x1;
839 0 : }
840 :
841 0 : static void fadst16(const tran_low_t *input, tran_low_t *output) {
842 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
843 : tran_high_t s9, s10, s11, s12, s13, s14, s15;
844 :
845 0 : tran_high_t x0 = input[15];
846 0 : tran_high_t x1 = input[0];
847 0 : tran_high_t x2 = input[13];
848 0 : tran_high_t x3 = input[2];
849 0 : tran_high_t x4 = input[11];
850 0 : tran_high_t x5 = input[4];
851 0 : tran_high_t x6 = input[9];
852 0 : tran_high_t x7 = input[6];
853 0 : tran_high_t x8 = input[7];
854 0 : tran_high_t x9 = input[8];
855 0 : tran_high_t x10 = input[5];
856 0 : tran_high_t x11 = input[10];
857 0 : tran_high_t x12 = input[3];
858 0 : tran_high_t x13 = input[12];
859 0 : tran_high_t x14 = input[1];
860 0 : tran_high_t x15 = input[14];
861 :
862 : // stage 1
863 0 : s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
864 0 : s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
865 0 : s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
866 0 : s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
867 0 : s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
868 0 : s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
869 0 : s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
870 0 : s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
871 0 : s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
872 0 : s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
873 0 : s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
874 0 : s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
875 0 : s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
876 0 : s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
877 0 : s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
878 0 : s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
879 :
880 0 : x0 = s0 + s8;
881 0 : x1 = s1 + s9;
882 0 : x2 = s2 + s10;
883 0 : x3 = s3 + s11;
884 0 : x4 = s4 + s12;
885 0 : x5 = s5 + s13;
886 0 : x6 = s6 + s14;
887 0 : x7 = s7 + s15;
888 :
889 0 : x8 = fdct_round_shift(s0 - s8);
890 0 : x9 = fdct_round_shift(s1 - s9);
891 0 : x10 = fdct_round_shift(s2 - s10);
892 0 : x11 = fdct_round_shift(s3 - s11);
893 0 : x12 = fdct_round_shift(s4 - s12);
894 0 : x13 = fdct_round_shift(s5 - s13);
895 0 : x14 = fdct_round_shift(s6 - s14);
896 0 : x15 = fdct_round_shift(s7 - s15);
897 :
898 : // stage 2
899 0 : s0 = x0;
900 0 : s1 = x1;
901 0 : s2 = x2;
902 0 : s3 = x3;
903 0 : s4 = x4;
904 0 : s5 = x5;
905 0 : s6 = x6;
906 0 : s7 = x7;
907 0 : s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
908 0 : s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
909 0 : s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
910 0 : s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
911 0 : s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
912 0 : s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
913 0 : s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
914 0 : s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
915 :
916 0 : x0 = s0 + s4;
917 0 : x1 = s1 + s5;
918 0 : x2 = s2 + s6;
919 0 : x3 = s3 + s7;
920 0 : x4 = fdct_round_shift(s0 - s4);
921 0 : x5 = fdct_round_shift(s1 - s5);
922 0 : x6 = fdct_round_shift(s2 - s6);
923 0 : x7 = fdct_round_shift(s3 - s7);
924 :
925 0 : x8 = s8 + s12;
926 0 : x9 = s9 + s13;
927 0 : x10 = s10 + s14;
928 0 : x11 = s11 + s15;
929 0 : x12 = fdct_round_shift(s8 - s12);
930 0 : x13 = fdct_round_shift(s9 - s13);
931 0 : x14 = fdct_round_shift(s10 - s14);
932 0 : x15 = fdct_round_shift(s11 - s15);
933 :
934 : // stage 3
935 0 : s0 = x0;
936 0 : s1 = x1;
937 0 : s2 = x2;
938 0 : s3 = x3;
939 0 : s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
940 0 : s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
941 0 : s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
942 0 : s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
943 0 : s8 = x8;
944 0 : s9 = x9;
945 0 : s10 = x10;
946 0 : s11 = x11;
947 0 : s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
948 0 : s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
949 0 : s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
950 0 : s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
951 :
952 0 : x0 = fdct_round_shift(s0 + s2);
953 0 : x1 = fdct_round_shift(s1 + s3);
954 0 : x2 = fdct_round_shift(s0 - s2);
955 0 : x3 = fdct_round_shift(s1 - s3);
956 :
957 0 : x4 = fdct_round_shift(s4 + s6);
958 0 : x5 = fdct_round_shift(s5 + s7);
959 0 : x6 = fdct_round_shift(s4 - s6);
960 0 : x7 = fdct_round_shift(s5 - s7);
961 :
962 0 : x8 = fdct_round_shift(s8 + s10);
963 0 : x9 = fdct_round_shift(s9 + s11);
964 0 : x10 = fdct_round_shift(s8 - s10);
965 0 : x11 = fdct_round_shift(s9 - s11);
966 :
967 0 : x12 = fdct_round_shift(s12 + s14);
968 0 : x13 = fdct_round_shift(s13 + s15);
969 0 : x14 = fdct_round_shift(s12 - s14);
970 0 : x15 = fdct_round_shift(s13 - s15);
971 :
972 : // stage 4
973 0 : s2 = (-cospi_16_64) * (x2 + x3);
974 0 : s3 = cospi_16_64 * (x2 - x3);
975 0 : s6 = cospi_16_64 * (x6 + x7);
976 0 : s7 = cospi_16_64 * (-x6 + x7);
977 0 : s10 = cospi_16_64 * (x10 + x11);
978 0 : s11 = cospi_16_64 * (-x10 + x11);
979 0 : s14 = (-cospi_16_64) * (x14 + x15);
980 0 : s15 = cospi_16_64 * (x14 - x15);
981 :
982 0 : x2 = fdct_round_shift(s2);
983 0 : x3 = fdct_round_shift(s3);
984 0 : x6 = fdct_round_shift(s6);
985 0 : x7 = fdct_round_shift(s7);
986 0 : x10 = fdct_round_shift(s10);
987 0 : x11 = fdct_round_shift(s11);
988 0 : x14 = fdct_round_shift(s14);
989 0 : x15 = fdct_round_shift(s15);
990 :
991 0 : output[0] = (tran_low_t)x0;
992 0 : output[1] = (tran_low_t)-x8;
993 0 : output[2] = (tran_low_t)x12;
994 0 : output[3] = (tran_low_t)-x4;
995 0 : output[4] = (tran_low_t)x6;
996 0 : output[5] = (tran_low_t)x14;
997 0 : output[6] = (tran_low_t)x10;
998 0 : output[7] = (tran_low_t)x2;
999 0 : output[8] = (tran_low_t)x3;
1000 0 : output[9] = (tran_low_t)x11;
1001 0 : output[10] = (tran_low_t)x15;
1002 0 : output[11] = (tran_low_t)x7;
1003 0 : output[12] = (tran_low_t)x5;
1004 0 : output[13] = (tran_low_t)-x13;
1005 0 : output[14] = (tran_low_t)x9;
1006 0 : output[15] = (tran_low_t)-x1;
1007 0 : }
1008 :
1009 : // For use in lieu of ADST
1010 0 : static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
1011 : int i;
1012 : tran_low_t inputhalf[16];
1013 0 : for (i = 0; i < 16; ++i) {
1014 0 : output[16 + i] = input[i] * 4;
1015 : }
1016 : // Multiply input by sqrt(2)
1017 0 : for (i = 0; i < 16; ++i) {
1018 0 : inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
1019 : }
1020 0 : fdct16(inputhalf, output);
1021 : // Note overall scaling factor is 4 times orthogonal
1022 0 : }
1023 :
1024 : #if CONFIG_EXT_TX
1025 : // TODO(sarahparker) these functions will be removed once the highbitdepth
1026 : // codepath works properly for rectangular transforms. They have almost
1027 : // identical versions in av1_fwd_txfm1d.c, but those are currently only
1028 : // being used for square transforms.
1029 0 : static void fidtx4(const tran_low_t *input, tran_low_t *output) {
1030 : int i;
1031 0 : for (i = 0; i < 4; ++i)
1032 0 : output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
1033 0 : }
1034 :
1035 0 : static void fidtx8(const tran_low_t *input, tran_low_t *output) {
1036 : int i;
1037 0 : for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
1038 0 : }
1039 :
1040 0 : static void fidtx16(const tran_low_t *input, tran_low_t *output) {
1041 : int i;
1042 0 : for (i = 0; i < 16; ++i)
1043 0 : output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
1044 0 : }
1045 :
1046 0 : static void fidtx32(const tran_low_t *input, tran_low_t *output) {
1047 : int i;
1048 0 : for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
1049 0 : }
1050 :
// Copies an l-row by w-column block of int16_t samples from src to dest.
// src_stride / dest_stride are the distances, in elements, between the starts
// of consecutive rows.  Rows are copied top to bottom with memcpy().
static void copy_block(const int16_t *src, int src_stride, int l, int w,
                       int16_t *dest, int dest_stride) {
  int row = 0;
  while (row < l) {
    const int16_t *from = src + src_stride * row;
    int16_t *to = dest + dest_stride * row;
    memcpy(to, from, w * sizeof(int16_t));
    ++row;
  }
}
1058 :
// Reverses, in place, the first w elements of each of the l rows of dest
// (a left/right mirror).  stride is the row pitch in elements.
static void fliplr(int16_t *dest, int stride, int l, int w) {
  int row;
  for (row = 0; row < l; ++row) {
    // Walk inwards from both ends of the row until the indices meet.
    int left = 0;
    int right = w - 1;
    while (left < right) {
      const int16_t held = dest[row * stride + left];
      dest[row * stride + left] = dest[row * stride + right];
      dest[row * stride + right] = held;
      ++left;
      --right;
    }
  }
}
1069 :
// Reverses, in place, the order of the l rows of dest over the first w
// columns (an up/down mirror).  stride is the row pitch in elements.
static void flipud(int16_t *dest, int stride, int l, int w) {
  int col;
  for (col = 0; col < w; ++col) {
    // Walk inwards from the top and bottom rows of this column.
    int top = 0;
    int bottom = l - 1;
    while (top < bottom) {
      const int16_t held = dest[top * stride + col];
      dest[top * stride + col] = dest[bottom * stride + col];
      dest[bottom * stride + col] = held;
      ++top;
      --bottom;
    }
  }
}
1080 :
// Flips an l-row by w-column block of dest in place both left/right and
// up/down (a 180-degree rotation).  stride is the row pitch in elements.
//
// Element (i, j) is exchanged with element (l - 1 - i, w - 1 - j) for every
// row i in the upper half.  When l is odd the middle row is its own mirror
// row, so it has to be reversed separately; without that step the middle row
// would be left untouched and the block would not be fully flipped.
static void fliplrud(int16_t *dest, int stride, int l, int w) {
  int i, j;
  for (i = 0; i < l / 2; ++i) {
    for (j = 0; j < w; ++j) {
      const int16_t tmp = dest[i * stride + j];
      dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
      dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
    }
  }

  // Odd row count: reverse the unpaired middle row left/right.
  if (l > 0 && (l % 2) != 0) {
    const int mid = l / 2;
    for (j = 0; j < w / 2; ++j) {
      const int16_t tmp = dest[mid * stride + j];
      dest[mid * stride + j] = dest[mid * stride + w - 1 - j];
      dest[mid * stride + w - 1 - j] = tmp;
    }
  }
}
1091 :
// Copies the l x w block at src into dest with copy_block(), then mirrors the
// copy left/right in place with fliplr().  src itself is not modified.
static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
                        int16_t *dest, int dest_stride) {
  // Step 1: plain copy into the destination buffer.
  copy_block(src, src_stride, l, w, dest, dest_stride);

  // Step 2: flip the copy, addressed with the destination's own stride.
  fliplr(dest, dest_stride, l, w);
}
1097 :
// Copies the l x w block at src into dest with copy_block(), then mirrors the
// copy up/down in place with flipud().  src itself is not modified.
static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
                        int16_t *dest, int dest_stride) {
  // Step 1: plain copy into the destination buffer.
  copy_block(src, src_stride, l, w, dest, dest_stride);

  // Step 2: flip the copy, addressed with the destination's own stride.
  flipud(dest, dest_stride, l, w);
}
1103 :
// Copies the l x w block at src into dest with copy_block(), then applies
// fliplrud() to the copy in place.  src itself is not modified.
static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
                          int16_t *dest, int dest_stride) {
  // Step 1: plain copy into the destination buffer.
  copy_block(src, src_stride, l, w, dest, dest_stride);

  // Step 2: flip the copy, addressed with the destination's own stride.
  fliplrud(dest, dest_stride, l, w);
}
1109 :
1110 0 : static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
1111 : int16_t *buff, int tx_type) {
1112 0 : switch (tx_type) {
1113 : case DCT_DCT:
1114 : case ADST_DCT:
1115 : case DCT_ADST:
1116 : case ADST_ADST:
1117 : case IDTX:
1118 : case V_DCT:
1119 : case H_DCT:
1120 : case V_ADST:
1121 0 : case H_ADST: break;
1122 : case FLIPADST_DCT:
1123 : case FLIPADST_ADST:
1124 : case V_FLIPADST:
1125 0 : copy_flipud(*src, *src_stride, l, w, buff, w);
1126 0 : *src = buff;
1127 0 : *src_stride = w;
1128 0 : break;
1129 : case DCT_FLIPADST:
1130 : case ADST_FLIPADST:
1131 : case H_FLIPADST:
1132 0 : copy_fliplr(*src, *src_stride, l, w, buff, w);
1133 0 : *src = buff;
1134 0 : *src_stride = w;
1135 0 : break;
1136 : case FLIPADST_FLIPADST:
1137 0 : copy_fliplrud(*src, *src_stride, l, w, buff, w);
1138 0 : *src = buff;
1139 0 : *src_stride = w;
1140 0 : break;
1141 0 : default: assert(0); break;
1142 : }
1143 0 : }
1144 : #endif // CONFIG_EXT_TX
1145 :
1146 0 : void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
1147 : int tx_type) {
1148 0 : if (tx_type == DCT_DCT) {
1149 0 : aom_fdct4x4_c(input, output, stride);
1150 : } else {
1151 : static const transform_2d FHT[] = {
1152 : { fdct4, fdct4 }, // DCT_DCT
1153 : { fadst4, fdct4 }, // ADST_DCT
1154 : { fdct4, fadst4 }, // DCT_ADST
1155 : { fadst4, fadst4 }, // ADST_ADST
1156 : #if CONFIG_EXT_TX
1157 : { fadst4, fdct4 }, // FLIPADST_DCT
1158 : { fdct4, fadst4 }, // DCT_FLIPADST
1159 : { fadst4, fadst4 }, // FLIPADST_FLIPADST
1160 : { fadst4, fadst4 }, // ADST_FLIPADST
1161 : { fadst4, fadst4 }, // FLIPADST_ADST
1162 : { fidtx4, fidtx4 }, // IDTX
1163 : { fdct4, fidtx4 }, // V_DCT
1164 : { fidtx4, fdct4 }, // H_DCT
1165 : { fadst4, fidtx4 }, // V_ADST
1166 : { fidtx4, fadst4 }, // H_ADST
1167 : { fadst4, fidtx4 }, // V_FLIPADST
1168 : { fidtx4, fadst4 }, // H_FLIPADST
1169 : #endif // CONFIG_EXT_TX
1170 : };
1171 0 : const transform_2d ht = FHT[tx_type];
1172 : tran_low_t out[4 * 4];
1173 : int i, j;
1174 : tran_low_t temp_in[4], temp_out[4];
1175 :
1176 : #if CONFIG_EXT_TX
1177 : int16_t flipped_input[4 * 4];
1178 0 : maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
1179 : #endif
1180 :
1181 : // Columns
1182 0 : for (i = 0; i < 4; ++i) {
1183 0 : for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
1184 0 : if (i == 0 && temp_in[0]) temp_in[0] += 1;
1185 0 : ht.cols(temp_in, temp_out);
1186 0 : for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
1187 : }
1188 :
1189 : // Rows
1190 0 : for (i = 0; i < 4; ++i) {
1191 0 : for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
1192 0 : ht.rows(temp_in, temp_out);
1193 0 : for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
1194 : }
1195 : }
1196 0 : }
1197 :
1198 0 : void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
1199 : int tx_type) {
1200 : static const transform_2d FHT[] = {
1201 : { fdct8, fdct4 }, // DCT_DCT
1202 : { fadst8, fdct4 }, // ADST_DCT
1203 : { fdct8, fadst4 }, // DCT_ADST
1204 : { fadst8, fadst4 }, // ADST_ADST
1205 : #if CONFIG_EXT_TX
1206 : { fadst8, fdct4 }, // FLIPADST_DCT
1207 : { fdct8, fadst4 }, // DCT_FLIPADST
1208 : { fadst8, fadst4 }, // FLIPADST_FLIPADST
1209 : { fadst8, fadst4 }, // ADST_FLIPADST
1210 : { fadst8, fadst4 }, // FLIPADST_ADST
1211 : { fidtx8, fidtx4 }, // IDTX
1212 : { fdct8, fidtx4 }, // V_DCT
1213 : { fidtx8, fdct4 }, // H_DCT
1214 : { fadst8, fidtx4 }, // V_ADST
1215 : { fidtx8, fadst4 }, // H_ADST
1216 : { fadst8, fidtx4 }, // V_FLIPADST
1217 : { fidtx8, fadst4 }, // H_FLIPADST
1218 : #endif
1219 : };
1220 0 : const transform_2d ht = FHT[tx_type];
1221 0 : const int n = 4;
1222 0 : const int n2 = 8;
1223 : tran_low_t out[8 * 4];
1224 : tran_low_t temp_in[8], temp_out[8];
1225 : int i, j;
1226 : #if CONFIG_EXT_TX
1227 : int16_t flipped_input[8 * 4];
1228 0 : maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
1229 : #endif
1230 :
1231 : // Rows
1232 0 : for (i = 0; i < n2; ++i) {
1233 0 : for (j = 0; j < n; ++j)
1234 0 : temp_in[j] =
1235 0 : (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
1236 0 : ht.rows(temp_in, temp_out);
1237 0 : for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
1238 : }
1239 :
1240 : // Columns
1241 0 : for (i = 0; i < n; ++i) {
1242 0 : for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
1243 0 : ht.cols(temp_in, temp_out);
1244 0 : for (j = 0; j < n2; ++j)
1245 0 : output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
1246 : }
1247 : // Note: overall scale factor of transform is 8 times unitary
1248 0 : }
1249 :
1250 0 : void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
1251 : int tx_type) {
1252 : static const transform_2d FHT[] = {
1253 : { fdct4, fdct8 }, // DCT_DCT
1254 : { fadst4, fdct8 }, // ADST_DCT
1255 : { fdct4, fadst8 }, // DCT_ADST
1256 : { fadst4, fadst8 }, // ADST_ADST
1257 : #if CONFIG_EXT_TX
1258 : { fadst4, fdct8 }, // FLIPADST_DCT
1259 : { fdct4, fadst8 }, // DCT_FLIPADST
1260 : { fadst4, fadst8 }, // FLIPADST_FLIPADST
1261 : { fadst4, fadst8 }, // ADST_FLIPADST
1262 : { fadst4, fadst8 }, // FLIPADST_ADST
1263 : { fidtx4, fidtx8 }, // IDTX
1264 : { fdct4, fidtx8 }, // V_DCT
1265 : { fidtx4, fdct8 }, // H_DCT
1266 : { fadst4, fidtx8 }, // V_ADST
1267 : { fidtx4, fadst8 }, // H_ADST
1268 : { fadst4, fidtx8 }, // V_FLIPADST
1269 : { fidtx4, fadst8 }, // H_FLIPADST
1270 : #endif
1271 : };
1272 0 : const transform_2d ht = FHT[tx_type];
1273 0 : const int n = 4;
1274 0 : const int n2 = 8;
1275 : tran_low_t out[8 * 4];
1276 : tran_low_t temp_in[8], temp_out[8];
1277 : int i, j;
1278 : #if CONFIG_EXT_TX
1279 : int16_t flipped_input[8 * 4];
1280 0 : maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
1281 : #endif
1282 :
1283 : // Columns
1284 0 : for (i = 0; i < n2; ++i) {
1285 0 : for (j = 0; j < n; ++j)
1286 0 : temp_in[j] =
1287 0 : (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
1288 0 : ht.cols(temp_in, temp_out);
1289 0 : for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
1290 : }
1291 :
1292 : // Rows
1293 0 : for (i = 0; i < n; ++i) {
1294 0 : for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
1295 0 : ht.rows(temp_in, temp_out);
1296 0 : for (j = 0; j < n2; ++j)
1297 0 : output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
1298 : }
1299 : // Note: overall scale factor of transform is 8 times unitary
1300 0 : }
1301 :
1302 0 : void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
1303 : int tx_type) {
1304 : static const transform_2d FHT[] = {
1305 : { fdct16, fdct4 }, // DCT_DCT
1306 : { fadst16, fdct4 }, // ADST_DCT
1307 : { fdct16, fadst4 }, // DCT_ADST
1308 : { fadst16, fadst4 }, // ADST_ADST
1309 : #if CONFIG_EXT_TX
1310 : { fadst16, fdct4 }, // FLIPADST_DCT
1311 : { fdct16, fadst4 }, // DCT_FLIPADST
1312 : { fadst16, fadst4 }, // FLIPADST_FLIPADST
1313 : { fadst16, fadst4 }, // ADST_FLIPADST
1314 : { fadst16, fadst4 }, // FLIPADST_ADST
1315 : { fidtx16, fidtx4 }, // IDTX
1316 : { fdct16, fidtx4 }, // V_DCT
1317 : { fidtx16, fdct4 }, // H_DCT
1318 : { fadst16, fidtx4 }, // V_ADST
1319 : { fidtx16, fadst4 }, // H_ADST
1320 : { fadst16, fidtx4 }, // V_FLIPADST
1321 : { fidtx16, fadst4 }, // H_FLIPADST
1322 : #endif
1323 : };
1324 0 : const transform_2d ht = FHT[tx_type];
1325 0 : const int n = 4;
1326 0 : const int n4 = 16;
1327 : tran_low_t out[16 * 4];
1328 : tran_low_t temp_in[16], temp_out[16];
1329 : int i, j;
1330 : #if CONFIG_EXT_TX
1331 : int16_t flipped_input[16 * 4];
1332 0 : maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
1333 : #endif
1334 :
1335 : // Rows
1336 0 : for (i = 0; i < n4; ++i) {
1337 0 : for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
1338 0 : ht.rows(temp_in, temp_out);
1339 0 : for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
1340 : }
1341 :
1342 : // Columns
1343 0 : for (i = 0; i < n; ++i) {
1344 0 : for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
1345 0 : ht.cols(temp_in, temp_out);
1346 0 : for (j = 0; j < n4; ++j)
1347 0 : output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
1348 : }
1349 : // Note: overall scale factor of transform is 8 times unitary
1350 0 : }
1351 :
1352 0 : void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
1353 : int tx_type) {
1354 : static const transform_2d FHT[] = {
1355 : { fdct4, fdct16 }, // DCT_DCT
1356 : { fadst4, fdct16 }, // ADST_DCT
1357 : { fdct4, fadst16 }, // DCT_ADST
1358 : { fadst4, fadst16 }, // ADST_ADST
1359 : #if CONFIG_EXT_TX
1360 : { fadst4, fdct16 }, // FLIPADST_DCT
1361 : { fdct4, fadst16 }, // DCT_FLIPADST
1362 : { fadst4, fadst16 }, // FLIPADST_FLIPADST
1363 : { fadst4, fadst16 }, // ADST_FLIPADST
1364 : { fadst4, fadst16 }, // FLIPADST_ADST
1365 : { fidtx4, fidtx16 }, // IDTX
1366 : { fdct4, fidtx16 }, // V_DCT
1367 : { fidtx4, fdct16 }, // H_DCT
1368 : { fadst4, fidtx16 }, // V_ADST
1369 : { fidtx4, fadst16 }, // H_ADST
1370 : { fadst4, fidtx16 }, // V_FLIPADST
1371 : { fidtx4, fadst16 }, // H_FLIPADST
1372 : #endif
1373 : };
1374 0 : const transform_2d ht = FHT[tx_type];
1375 0 : const int n = 4;
1376 0 : const int n4 = 16;
1377 : tran_low_t out[16 * 4];
1378 : tran_low_t temp_in[16], temp_out[16];
1379 : int i, j;
1380 : #if CONFIG_EXT_TX
1381 : int16_t flipped_input[16 * 4];
1382 0 : maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
1383 : #endif
1384 :
1385 : // Columns
1386 0 : for (i = 0; i < n4; ++i) {
1387 0 : for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
1388 0 : ht.cols(temp_in, temp_out);
1389 0 : for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
1390 : }
1391 :
1392 : // Rows
1393 0 : for (i = 0; i < n; ++i) {
1394 0 : for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
1395 0 : ht.rows(temp_in, temp_out);
1396 0 : for (j = 0; j < n4; ++j)
1397 0 : output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
1398 : }
1399 : // Note: overall scale factor of transform is 8 times unitary
1400 0 : }
1401 :
1402 0 : void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
1403 : int tx_type) {
1404 : static const transform_2d FHT[] = {
1405 : { fdct16, fdct8 }, // DCT_DCT
1406 : { fadst16, fdct8 }, // ADST_DCT
1407 : { fdct16, fadst8 }, // DCT_ADST
1408 : { fadst16, fadst8 }, // ADST_ADST
1409 : #if CONFIG_EXT_TX
1410 : { fadst16, fdct8 }, // FLIPADST_DCT
1411 : { fdct16, fadst8 }, // DCT_FLIPADST
1412 : { fadst16, fadst8 }, // FLIPADST_FLIPADST
1413 : { fadst16, fadst8 }, // ADST_FLIPADST
1414 : { fadst16, fadst8 }, // FLIPADST_ADST
1415 : { fidtx16, fidtx8 }, // IDTX
1416 : { fdct16, fidtx8 }, // V_DCT
1417 : { fidtx16, fdct8 }, // H_DCT
1418 : { fadst16, fidtx8 }, // V_ADST
1419 : { fidtx16, fadst8 }, // H_ADST
1420 : { fadst16, fidtx8 }, // V_FLIPADST
1421 : { fidtx16, fadst8 }, // H_FLIPADST
1422 : #endif
1423 : };
1424 0 : const transform_2d ht = FHT[tx_type];
1425 0 : const int n = 8;
1426 0 : const int n2 = 16;
1427 : tran_low_t out[16 * 8];
1428 : tran_low_t temp_in[16], temp_out[16];
1429 : int i, j;
1430 : #if CONFIG_EXT_TX
1431 : int16_t flipped_input[16 * 8];
1432 0 : maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
1433 : #endif
1434 :
1435 : // Rows
1436 0 : for (i = 0; i < n2; ++i) {
1437 0 : for (j = 0; j < n; ++j)
1438 0 : temp_in[j] =
1439 0 : (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
1440 0 : ht.rows(temp_in, temp_out);
1441 0 : for (j = 0; j < n; ++j)
1442 0 : out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
1443 : }
1444 :
1445 : // Columns
1446 0 : for (i = 0; i < n; ++i) {
1447 0 : for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
1448 0 : ht.cols(temp_in, temp_out);
1449 0 : for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
1450 : }
1451 : // Note: overall scale factor of transform is 8 times unitary
1452 0 : }
1453 :
1454 0 : void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
1455 : int tx_type) {
1456 : static const transform_2d FHT[] = {
1457 : { fdct8, fdct16 }, // DCT_DCT
1458 : { fadst8, fdct16 }, // ADST_DCT
1459 : { fdct8, fadst16 }, // DCT_ADST
1460 : { fadst8, fadst16 }, // ADST_ADST
1461 : #if CONFIG_EXT_TX
1462 : { fadst8, fdct16 }, // FLIPADST_DCT
1463 : { fdct8, fadst16 }, // DCT_FLIPADST
1464 : { fadst8, fadst16 }, // FLIPADST_FLIPADST
1465 : { fadst8, fadst16 }, // ADST_FLIPADST
1466 : { fadst8, fadst16 }, // FLIPADST_ADST
1467 : { fidtx8, fidtx16 }, // IDTX
1468 : { fdct8, fidtx16 }, // V_DCT
1469 : { fidtx8, fdct16 }, // H_DCT
1470 : { fadst8, fidtx16 }, // V_ADST
1471 : { fidtx8, fadst16 }, // H_ADST
1472 : { fadst8, fidtx16 }, // V_FLIPADST
1473 : { fidtx8, fadst16 }, // H_FLIPADST
1474 : #endif
1475 : };
1476 0 : const transform_2d ht = FHT[tx_type];
1477 0 : const int n = 8;
1478 0 : const int n2 = 16;
1479 : tran_low_t out[16 * 8];
1480 : tran_low_t temp_in[16], temp_out[16];
1481 : int i, j;
1482 : #if CONFIG_EXT_TX
1483 : int16_t flipped_input[16 * 8];
1484 0 : maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
1485 : #endif
1486 :
1487 : // Columns
1488 0 : for (i = 0; i < n2; ++i) {
1489 0 : for (j = 0; j < n; ++j)
1490 0 : temp_in[j] =
1491 0 : (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
1492 0 : ht.cols(temp_in, temp_out);
1493 0 : for (j = 0; j < n; ++j)
1494 0 : out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
1495 : }
1496 :
1497 : // Rows
1498 0 : for (i = 0; i < n; ++i) {
1499 0 : for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
1500 0 : ht.rows(temp_in, temp_out);
1501 0 : for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
1502 : }
1503 : // Note: overall scale factor of transform is 8 times unitary
1504 0 : }
1505 :
1506 0 : void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
1507 : int tx_type) {
1508 : static const transform_2d FHT[] = {
1509 : { fdct32, fdct8 }, // DCT_DCT
1510 : { fhalfright32, fdct8 }, // ADST_DCT
1511 : { fdct32, fadst8 }, // DCT_ADST
1512 : { fhalfright32, fadst8 }, // ADST_ADST
1513 : #if CONFIG_EXT_TX
1514 : { fhalfright32, fdct8 }, // FLIPADST_DCT
1515 : { fdct32, fadst8 }, // DCT_FLIPADST
1516 : { fhalfright32, fadst8 }, // FLIPADST_FLIPADST
1517 : { fhalfright32, fadst8 }, // ADST_FLIPADST
1518 : { fhalfright32, fadst8 }, // FLIPADST_ADST
1519 : { fidtx32, fidtx8 }, // IDTX
1520 : { fdct32, fidtx8 }, // V_DCT
1521 : { fidtx32, fdct8 }, // H_DCT
1522 : { fhalfright32, fidtx8 }, // V_ADST
1523 : { fidtx32, fadst8 }, // H_ADST
1524 : { fhalfright32, fidtx8 }, // V_FLIPADST
1525 : { fidtx32, fadst8 }, // H_FLIPADST
1526 : #endif
1527 : };
1528 0 : const transform_2d ht = FHT[tx_type];
1529 0 : const int n = 8;
1530 0 : const int n4 = 32;
1531 : tran_low_t out[32 * 8];
1532 : tran_low_t temp_in[32], temp_out[32];
1533 : int i, j;
1534 : #if CONFIG_EXT_TX
1535 : int16_t flipped_input[32 * 8];
1536 0 : maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
1537 : #endif
1538 :
1539 : // Rows
1540 0 : for (i = 0; i < n4; ++i) {
1541 0 : for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
1542 0 : ht.rows(temp_in, temp_out);
1543 0 : for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
1544 : }
1545 :
1546 : // Columns
1547 0 : for (i = 0; i < n; ++i) {
1548 0 : for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
1549 0 : ht.cols(temp_in, temp_out);
1550 0 : for (j = 0; j < n4; ++j)
1551 0 : output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
1552 : }
1553 : // Note: overall scale factor of transform is 4 times unitary
1554 0 : }
1555 :
1556 0 : void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
1557 : int tx_type) {
1558 : static const transform_2d FHT[] = {
1559 : { fdct8, fdct32 }, // DCT_DCT
1560 : { fadst8, fdct32 }, // ADST_DCT
1561 : { fdct8, fhalfright32 }, // DCT_ADST
1562 : { fadst8, fhalfright32 }, // ADST_ADST
1563 : #if CONFIG_EXT_TX
1564 : { fadst8, fdct32 }, // FLIPADST_DCT
1565 : { fdct8, fhalfright32 }, // DCT_FLIPADST
1566 : { fadst8, fhalfright32 }, // FLIPADST_FLIPADST
1567 : { fadst8, fhalfright32 }, // ADST_FLIPADST
1568 : { fadst8, fhalfright32 }, // FLIPADST_ADST
1569 : { fidtx8, fidtx32 }, // IDTX
1570 : { fdct8, fidtx32 }, // V_DCT
1571 : { fidtx8, fdct32 }, // H_DCT
1572 : { fadst8, fidtx32 }, // V_ADST
1573 : { fidtx8, fhalfright32 }, // H_ADST
1574 : { fadst8, fidtx32 }, // V_FLIPADST
1575 : { fidtx8, fhalfright32 }, // H_FLIPADST
1576 : #endif
1577 : };
1578 0 : const transform_2d ht = FHT[tx_type];
1579 0 : const int n = 8;
1580 0 : const int n4 = 32;
1581 : tran_low_t out[32 * 8];
1582 : tran_low_t temp_in[32], temp_out[32];
1583 : int i, j;
1584 : #if CONFIG_EXT_TX
1585 : int16_t flipped_input[32 * 8];
1586 0 : maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
1587 : #endif
1588 :
1589 : // Columns
1590 0 : for (i = 0; i < n4; ++i) {
1591 0 : for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
1592 0 : ht.cols(temp_in, temp_out);
1593 0 : for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
1594 : }
1595 :
1596 : // Rows
1597 0 : for (i = 0; i < n; ++i) {
1598 0 : for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
1599 0 : ht.rows(temp_in, temp_out);
1600 0 : for (j = 0; j < n4; ++j)
1601 0 : output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
1602 : }
1603 : // Note: overall scale factor of transform is 4 times unitary
1604 0 : }
1605 :
1606 0 : void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
1607 : int tx_type) {
1608 : static const transform_2d FHT[] = {
1609 : { fdct32, fdct16 }, // DCT_DCT
1610 : { fhalfright32, fdct16 }, // ADST_DCT
1611 : { fdct32, fadst16 }, // DCT_ADST
1612 : { fhalfright32, fadst16 }, // ADST_ADST
1613 : #if CONFIG_EXT_TX
1614 : { fhalfright32, fdct16 }, // FLIPADST_DCT
1615 : { fdct32, fadst16 }, // DCT_FLIPADST
1616 : { fhalfright32, fadst16 }, // FLIPADST_FLIPADST
1617 : { fhalfright32, fadst16 }, // ADST_FLIPADST
1618 : { fhalfright32, fadst16 }, // FLIPADST_ADST
1619 : { fidtx32, fidtx16 }, // IDTX
1620 : { fdct32, fidtx16 }, // V_DCT
1621 : { fidtx32, fdct16 }, // H_DCT
1622 : { fhalfright32, fidtx16 }, // V_ADST
1623 : { fidtx32, fadst16 }, // H_ADST
1624 : { fhalfright32, fidtx16 }, // V_FLIPADST
1625 : { fidtx32, fadst16 }, // H_FLIPADST
1626 : #endif
1627 : };
1628 0 : const transform_2d ht = FHT[tx_type];
1629 0 : const int n = 16;
1630 0 : const int n2 = 32;
1631 : tran_low_t out[32 * 16];
1632 : tran_low_t temp_in[32], temp_out[32];
1633 : int i, j;
1634 : #if CONFIG_EXT_TX
1635 : int16_t flipped_input[32 * 16];
1636 0 : maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
1637 : #endif
1638 :
1639 : // Rows
1640 0 : for (i = 0; i < n2; ++i) {
1641 0 : for (j = 0; j < n; ++j)
1642 0 : temp_in[j] =
1643 0 : (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
1644 0 : ht.rows(temp_in, temp_out);
1645 0 : for (j = 0; j < n; ++j)
1646 0 : out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
1647 : }
1648 :
1649 : // Columns
1650 0 : for (i = 0; i < n; ++i) {
1651 0 : for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
1652 0 : ht.cols(temp_in, temp_out);
1653 0 : for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
1654 : }
1655 : // Note: overall scale factor of transform is 4 times unitary
1656 0 : }
1657 :
1658 0 : void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
1659 : int tx_type) {
1660 : static const transform_2d FHT[] = {
1661 : { fdct16, fdct32 }, // DCT_DCT
1662 : { fadst16, fdct32 }, // ADST_DCT
1663 : { fdct16, fhalfright32 }, // DCT_ADST
1664 : { fadst16, fhalfright32 }, // ADST_ADST
1665 : #if CONFIG_EXT_TX
1666 : { fadst16, fdct32 }, // FLIPADST_DCT
1667 : { fdct16, fhalfright32 }, // DCT_FLIPADST
1668 : { fadst16, fhalfright32 }, // FLIPADST_FLIPADST
1669 : { fadst16, fhalfright32 }, // ADST_FLIPADST
1670 : { fadst16, fhalfright32 }, // FLIPADST_ADST
1671 : { fidtx16, fidtx32 }, // IDTX
1672 : { fdct16, fidtx32 }, // V_DCT
1673 : { fidtx16, fdct32 }, // H_DCT
1674 : { fadst16, fidtx32 }, // V_ADST
1675 : { fidtx16, fhalfright32 }, // H_ADST
1676 : { fadst16, fidtx32 }, // V_FLIPADST
1677 : { fidtx16, fhalfright32 }, // H_FLIPADST
1678 : #endif
1679 : };
1680 0 : const transform_2d ht = FHT[tx_type];
1681 0 : const int n = 16;
1682 0 : const int n2 = 32;
1683 : tran_low_t out[32 * 16];
1684 : tran_low_t temp_in[32], temp_out[32];
1685 : int i, j;
1686 : #if CONFIG_EXT_TX
1687 : int16_t flipped_input[32 * 16];
1688 0 : maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
1689 : #endif
1690 :
1691 : // Columns
1692 0 : for (i = 0; i < n2; ++i) {
1693 0 : for (j = 0; j < n; ++j)
1694 0 : temp_in[j] =
1695 0 : (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
1696 0 : ht.cols(temp_in, temp_out);
1697 0 : for (j = 0; j < n; ++j)
1698 0 : out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
1699 : }
1700 :
1701 : // Rows
1702 0 : for (i = 0; i < n; ++i) {
1703 0 : for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
1704 0 : ht.rows(temp_in, temp_out);
1705 0 : for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
1706 : }
1707 : // Note: overall scale factor of transform is 4 times unitary
1708 0 : }
1709 :
1710 0 : void av1_fdct8x8_quant_c(const int16_t *input, int stride,
1711 : tran_low_t *coeff_ptr, intptr_t n_coeffs,
1712 : int skip_block, const int16_t *zbin_ptr,
1713 : const int16_t *round_ptr, const int16_t *quant_ptr,
1714 : const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
1715 : tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
1716 : uint16_t *eob_ptr, const int16_t *scan,
1717 : const int16_t *iscan
1718 : #if CONFIG_AOM_QM
1719 : ,
1720 : const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
1721 : #endif
1722 : ) {
1723 0 : int eob = -1;
1724 :
1725 : int i, j;
1726 : tran_low_t intermediate[64];
1727 :
1728 : // Transform columns
1729 : {
1730 0 : tran_low_t *output = intermediate;
1731 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
1732 : tran_high_t t0, t1, t2, t3; // needs32
1733 : tran_high_t x0, x1, x2, x3; // canbe16
1734 :
1735 0 : for (i = 0; i < 8; i++) {
1736 : // stage 1
1737 0 : s0 = (input[0 * stride] + input[7 * stride]) * 4;
1738 0 : s1 = (input[1 * stride] + input[6 * stride]) * 4;
1739 0 : s2 = (input[2 * stride] + input[5 * stride]) * 4;
1740 0 : s3 = (input[3 * stride] + input[4 * stride]) * 4;
1741 0 : s4 = (input[3 * stride] - input[4 * stride]) * 4;
1742 0 : s5 = (input[2 * stride] - input[5 * stride]) * 4;
1743 0 : s6 = (input[1 * stride] - input[6 * stride]) * 4;
1744 0 : s7 = (input[0 * stride] - input[7 * stride]) * 4;
1745 :
1746 : // fdct4(step, step);
1747 0 : x0 = s0 + s3;
1748 0 : x1 = s1 + s2;
1749 0 : x2 = s1 - s2;
1750 0 : x3 = s0 - s3;
1751 0 : t0 = (x0 + x1) * cospi_16_64;
1752 0 : t1 = (x0 - x1) * cospi_16_64;
1753 0 : t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
1754 0 : t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
1755 0 : output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
1756 0 : output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
1757 0 : output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
1758 0 : output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
1759 :
1760 : // stage 2
1761 0 : t0 = (s6 - s5) * cospi_16_64;
1762 0 : t1 = (s6 + s5) * cospi_16_64;
1763 0 : t2 = fdct_round_shift(t0);
1764 0 : t3 = fdct_round_shift(t1);
1765 :
1766 : // stage 3
1767 0 : x0 = s4 + t2;
1768 0 : x1 = s4 - t2;
1769 0 : x2 = s7 - t3;
1770 0 : x3 = s7 + t3;
1771 :
1772 : // stage 4
1773 0 : t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
1774 0 : t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
1775 0 : t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
1776 0 : t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
1777 0 : output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
1778 0 : output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
1779 0 : output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
1780 0 : output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
1781 0 : input++;
1782 0 : output++;
1783 : }
1784 : }
1785 :
1786 : // Rows
1787 0 : for (i = 0; i < 8; ++i) {
1788 0 : fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
1789 0 : for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
1790 : }
1791 :
1792 : // TODO(jingning) Decide the need of these arguments after the
1793 : // quantization process is completed.
1794 : (void)zbin_ptr;
1795 : (void)quant_shift_ptr;
1796 : (void)iscan;
1797 :
1798 0 : memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
1799 0 : memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
1800 :
1801 0 : if (!skip_block) {
1802 : // Quantization pass: All coefficients with index >= zero_flag are
1803 : // skippable. Note: zero_flag can be zero.
1804 0 : for (i = 0; i < n_coeffs; i++) {
1805 0 : const int rc = scan[i];
1806 0 : const int coeff = coeff_ptr[rc];
1807 : #if CONFIG_AOM_QM
1808 : const qm_val_t wt = qm_ptr[rc];
1809 : const qm_val_t iwt = iqm_ptr[rc];
1810 : const int dequant =
1811 : (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
1812 : AOM_QM_BITS;
1813 : #endif
1814 0 : const int coeff_sign = (coeff >> 31);
1815 0 : const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
1816 :
1817 0 : int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
1818 : int tmp32;
1819 : #if CONFIG_AOM_QM
1820 : tmp32 = (int)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
1821 : qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
1822 : dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
1823 : #else
1824 0 : tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> 16);
1825 0 : qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
1826 0 : dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
1827 : #endif
1828 :
1829 0 : if (tmp32) eob = i;
1830 : }
1831 : }
1832 0 : *eob_ptr = eob + 1;
1833 0 : }
1834 :
1835 0 : void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
1836 : int tx_type) {
1837 0 : if (tx_type == DCT_DCT) {
1838 0 : aom_fdct8x8_c(input, output, stride);
1839 : } else {
1840 : static const transform_2d FHT[] = {
1841 : { fdct8, fdct8 }, // DCT_DCT
1842 : { fadst8, fdct8 }, // ADST_DCT
1843 : { fdct8, fadst8 }, // DCT_ADST
1844 : { fadst8, fadst8 }, // ADST_ADST
1845 : #if CONFIG_EXT_TX
1846 : { fadst8, fdct8 }, // FLIPADST_DCT
1847 : { fdct8, fadst8 }, // DCT_FLIPADST
1848 : { fadst8, fadst8 }, // FLIPADST_FLIPADST
1849 : { fadst8, fadst8 }, // ADST_FLIPADST
1850 : { fadst8, fadst8 }, // FLIPADST_ADST
1851 : { fidtx8, fidtx8 }, // IDTX
1852 : { fdct8, fidtx8 }, // V_DCT
1853 : { fidtx8, fdct8 }, // H_DCT
1854 : { fadst8, fidtx8 }, // V_ADST
1855 : { fidtx8, fadst8 }, // H_ADST
1856 : { fadst8, fidtx8 }, // V_FLIPADST
1857 : { fidtx8, fadst8 }, // H_FLIPADST
1858 : #endif // CONFIG_EXT_TX
1859 : };
1860 0 : const transform_2d ht = FHT[tx_type];
1861 : tran_low_t out[64];
1862 : int i, j;
1863 : tran_low_t temp_in[8], temp_out[8];
1864 :
1865 : #if CONFIG_EXT_TX
1866 : int16_t flipped_input[8 * 8];
1867 0 : maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
1868 : #endif
1869 :
1870 : // Columns
1871 0 : for (i = 0; i < 8; ++i) {
1872 0 : for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
1873 0 : ht.cols(temp_in, temp_out);
1874 0 : for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
1875 : }
1876 :
1877 : // Rows
1878 0 : for (i = 0; i < 8; ++i) {
1879 0 : for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
1880 0 : ht.rows(temp_in, temp_out);
1881 0 : for (j = 0; j < 8; ++j)
1882 0 : output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
1883 : }
1884 : }
1885 0 : }
1886 :
1887 : /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
1888 : pixel. */
1889 0 : void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1890 : int i;
1891 : tran_high_t a1, b1, c1, d1, e1;
1892 0 : const int16_t *ip_pass0 = input;
1893 0 : const tran_low_t *ip = NULL;
1894 0 : tran_low_t *op = output;
1895 :
1896 0 : for (i = 0; i < 4; i++) {
1897 0 : a1 = ip_pass0[0 * stride];
1898 0 : b1 = ip_pass0[1 * stride];
1899 0 : c1 = ip_pass0[2 * stride];
1900 0 : d1 = ip_pass0[3 * stride];
1901 :
1902 0 : a1 += b1;
1903 0 : d1 = d1 - c1;
1904 0 : e1 = (a1 - d1) >> 1;
1905 0 : b1 = e1 - b1;
1906 0 : c1 = e1 - c1;
1907 0 : a1 -= c1;
1908 0 : d1 += b1;
1909 0 : op[0] = (tran_low_t)a1;
1910 0 : op[4] = (tran_low_t)c1;
1911 0 : op[8] = (tran_low_t)d1;
1912 0 : op[12] = (tran_low_t)b1;
1913 :
1914 0 : ip_pass0++;
1915 0 : op++;
1916 : }
1917 0 : ip = output;
1918 0 : op = output;
1919 :
1920 0 : for (i = 0; i < 4; i++) {
1921 0 : a1 = ip[0];
1922 0 : b1 = ip[1];
1923 0 : c1 = ip[2];
1924 0 : d1 = ip[3];
1925 :
1926 0 : a1 += b1;
1927 0 : d1 -= c1;
1928 0 : e1 = (a1 - d1) >> 1;
1929 0 : b1 = e1 - b1;
1930 0 : c1 = e1 - c1;
1931 0 : a1 -= c1;
1932 0 : d1 += b1;
1933 0 : op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
1934 0 : op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
1935 0 : op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
1936 0 : op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
1937 :
1938 0 : ip += 4;
1939 0 : op += 4;
1940 : }
1941 0 : }
1942 :
1943 0 : void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
1944 : int tx_type) {
1945 : static const transform_2d FHT[] = {
1946 : { fdct16, fdct16 }, // DCT_DCT
1947 : { fadst16, fdct16 }, // ADST_DCT
1948 : { fdct16, fadst16 }, // DCT_ADST
1949 : { fadst16, fadst16 }, // ADST_ADST
1950 : #if CONFIG_EXT_TX
1951 : { fadst16, fdct16 }, // FLIPADST_DCT
1952 : { fdct16, fadst16 }, // DCT_FLIPADST
1953 : { fadst16, fadst16 }, // FLIPADST_FLIPADST
1954 : { fadst16, fadst16 }, // ADST_FLIPADST
1955 : { fadst16, fadst16 }, // FLIPADST_ADST
1956 : { fidtx16, fidtx16 }, // IDTX
1957 : { fdct16, fidtx16 }, // V_DCT
1958 : { fidtx16, fdct16 }, // H_DCT
1959 : { fadst16, fidtx16 }, // V_ADST
1960 : { fidtx16, fadst16 }, // H_ADST
1961 : { fadst16, fidtx16 }, // V_FLIPADST
1962 : { fidtx16, fadst16 }, // H_FLIPADST
1963 : #endif // CONFIG_EXT_TX
1964 : };
1965 :
1966 0 : const transform_2d ht = FHT[tx_type];
1967 : tran_low_t out[256];
1968 : int i, j;
1969 : tran_low_t temp_in[16], temp_out[16];
1970 :
1971 : #if CONFIG_EXT_TX
1972 : int16_t flipped_input[16 * 16];
1973 0 : maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
1974 : #endif
1975 :
1976 : // Columns
1977 0 : for (i = 0; i < 16; ++i) {
1978 0 : for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
1979 0 : ht.cols(temp_in, temp_out);
1980 0 : for (j = 0; j < 16; ++j)
1981 0 : out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1982 : }
1983 :
1984 : // Rows
1985 0 : for (i = 0; i < 16; ++i) {
1986 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
1987 0 : ht.rows(temp_in, temp_out);
1988 0 : for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
1989 : }
1990 0 : }
1991 :
1992 : #if CONFIG_HIGHBITDEPTH
// High-bitdepth entry point for the 4x4 forward hybrid transform: passes all
// arguments through unchanged to av1_fht4x4_c.
1993 0 : void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
1994 : int tx_type) {
1995 0 : av1_fht4x4_c(input, output, stride, tx_type);
1996 0 : }
1997 :
// High-bitdepth entry point for the 4x8 forward hybrid transform: passes all
// arguments through unchanged to av1_fht4x8_c.
1998 0 : void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
1999 : int tx_type) {
2000 0 : av1_fht4x8_c(input, output, stride, tx_type);
2001 0 : }
2002 :
// High-bitdepth entry point for the 8x4 forward hybrid transform: passes all
// arguments through unchanged to av1_fht8x4_c.
2003 0 : void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
2004 : int tx_type) {
2005 0 : av1_fht8x4_c(input, output, stride, tx_type);
2006 0 : }
2007 :
// High-bitdepth entry point for the 8x16 forward hybrid transform: passes all
// arguments through unchanged to av1_fht8x16_c.
2008 0 : void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
2009 : int tx_type) {
2010 0 : av1_fht8x16_c(input, output, stride, tx_type);
2011 0 : }
2012 :
// High-bitdepth entry point for the 16x8 forward hybrid transform: passes all
// arguments through unchanged to av1_fht16x8_c.
2013 0 : void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
2014 : int tx_type) {
2015 0 : av1_fht16x8_c(input, output, stride, tx_type);
2016 0 : }
2017 :
// High-bitdepth entry point for the 16x32 forward hybrid transform: passes
// all arguments through unchanged to av1_fht16x32_c.
2018 0 : void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
2019 : int tx_type) {
2020 0 : av1_fht16x32_c(input, output, stride, tx_type);
2021 0 : }
2022 :
// High-bitdepth entry point for the 32x16 forward hybrid transform: passes
// all arguments through unchanged to av1_fht32x16_c.
2023 0 : void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
2024 : int tx_type) {
2025 0 : av1_fht32x16_c(input, output, stride, tx_type);
2026 0 : }
2027 :
// High-bitdepth entry point for the 4x16 forward hybrid transform: passes all
// arguments through unchanged to av1_fht4x16_c.
2028 0 : void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
2029 : int tx_type) {
2030 0 : av1_fht4x16_c(input, output, stride, tx_type);
2031 0 : }
2032 :
// High-bitdepth entry point for the 16x4 forward hybrid transform: passes all
// arguments through unchanged to av1_fht16x4_c.
2033 0 : void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
2034 : int tx_type) {
2035 0 : av1_fht16x4_c(input, output, stride, tx_type);
2036 0 : }
2037 :
// High-bitdepth entry point for the 8x32 forward hybrid transform: passes all
// arguments through unchanged to av1_fht8x32_c.
2038 0 : void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
2039 : int tx_type) {
2040 0 : av1_fht8x32_c(input, output, stride, tx_type);
2041 0 : }
2042 :
// High-bitdepth entry point for the 32x8 forward hybrid transform: passes all
// arguments through unchanged to av1_fht32x8_c.
2043 0 : void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
2044 : int tx_type) {
2045 0 : av1_fht32x8_c(input, output, stride, tx_type);
2046 0 : }
2047 :
// High-bitdepth entry point for the 8x8 forward hybrid transform: passes all
// arguments through unchanged to av1_fht8x8_c.
2048 0 : void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
2049 : int tx_type) {
2050 0 : av1_fht8x8_c(input, output, stride, tx_type);
2051 0 : }
2052 :
// High-bitdepth entry point for the 4x4 forward Walsh-Hadamard transform:
// passes all arguments through unchanged to av1_fwht4x4_c.
2053 0 : void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
2054 : int stride) {
2055 0 : av1_fwht4x4_c(input, output, stride);
2056 0 : }
2057 :
// High-bitdepth entry point for the 16x16 forward hybrid transform: passes
// all arguments through unchanged to av1_fht16x16_c.
2058 0 : void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
2059 : int tx_type) {
2060 0 : av1_fht16x16_c(input, output, stride, tx_type);
2061 0 : }
2062 : #endif // CONFIG_HIGHBITDEPTH
2063 :
2064 0 : void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
2065 : int tx_type) {
2066 : static const transform_2d FHT[] = {
2067 : { fdct32, fdct32 }, // DCT_DCT
2068 : #if CONFIG_EXT_TX
2069 : { fhalfright32, fdct32 }, // ADST_DCT
2070 : { fdct32, fhalfright32 }, // DCT_ADST
2071 : { fhalfright32, fhalfright32 }, // ADST_ADST
2072 : { fhalfright32, fdct32 }, // FLIPADST_DCT
2073 : { fdct32, fhalfright32 }, // DCT_FLIPADST
2074 : { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST
2075 : { fhalfright32, fhalfright32 }, // ADST_FLIPADST
2076 : { fhalfright32, fhalfright32 }, // FLIPADST_ADST
2077 : { fidtx32, fidtx32 }, // IDTX
2078 : { fdct32, fidtx32 }, // V_DCT
2079 : { fidtx32, fdct32 }, // H_DCT
2080 : { fhalfright32, fidtx32 }, // V_ADST
2081 : { fidtx32, fhalfright32 }, // H_ADST
2082 : { fhalfright32, fidtx32 }, // V_FLIPADST
2083 : { fidtx32, fhalfright32 }, // H_FLIPADST
2084 : #endif
2085 : };
2086 0 : const transform_2d ht = FHT[tx_type];
2087 : tran_low_t out[1024];
2088 : int i, j;
2089 : tran_low_t temp_in[32], temp_out[32];
2090 :
2091 : #if CONFIG_EXT_TX
2092 : int16_t flipped_input[32 * 32];
2093 0 : maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
2094 : #endif
2095 :
2096 : // Columns
2097 0 : for (i = 0; i < 32; ++i) {
2098 0 : for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
2099 0 : ht.cols(temp_in, temp_out);
2100 0 : for (j = 0; j < 32; ++j)
2101 0 : out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
2102 : }
2103 :
2104 : // Rows
2105 0 : for (i = 0; i < 32; ++i) {
2106 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
2107 0 : ht.rows(temp_in, temp_out);
2108 0 : for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
2109 : }
2110 0 : }
2111 :
2112 : #if CONFIG_TX64X64
2113 : #if CONFIG_EXT_TX
2114 : static void fidtx64(const tran_low_t *input, tran_low_t *output) {
2115 : int i;
2116 : for (i = 0; i < 64; ++i)
2117 : output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
2118 : }
2119 :
2120 : // For use in lieu of ADST
2121 : static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
2122 : int i;
2123 : tran_low_t inputhalf[32];
2124 : for (i = 0; i < 32; ++i) {
2125 : output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
2126 : }
2127 : // Multiply input by sqrt(2)
2128 : for (i = 0; i < 32; ++i) {
2129 : inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
2130 : }
2131 : fdct32(inputhalf, output);
2132 : // Note overall scaling factor is 2 times unitary
2133 : }
2134 : #endif // CONFIG_EXT_TX
2135 :
2136 : static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
2137 : int32_t in[64], out[64];
2138 : int i;
2139 : for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
2140 : av1_fdct64_new(in, out, fwd_cos_bit_col_dct_64, fwd_stage_range_col_dct_64);
2141 : for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
2142 : }
2143 :
2144 : static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
2145 : int32_t in[64], out[64];
2146 : int i;
2147 : for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
2148 : av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64);
2149 : for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
2150 : }
2151 :
2152 : void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
2153 : int tx_type) {
2154 : static const transform_2d FHT[] = {
2155 : { fdct64_col, fdct64_row }, // DCT_DCT
2156 : #if CONFIG_EXT_TX
2157 : { fhalfright64, fdct64_row }, // ADST_DCT
2158 : { fdct64_col, fhalfright64 }, // DCT_ADST
2159 : { fhalfright64, fhalfright64 }, // ADST_ADST
2160 : { fhalfright64, fdct64_row }, // FLIPADST_DCT
2161 : { fdct64_col, fhalfright64 }, // DCT_FLIPADST
2162 : { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST
2163 : { fhalfright64, fhalfright64 }, // ADST_FLIPADST
2164 : { fhalfright64, fhalfright64 }, // FLIPADST_ADST
2165 : { fidtx64, fidtx64 }, // IDTX
2166 : { fdct64_col, fidtx64 }, // V_DCT
2167 : { fidtx64, fdct64_row }, // H_DCT
2168 : { fhalfright64, fidtx64 }, // V_ADST
2169 : { fidtx64, fhalfright64 }, // H_ADST
2170 : { fhalfright64, fidtx64 }, // V_FLIPADST
2171 : { fidtx64, fhalfright64 }, // H_FLIPADST
2172 : #endif
2173 : };
2174 : const transform_2d ht = FHT[tx_type];
2175 : tran_low_t out[4096];
2176 : int i, j;
2177 : tran_low_t temp_in[64], temp_out[64];
2178 : #if CONFIG_EXT_TX
2179 : int16_t flipped_input[64 * 64];
2180 : maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
2181 : #endif
2182 : // Columns
2183 : for (i = 0; i < 64; ++i) {
2184 : for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
2185 : ht.cols(temp_in, temp_out);
2186 : for (j = 0; j < 64; ++j)
2187 : out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
2188 : }
2189 :
2190 : // Rows
2191 : for (i = 0; i < 64; ++i) {
2192 : for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
2193 : ht.rows(temp_in, temp_out);
2194 : for (j = 0; j < 64; ++j)
2195 : output[j + i * 64] =
2196 : (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
2197 : }
2198 : }
2199 : #endif // CONFIG_TX64X64
2200 :
2201 : #if CONFIG_EXT_TX
2202 : // Forward identity transform.
2203 0 : void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
2204 : int bs, int tx_type) {
2205 : int r, c;
2206 0 : const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
2207 0 : if (tx_type == IDTX) {
2208 0 : for (r = 0; r < bs; ++r) {
2209 0 : for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
2210 0 : src_diff += stride;
2211 0 : coeff += bs;
2212 : }
2213 : }
2214 0 : }
2215 : #endif // CONFIG_EXT_TX
2216 :
2217 : #if CONFIG_HIGHBITDEPTH
// High-bitdepth entry point for the 32x32 forward hybrid transform: passes
// all arguments through unchanged to av1_fht32x32_c.
2218 0 : void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
2219 : int tx_type) {
2220 0 : av1_fht32x32_c(input, output, stride, tx_type);
2221 0 : }
2222 :
2223 : #if CONFIG_TX64X64
// High-bitdepth entry point for the 64x64 forward hybrid transform: passes
// all arguments through unchanged to av1_fht64x64_c.
2224 : void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
2225 : int tx_type) {
2226 : av1_fht64x64_c(input, output, stride, tx_type);
2227 : }
2228 : #endif // CONFIG_TX64X64
2229 : #endif // CONFIG_HIGHBITDEPTH
2230 :
2231 : #if CONFIG_DPCM_INTRA
2232 : void av1_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
2233 : tran_low_t *output) {
2234 : assert(tx_type < TX_TYPES_1D);
2235 : static const transform_1d FHT[] = { fdct4, fadst4, fadst4, fidtx4 };
2236 : const transform_1d ft = FHT[tx_type];
2237 : tran_low_t temp_in[4];
2238 : for (int i = 0; i < 4; ++i)
2239 : temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 4 * Sqrt2);
2240 : ft(temp_in, output);
2241 : }
2242 :
2243 : void av1_dpcm_ft8_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
2244 : tran_low_t *output) {
2245 : assert(tx_type < TX_TYPES_1D);
2246 : static const transform_1d FHT[] = { fdct8, fadst8, fadst8, fidtx8 };
2247 : const transform_1d ft = FHT[tx_type];
2248 : tran_low_t temp_in[8];
2249 : for (int i = 0; i < 8; ++i) temp_in[i] = input[i * stride] * 4;
2250 : ft(temp_in, output);
2251 : }
2252 :
2253 : void av1_dpcm_ft16_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
2254 : tran_low_t *output) {
2255 : assert(tx_type < TX_TYPES_1D);
2256 : static const transform_1d FHT[] = { fdct16, fadst16, fadst16, fidtx16 };
2257 : const transform_1d ft = FHT[tx_type];
2258 : tran_low_t temp_in[16];
2259 : for (int i = 0; i < 16; ++i)
2260 : temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 2 * Sqrt2);
2261 : ft(temp_in, output);
2262 : }
2263 :
2264 : void av1_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
2265 : tran_low_t *output) {
2266 : assert(tx_type < TX_TYPES_1D);
2267 : static const transform_1d FHT[] = { fdct32, fhalfright32, fhalfright32,
2268 : fidtx32 };
2269 : const transform_1d ft = FHT[tx_type];
2270 : tran_low_t temp_in[32];
2271 : for (int i = 0; i < 32; ++i) temp_in[i] = input[i * stride];
2272 : ft(temp_in, output);
2273 : }
2274 : #endif // CONFIG_DPCM_INTRA
2275 : #endif // !AV1_DCT_GTEST
|