Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <math.h>
13 : #include <string.h>
14 :
15 : #include "./aom_dsp_rtcd.h"
16 : #include "aom_dsp/inv_txfm.h"
17 :
18 0 : void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
19 : /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
20 : 0.5 shifts per pixel. */
21 : int i;
22 : tran_low_t output[16];
23 : tran_high_t a1, b1, c1, d1, e1;
24 0 : const tran_low_t *ip = input;
25 0 : tran_low_t *op = output;
26 :
27 0 : for (i = 0; i < 4; i++) {
28 0 : a1 = ip[0] >> UNIT_QUANT_SHIFT;
29 0 : c1 = ip[1] >> UNIT_QUANT_SHIFT;
30 0 : d1 = ip[2] >> UNIT_QUANT_SHIFT;
31 0 : b1 = ip[3] >> UNIT_QUANT_SHIFT;
32 0 : a1 += c1;
33 0 : d1 -= b1;
34 0 : e1 = (a1 - d1) >> 1;
35 0 : b1 = e1 - b1;
36 0 : c1 = e1 - c1;
37 0 : a1 -= b1;
38 0 : d1 += c1;
39 0 : op[0] = WRAPLOW(a1);
40 0 : op[1] = WRAPLOW(b1);
41 0 : op[2] = WRAPLOW(c1);
42 0 : op[3] = WRAPLOW(d1);
43 0 : ip += 4;
44 0 : op += 4;
45 : }
46 :
47 0 : ip = output;
48 0 : for (i = 0; i < 4; i++) {
49 0 : a1 = ip[4 * 0];
50 0 : c1 = ip[4 * 1];
51 0 : d1 = ip[4 * 2];
52 0 : b1 = ip[4 * 3];
53 0 : a1 += c1;
54 0 : d1 -= b1;
55 0 : e1 = (a1 - d1) >> 1;
56 0 : b1 = e1 - b1;
57 0 : c1 = e1 - c1;
58 0 : a1 -= b1;
59 0 : d1 += c1;
60 0 : dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
61 0 : dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
62 0 : dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
63 0 : dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
64 :
65 0 : ip++;
66 0 : dest++;
67 : }
68 0 : }
69 :
70 0 : void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
71 : int i;
72 : tran_high_t a1, e1;
73 : tran_low_t tmp[4];
74 0 : const tran_low_t *ip = in;
75 0 : tran_low_t *op = tmp;
76 :
77 0 : a1 = ip[0] >> UNIT_QUANT_SHIFT;
78 0 : e1 = a1 >> 1;
79 0 : a1 -= e1;
80 0 : op[0] = WRAPLOW(a1);
81 0 : op[1] = op[2] = op[3] = WRAPLOW(e1);
82 :
83 0 : ip = tmp;
84 0 : for (i = 0; i < 4; i++) {
85 0 : e1 = ip[0] >> 1;
86 0 : a1 = ip[0] - e1;
87 0 : dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
88 0 : dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
89 0 : dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
90 0 : dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
91 0 : ip++;
92 0 : dest++;
93 : }
94 0 : }
95 :
96 0 : void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
97 : tran_low_t step[4];
98 : tran_high_t temp1, temp2;
99 : // stage 1
100 0 : temp1 = (input[0] + input[2]) * cospi_16_64;
101 0 : temp2 = (input[0] - input[2]) * cospi_16_64;
102 0 : step[0] = WRAPLOW(dct_const_round_shift(temp1));
103 0 : step[1] = WRAPLOW(dct_const_round_shift(temp2));
104 0 : temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
105 0 : temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
106 0 : step[2] = WRAPLOW(dct_const_round_shift(temp1));
107 0 : step[3] = WRAPLOW(dct_const_round_shift(temp2));
108 :
109 : // stage 2
110 0 : output[0] = WRAPLOW(step[0] + step[3]);
111 0 : output[1] = WRAPLOW(step[1] + step[2]);
112 0 : output[2] = WRAPLOW(step[1] - step[2]);
113 0 : output[3] = WRAPLOW(step[0] - step[3]);
114 0 : }
115 :
116 0 : void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
117 : tran_low_t out[4 * 4];
118 0 : tran_low_t *outptr = out;
119 : int i, j;
120 : tran_low_t temp_in[4], temp_out[4];
121 :
122 : // Rows
123 0 : for (i = 0; i < 4; ++i) {
124 0 : aom_idct4_c(input, outptr);
125 0 : input += 4;
126 0 : outptr += 4;
127 : }
128 :
129 : // Columns
130 0 : for (i = 0; i < 4; ++i) {
131 0 : for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
132 0 : aom_idct4_c(temp_in, temp_out);
133 0 : for (j = 0; j < 4; ++j) {
134 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
135 0 : ROUND_POWER_OF_TWO(temp_out[j], 4));
136 : }
137 : }
138 0 : }
139 :
140 0 : void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
141 : int dest_stride) {
142 : int i;
143 : tran_high_t a1;
144 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
145 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
146 0 : a1 = ROUND_POWER_OF_TWO(out, 4);
147 :
148 0 : if (a1 == 0) return;
149 :
150 0 : for (i = 0; i < 4; i++) {
151 0 : dest[0] = clip_pixel_add(dest[0], a1);
152 0 : dest[1] = clip_pixel_add(dest[1], a1);
153 0 : dest[2] = clip_pixel_add(dest[2], a1);
154 0 : dest[3] = clip_pixel_add(dest[3], a1);
155 0 : dest += dest_stride;
156 : }
157 : }
158 :
159 0 : void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
160 : tran_low_t step1[8], step2[8];
161 : tran_high_t temp1, temp2;
162 : // stage 1
163 0 : step1[0] = input[0];
164 0 : step1[2] = input[4];
165 0 : step1[1] = input[2];
166 0 : step1[3] = input[6];
167 0 : temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
168 0 : temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
169 0 : step1[4] = WRAPLOW(dct_const_round_shift(temp1));
170 0 : step1[7] = WRAPLOW(dct_const_round_shift(temp2));
171 0 : temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
172 0 : temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
173 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
174 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
175 :
176 : // stage 2
177 0 : temp1 = (step1[0] + step1[2]) * cospi_16_64;
178 0 : temp2 = (step1[0] - step1[2]) * cospi_16_64;
179 0 : step2[0] = WRAPLOW(dct_const_round_shift(temp1));
180 0 : step2[1] = WRAPLOW(dct_const_round_shift(temp2));
181 0 : temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
182 0 : temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
183 0 : step2[2] = WRAPLOW(dct_const_round_shift(temp1));
184 0 : step2[3] = WRAPLOW(dct_const_round_shift(temp2));
185 0 : step2[4] = WRAPLOW(step1[4] + step1[5]);
186 0 : step2[5] = WRAPLOW(step1[4] - step1[5]);
187 0 : step2[6] = WRAPLOW(-step1[6] + step1[7]);
188 0 : step2[7] = WRAPLOW(step1[6] + step1[7]);
189 :
190 : // stage 3
191 0 : step1[0] = WRAPLOW(step2[0] + step2[3]);
192 0 : step1[1] = WRAPLOW(step2[1] + step2[2]);
193 0 : step1[2] = WRAPLOW(step2[1] - step2[2]);
194 0 : step1[3] = WRAPLOW(step2[0] - step2[3]);
195 0 : step1[4] = step2[4];
196 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
197 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
198 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
199 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
200 0 : step1[7] = step2[7];
201 :
202 : // stage 4
203 0 : output[0] = WRAPLOW(step1[0] + step1[7]);
204 0 : output[1] = WRAPLOW(step1[1] + step1[6]);
205 0 : output[2] = WRAPLOW(step1[2] + step1[5]);
206 0 : output[3] = WRAPLOW(step1[3] + step1[4]);
207 0 : output[4] = WRAPLOW(step1[3] - step1[4]);
208 0 : output[5] = WRAPLOW(step1[2] - step1[5]);
209 0 : output[6] = WRAPLOW(step1[1] - step1[6]);
210 0 : output[7] = WRAPLOW(step1[0] - step1[7]);
211 0 : }
212 :
213 0 : void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
214 : tran_low_t out[8 * 8];
215 0 : tran_low_t *outptr = out;
216 : int i, j;
217 : tran_low_t temp_in[8], temp_out[8];
218 :
219 : // First transform rows
220 0 : for (i = 0; i < 8; ++i) {
221 0 : aom_idct8_c(input, outptr);
222 0 : input += 8;
223 0 : outptr += 8;
224 : }
225 :
226 : // Then transform columns
227 0 : for (i = 0; i < 8; ++i) {
228 0 : for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
229 0 : aom_idct8_c(temp_in, temp_out);
230 0 : for (j = 0; j < 8; ++j) {
231 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
232 0 : ROUND_POWER_OF_TWO(temp_out[j], 5));
233 : }
234 : }
235 0 : }
236 :
237 0 : void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
238 : int i, j;
239 : tran_high_t a1;
240 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
241 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
242 0 : a1 = ROUND_POWER_OF_TWO(out, 5);
243 0 : if (a1 == 0) return;
244 0 : for (j = 0; j < 8; ++j) {
245 0 : for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
246 0 : dest += stride;
247 : }
248 : }
249 :
250 0 : void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
251 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
252 :
253 0 : tran_low_t x0 = input[0];
254 0 : tran_low_t x1 = input[1];
255 0 : tran_low_t x2 = input[2];
256 0 : tran_low_t x3 = input[3];
257 :
258 0 : if (!(x0 | x1 | x2 | x3)) {
259 0 : output[0] = output[1] = output[2] = output[3] = 0;
260 0 : return;
261 : }
262 :
263 0 : s0 = sinpi_1_9 * x0;
264 0 : s1 = sinpi_2_9 * x0;
265 0 : s2 = sinpi_3_9 * x1;
266 0 : s3 = sinpi_4_9 * x2;
267 0 : s4 = sinpi_1_9 * x2;
268 0 : s5 = sinpi_2_9 * x3;
269 0 : s6 = sinpi_4_9 * x3;
270 0 : s7 = WRAPLOW(x0 - x2 + x3);
271 :
272 0 : s0 = s0 + s3 + s5;
273 0 : s1 = s1 - s4 - s6;
274 0 : s3 = s2;
275 0 : s2 = sinpi_3_9 * s7;
276 :
277 : // 1-D transform scaling factor is sqrt(2).
278 : // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
279 : // + 1b (addition) = 29b.
280 : // Hence the output bit depth is 15b.
281 0 : output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
282 0 : output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
283 0 : output[2] = WRAPLOW(dct_const_round_shift(s2));
284 0 : output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
285 : }
286 :
287 0 : void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
288 : int s0, s1, s2, s3, s4, s5, s6, s7;
289 :
290 0 : tran_high_t x0 = input[7];
291 0 : tran_high_t x1 = input[0];
292 0 : tran_high_t x2 = input[5];
293 0 : tran_high_t x3 = input[2];
294 0 : tran_high_t x4 = input[3];
295 0 : tran_high_t x5 = input[4];
296 0 : tran_high_t x6 = input[1];
297 0 : tran_high_t x7 = input[6];
298 :
299 0 : if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
300 0 : output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
301 0 : output[6] = output[7] = 0;
302 0 : return;
303 : }
304 :
305 : // stage 1
306 0 : s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
307 0 : s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
308 0 : s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
309 0 : s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
310 0 : s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
311 0 : s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
312 0 : s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
313 0 : s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
314 :
315 0 : x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
316 0 : x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
317 0 : x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
318 0 : x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
319 0 : x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
320 0 : x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
321 0 : x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
322 0 : x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
323 :
324 : // stage 2
325 0 : s0 = (int)x0;
326 0 : s1 = (int)x1;
327 0 : s2 = (int)x2;
328 0 : s3 = (int)x3;
329 0 : s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
330 0 : s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
331 0 : s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
332 0 : s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
333 :
334 0 : x0 = WRAPLOW(s0 + s2);
335 0 : x1 = WRAPLOW(s1 + s3);
336 0 : x2 = WRAPLOW(s0 - s2);
337 0 : x3 = WRAPLOW(s1 - s3);
338 0 : x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
339 0 : x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
340 0 : x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
341 0 : x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
342 :
343 : // stage 3
344 0 : s2 = (int)(cospi_16_64 * (x2 + x3));
345 0 : s3 = (int)(cospi_16_64 * (x2 - x3));
346 0 : s6 = (int)(cospi_16_64 * (x6 + x7));
347 0 : s7 = (int)(cospi_16_64 * (x6 - x7));
348 :
349 0 : x2 = WRAPLOW(dct_const_round_shift(s2));
350 0 : x3 = WRAPLOW(dct_const_round_shift(s3));
351 0 : x6 = WRAPLOW(dct_const_round_shift(s6));
352 0 : x7 = WRAPLOW(dct_const_round_shift(s7));
353 :
354 0 : output[0] = WRAPLOW(x0);
355 0 : output[1] = WRAPLOW(-x4);
356 0 : output[2] = WRAPLOW(x6);
357 0 : output[3] = WRAPLOW(-x2);
358 0 : output[4] = WRAPLOW(x3);
359 0 : output[5] = WRAPLOW(-x7);
360 0 : output[6] = WRAPLOW(x5);
361 0 : output[7] = WRAPLOW(-x1);
362 : }
363 :
364 0 : void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
365 0 : tran_low_t out[8 * 8] = { 0 };
366 0 : tran_low_t *outptr = out;
367 : int i, j;
368 : tran_low_t temp_in[8], temp_out[8];
369 :
370 : // First transform rows
371 : // only first 4 row has non-zero coefs
372 0 : for (i = 0; i < 4; ++i) {
373 0 : aom_idct8_c(input, outptr);
374 0 : input += 8;
375 0 : outptr += 8;
376 : }
377 :
378 : // Then transform columns
379 0 : for (i = 0; i < 8; ++i) {
380 0 : for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
381 0 : aom_idct8_c(temp_in, temp_out);
382 0 : for (j = 0; j < 8; ++j) {
383 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
384 0 : ROUND_POWER_OF_TWO(temp_out[j], 5));
385 : }
386 : }
387 0 : }
388 :
// One-dimensional 16-point inverse DCT (per the function name).
//
// input:  16 coefficients; all of them are read in stage 1, before any
//         element of output is written.
// output: 16 results, written in stage 7.
//
// The work is done in seven stages that alternate between the two scratch
// arrays step1 and step2. Scaled values pass through dct_const_round_shift()
// and every stored sum/difference passes through WRAPLOW().
389 0 : void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
390 : tran_low_t step1[16], step2[16];
391 : tran_high_t temp1, temp2;
392 :
393 : // stage 1: copy the inputs into step1 in permuted order (each source index is written as n / 2)
394 0 : step1[0] = input[0 / 2];
395 0 : step1[1] = input[16 / 2];
396 0 : step1[2] = input[8 / 2];
397 0 : step1[3] = input[24 / 2];
398 0 : step1[4] = input[4 / 2];
399 0 : step1[5] = input[20 / 2];
400 0 : step1[6] = input[12 / 2];
401 0 : step1[7] = input[28 / 2];
402 0 : step1[8] = input[2 / 2];
403 0 : step1[9] = input[18 / 2];
404 0 : step1[10] = input[10 / 2];
405 0 : step1[11] = input[26 / 2];
406 0 : step1[12] = input[6 / 2];
407 0 : step1[13] = input[22 / 2];
408 0 : step1[14] = input[14 / 2];
409 0 : step1[15] = input[30 / 2];
410 :
411 : // stage 2: entries 0..7 pass through; pairs (8,15), (9,14), (10,13), (11,12) are rotated
412 0 : step2[0] = step1[0];
413 0 : step2[1] = step1[1];
414 0 : step2[2] = step1[2];
415 0 : step2[3] = step1[3];
416 0 : step2[4] = step1[4];
417 0 : step2[5] = step1[5];
418 0 : step2[6] = step1[6];
419 0 : step2[7] = step1[7];
420 :
421 0 : temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
422 0 : temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
423 0 : step2[8] = WRAPLOW(dct_const_round_shift(temp1));
424 0 : step2[15] = WRAPLOW(dct_const_round_shift(temp2));
425 :
426 0 : temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
427 0 : temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
428 0 : step2[9] = WRAPLOW(dct_const_round_shift(temp1));
429 0 : step2[14] = WRAPLOW(dct_const_round_shift(temp2));
430 :
431 0 : temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
432 0 : temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
433 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
434 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
435 :
436 0 : temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
437 0 : temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
438 0 : step2[11] = WRAPLOW(dct_const_round_shift(temp1));
439 0 : step2[12] = WRAPLOW(dct_const_round_shift(temp2));
440 :
441 : // stage 3: entries 0..3 pass through; pairs (4,7), (5,6) are rotated; entries 8..15 are combined by add/subtract in adjacent pairs
442 0 : step1[0] = step2[0];
443 0 : step1[1] = step2[1];
444 0 : step1[2] = step2[2];
445 0 : step1[3] = step2[3];
446 :
447 0 : temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
448 0 : temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
449 0 : step1[4] = WRAPLOW(dct_const_round_shift(temp1));
450 0 : step1[7] = WRAPLOW(dct_const_round_shift(temp2));
451 0 : temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
452 0 : temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
453 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
454 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
455 :
456 0 : step1[8] = WRAPLOW(step2[8] + step2[9]);
457 0 : step1[9] = WRAPLOW(step2[8] - step2[9]);
458 0 : step1[10] = WRAPLOW(-step2[10] + step2[11]);
459 0 : step1[11] = WRAPLOW(step2[10] + step2[11]);
460 0 : step1[12] = WRAPLOW(step2[12] + step2[13]);
461 0 : step1[13] = WRAPLOW(step2[12] - step2[13]);
462 0 : step1[14] = WRAPLOW(-step2[14] + step2[15]);
463 0 : step1[15] = WRAPLOW(step2[14] + step2[15]);
464 :
465 : // stage 4: (0,1) sum/difference scaled by cospi_16_64; (2,3) rotated; 4..7 add/subtract; (9,14) and (10,13) rotated; 8, 11, 12, 15 pass through
466 0 : temp1 = (step1[0] + step1[1]) * cospi_16_64;
467 0 : temp2 = (step1[0] - step1[1]) * cospi_16_64;
468 0 : step2[0] = WRAPLOW(dct_const_round_shift(temp1));
469 0 : step2[1] = WRAPLOW(dct_const_round_shift(temp2));
470 0 : temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
471 0 : temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
472 0 : step2[2] = WRAPLOW(dct_const_round_shift(temp1));
473 0 : step2[3] = WRAPLOW(dct_const_round_shift(temp2));
474 0 : step2[4] = WRAPLOW(step1[4] + step1[5]);
475 0 : step2[5] = WRAPLOW(step1[4] - step1[5]);
476 0 : step2[6] = WRAPLOW(-step1[6] + step1[7]);
477 0 : step2[7] = WRAPLOW(step1[6] + step1[7]);
478 :
479 0 : step2[8] = step1[8];
480 0 : step2[15] = step1[15];
481 0 : temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
482 0 : temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
483 0 : step2[9] = WRAPLOW(dct_const_round_shift(temp1));
484 0 : step2[14] = WRAPLOW(dct_const_round_shift(temp2));
485 0 : temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
486 0 : temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
487 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
488 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
489 0 : step2[11] = step1[11];
490 0 : step2[12] = step1[12];
491 :
492 : // stage 5: 0..3 add/subtract; 4 and 7 pass through while 5 and 6 come from the cospi_16_64-scaled difference/sum of step2[6] and step2[5]; 8..15 add/subtract
493 0 : step1[0] = WRAPLOW(step2[0] + step2[3]);
494 0 : step1[1] = WRAPLOW(step2[1] + step2[2]);
495 0 : step1[2] = WRAPLOW(step2[1] - step2[2]);
496 0 : step1[3] = WRAPLOW(step2[0] - step2[3]);
497 0 : step1[4] = step2[4];
498 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
499 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
500 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
501 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
502 0 : step1[7] = step2[7];
503 :
504 0 : step1[8] = WRAPLOW(step2[8] + step2[11]);
505 0 : step1[9] = WRAPLOW(step2[9] + step2[10]);
506 0 : step1[10] = WRAPLOW(step2[9] - step2[10]);
507 0 : step1[11] = WRAPLOW(step2[8] - step2[11]);
508 0 : step1[12] = WRAPLOW(-step2[12] + step2[15]);
509 0 : step1[13] = WRAPLOW(-step2[13] + step2[14]);
510 0 : step1[14] = WRAPLOW(step2[13] + step2[14]);
511 0 : step1[15] = WRAPLOW(step2[12] + step2[15]);
512 :
513 : // stage 6: 0..7 combine step1[k] with step1[7 - k]; (10,13) and (11,12) difference/sum scaled by cospi_16_64; 8, 9, 14, 15 pass through
514 0 : step2[0] = WRAPLOW(step1[0] + step1[7]);
515 0 : step2[1] = WRAPLOW(step1[1] + step1[6]);
516 0 : step2[2] = WRAPLOW(step1[2] + step1[5]);
517 0 : step2[3] = WRAPLOW(step1[3] + step1[4]);
518 0 : step2[4] = WRAPLOW(step1[3] - step1[4]);
519 0 : step2[5] = WRAPLOW(step1[2] - step1[5]);
520 0 : step2[6] = WRAPLOW(step1[1] - step1[6]);
521 0 : step2[7] = WRAPLOW(step1[0] - step1[7]);
522 0 : step2[8] = step1[8];
523 0 : step2[9] = step1[9];
524 0 : temp1 = (-step1[10] + step1[13]) * cospi_16_64;
525 0 : temp2 = (step1[10] + step1[13]) * cospi_16_64;
526 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
527 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
528 0 : temp1 = (-step1[11] + step1[12]) * cospi_16_64;
529 0 : temp2 = (step1[11] + step1[12]) * cospi_16_64;
530 0 : step2[11] = WRAPLOW(dct_const_round_shift(temp1));
531 0 : step2[12] = WRAPLOW(dct_const_round_shift(temp2));
532 0 : step2[14] = step1[14];
533 0 : step2[15] = step1[15];
534 :
535 : // stage 7: for k = 0..7, output[k] = step2[k] + step2[15 - k] and output[15 - k] = step2[k] - step2[15 - k]
536 0 : output[0] = WRAPLOW(step2[0] + step2[15]);
537 0 : output[1] = WRAPLOW(step2[1] + step2[14]);
538 0 : output[2] = WRAPLOW(step2[2] + step2[13]);
539 0 : output[3] = WRAPLOW(step2[3] + step2[12]);
540 0 : output[4] = WRAPLOW(step2[4] + step2[11]);
541 0 : output[5] = WRAPLOW(step2[5] + step2[10]);
542 0 : output[6] = WRAPLOW(step2[6] + step2[9]);
543 0 : output[7] = WRAPLOW(step2[7] + step2[8]);
544 0 : output[8] = WRAPLOW(step2[7] - step2[8]);
545 0 : output[9] = WRAPLOW(step2[6] - step2[9]);
546 0 : output[10] = WRAPLOW(step2[5] - step2[10]);
547 0 : output[11] = WRAPLOW(step2[4] - step2[11]);
548 0 : output[12] = WRAPLOW(step2[3] - step2[12]);
549 0 : output[13] = WRAPLOW(step2[2] - step2[13]);
550 0 : output[14] = WRAPLOW(step2[1] - step2[14]);
551 0 : output[15] = WRAPLOW(step2[0] - step2[15]);
552 0 : }
553 :
554 0 : void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
555 : int stride) {
556 : tran_low_t out[16 * 16];
557 0 : tran_low_t *outptr = out;
558 : int i, j;
559 : tran_low_t temp_in[16], temp_out[16];
560 :
561 : // First transform rows
562 0 : for (i = 0; i < 16; ++i) {
563 0 : aom_idct16_c(input, outptr);
564 0 : input += 16;
565 0 : outptr += 16;
566 : }
567 :
568 : // Then transform columns
569 0 : for (i = 0; i < 16; ++i) {
570 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
571 0 : aom_idct16_c(temp_in, temp_out);
572 0 : for (j = 0; j < 16; ++j) {
573 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
574 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
575 : }
576 : }
577 0 : }
578 :
// One-dimensional 16-point inverse ADST (per the function name).
//
// input:  16 coefficients, loaded into x0..x15 in permuted order before any
//         element of output is written.
// output: 16 results; all zeros when every input is zero.
//
// Four stages follow. In each one the s-values are built from the x-values
// (either copied or multiplied by cospi constants), then the x-values are
// rebuilt from sums/differences of the s-values. Only sums of cospi-scaled
// terms go through dct_const_round_shift(); everything is wrapped by WRAPLOW().
579 0 : void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
580 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
581 : tran_high_t s9, s10, s11, s12, s13, s14, s15;
582 :
583 0 : tran_high_t x0 = input[15];
584 0 : tran_high_t x1 = input[0];
585 0 : tran_high_t x2 = input[13];
586 0 : tran_high_t x3 = input[2];
587 0 : tran_high_t x4 = input[11];
588 0 : tran_high_t x5 = input[4];
589 0 : tran_high_t x6 = input[9];
590 0 : tran_high_t x7 = input[6];
591 0 : tran_high_t x8 = input[7];
592 0 : tran_high_t x9 = input[8];
593 0 : tran_high_t x10 = input[5];
594 0 : tran_high_t x11 = input[10];
595 0 : tran_high_t x12 = input[3];
596 0 : tran_high_t x13 = input[12];
597 0 : tran_high_t x14 = input[1];
598 0 : tran_high_t x15 = input[14];
599 :
// Early out: the OR of all inputs is zero only when every input is zero, in
// which case the output is set to all zeros.
600 0 : if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
601 0 : x13 | x14 | x15)) {
602 0 : output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
603 0 : output[6] = output[7] = output[8] = output[9] = output[10] =
604 0 : output[11] = output[12] = output[13] = output[14] = output[15] = 0;
605 0 : return;
606 : }
607 :
608 : // stage 1: rotate each adjacent pair (x0,x1) ... (x14,x15) by odd-numbered cospi constants, then combine s[k] with s[k + 8]
609 0 : s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
610 0 : s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
611 0 : s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
612 0 : s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
613 0 : s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
614 0 : s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
615 0 : s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
616 0 : s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
617 0 : s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
618 0 : s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
619 0 : s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
620 0 : s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
621 0 : s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
622 0 : s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
623 0 : s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
624 0 : s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
625 :
626 0 : x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
627 0 : x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
628 0 : x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
629 0 : x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
630 0 : x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
631 0 : x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
632 0 : x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
633 0 : x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
634 0 : x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
635 0 : x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
636 0 : x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
637 0 : x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
638 0 : x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
639 0 : x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
640 0 : x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
641 0 : x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
642 :
643 : // stage 2: x0..x7 are copied unscaled; pairs within x8..x15 are rotated; then s[k] is combined with s[k + 4] inside each half (rounding shift only for the scaled upper half)
644 0 : s0 = x0;
645 0 : s1 = x1;
646 0 : s2 = x2;
647 0 : s3 = x3;
648 0 : s4 = x4;
649 0 : s5 = x5;
650 0 : s6 = x6;
651 0 : s7 = x7;
652 0 : s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
653 0 : s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
654 0 : s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
655 0 : s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
656 0 : s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
657 0 : s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
658 0 : s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
659 0 : s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
660 :
661 0 : x0 = WRAPLOW(s0 + s4);
662 0 : x1 = WRAPLOW(s1 + s5);
663 0 : x2 = WRAPLOW(s2 + s6);
664 0 : x3 = WRAPLOW(s3 + s7);
665 0 : x4 = WRAPLOW(s0 - s4);
666 0 : x5 = WRAPLOW(s1 - s5);
667 0 : x6 = WRAPLOW(s2 - s6);
668 0 : x7 = WRAPLOW(s3 - s7);
669 0 : x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
670 0 : x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
671 0 : x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
672 0 : x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
673 0 : x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
674 0 : x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
675 0 : x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
676 0 : x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
677 :
678 : // stage 3: x0..x3 and x8..x11 are copied unscaled; x4..x7 and x12..x15 are rotated by the cospi_8_64 / cospi_24_64 pair; then s[k] is combined with s[k + 2] inside each group of four
679 0 : s0 = x0;
680 0 : s1 = x1;
681 0 : s2 = x2;
682 0 : s3 = x3;
683 0 : s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
684 0 : s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
685 0 : s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
686 0 : s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
687 0 : s8 = x8;
688 0 : s9 = x9;
689 0 : s10 = x10;
690 0 : s11 = x11;
691 0 : s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
692 0 : s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
693 0 : s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
694 0 : s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
695 :
696 0 : x0 = WRAPLOW(s0 + s2);
697 0 : x1 = WRAPLOW(s1 + s3);
698 0 : x2 = WRAPLOW(s0 - s2);
699 0 : x3 = WRAPLOW(s1 - s3);
700 0 : x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
701 0 : x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
702 0 : x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
703 0 : x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
704 0 : x8 = WRAPLOW(s8 + s10);
705 0 : x9 = WRAPLOW(s9 + s11);
706 0 : x10 = WRAPLOW(s8 - s10);
707 0 : x11 = WRAPLOW(s9 - s11);
708 0 : x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
709 0 : x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
710 0 : x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
711 0 : x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
712 :
713 : // stage 4: sums/differences of (x2,x3), (x6,x7), (x10,x11), (x14,x15) scaled by +/- cospi_16_64; note the sign pattern differs between the pairs
714 0 : s2 = (-cospi_16_64) * (x2 + x3);
715 0 : s3 = cospi_16_64 * (x2 - x3);
716 0 : s6 = cospi_16_64 * (x6 + x7);
717 0 : s7 = cospi_16_64 * (-x6 + x7);
718 0 : s10 = cospi_16_64 * (x10 + x11);
719 0 : s11 = cospi_16_64 * (-x10 + x11);
720 0 : s14 = (-cospi_16_64) * (x14 + x15);
721 0 : s15 = cospi_16_64 * (x14 - x15);
722 :
723 0 : x2 = WRAPLOW(dct_const_round_shift(s2));
724 0 : x3 = WRAPLOW(dct_const_round_shift(s3));
725 0 : x6 = WRAPLOW(dct_const_round_shift(s6));
726 0 : x7 = WRAPLOW(dct_const_round_shift(s7));
727 0 : x10 = WRAPLOW(dct_const_round_shift(s10));
728 0 : x11 = WRAPLOW(dct_const_round_shift(s11));
729 0 : x14 = WRAPLOW(dct_const_round_shift(s14));
730 0 : x15 = WRAPLOW(dct_const_round_shift(s15));
731 :
// Write the results in permuted order; outputs 1, 3, 13 and 15 are negated.
732 0 : output[0] = WRAPLOW(x0);
733 0 : output[1] = WRAPLOW(-x8);
734 0 : output[2] = WRAPLOW(x12);
735 0 : output[3] = WRAPLOW(-x4);
736 0 : output[4] = WRAPLOW(x6);
737 0 : output[5] = WRAPLOW(x14);
738 0 : output[6] = WRAPLOW(x10);
739 0 : output[7] = WRAPLOW(x2);
740 0 : output[8] = WRAPLOW(x3);
741 0 : output[9] = WRAPLOW(x11);
742 0 : output[10] = WRAPLOW(x15);
743 0 : output[11] = WRAPLOW(x7);
744 0 : output[12] = WRAPLOW(x5);
745 0 : output[13] = WRAPLOW(-x13);
746 0 : output[14] = WRAPLOW(x9);
747 0 : output[15] = WRAPLOW(-x1);
748 : }
749 :
750 0 : void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
751 : int stride) {
752 : int i, j;
753 0 : tran_low_t out[16 * 16] = { 0 };
754 0 : tran_low_t *outptr = out;
755 : tran_low_t temp_in[16], temp_out[16];
756 :
757 : // First transform rows. Since all non-zero dct coefficients are in
758 : // upper-left 8x8 area, we only need to calculate first 8 rows here.
759 0 : for (i = 0; i < 8; ++i) {
760 0 : aom_idct16_c(input, outptr);
761 0 : input += 16;
762 0 : outptr += 16;
763 : }
764 :
765 : // Then transform columns
766 0 : for (i = 0; i < 16; ++i) {
767 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
768 0 : aom_idct16_c(temp_in, temp_out);
769 0 : for (j = 0; j < 16; ++j) {
770 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
771 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
772 : }
773 : }
774 0 : }
775 :
776 0 : void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
777 : int stride) {
778 0 : tran_low_t out[16 * 16] = { 0 };
779 0 : tran_low_t *outptr = out;
780 : int i, j;
781 : tran_low_t temp_in[16], temp_out[16];
782 :
783 : // First transform rows. Since all non-zero dct coefficients are in
784 : // upper-left 4x4 area, we only need to calculate first 4 rows here.
785 0 : for (i = 0; i < 4; ++i) {
786 0 : aom_idct16_c(input, outptr);
787 0 : input += 16;
788 0 : outptr += 16;
789 : }
790 :
791 : // Then transform columns
792 0 : for (i = 0; i < 16; ++i) {
793 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
794 0 : aom_idct16_c(temp_in, temp_out);
795 0 : for (j = 0; j < 16; ++j) {
796 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
797 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
798 : }
799 : }
800 0 : }
801 :
802 0 : void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
803 : int i, j;
804 : tran_high_t a1;
805 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
806 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
807 0 : a1 = ROUND_POWER_OF_TWO(out, 6);
808 0 : if (a1 == 0) return;
809 0 : for (j = 0; j < 16; ++j) {
810 0 : for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
811 0 : dest += stride;
812 : }
813 : }
814 :
// 32-point 1-D inverse DCT (going by the function name).
//
// Reads the 32 coefficients input[0..31] and writes 32 results to
// output[0..31]. The computation is a fixed sequence of stages that
// ping-pong between the two scratch arrays step1[] and step2[].
//
// Every weighted sum is held in a tran_high_t temporary (temp1/temp2), passed
// through dct_const_round_shift() and narrowed with WRAPLOW() before being
// stored; plain sums and differences are narrowed with WRAPLOW() only.
//
// All reads of input[] happen in stage 1 and all writes to output[] happen in
// the final stage.
815 0 : void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
816 : tran_low_t step1[32], step2[32];
817 : tran_high_t temp1, temp2;
818 :
819 : // stage 1
// Even-indexed inputs are copied into step1[0..15] in permuted order.
820 0 : step1[0] = input[0];
821 0 : step1[1] = input[16];
822 0 : step1[2] = input[8];
823 0 : step1[3] = input[24];
824 0 : step1[4] = input[4];
825 0 : step1[5] = input[20];
826 0 : step1[6] = input[12];
827 0 : step1[7] = input[28];
828 0 : step1[8] = input[2];
829 0 : step1[9] = input[18];
830 0 : step1[10] = input[10];
831 0 : step1[11] = input[26];
832 0 : step1[12] = input[6];
833 0 : step1[13] = input[22];
834 0 : step1[14] = input[14];
835 0 : step1[15] = input[30];
836 :
// Odd-indexed inputs are combined in pairs (k, 32 - k). Each pair fills one
// slot step1[16 + m] and its mirror slot step1[31 - m], m = 0..7.
837 0 : temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
838 0 : temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
839 0 : step1[16] = WRAPLOW(dct_const_round_shift(temp1));
840 0 : step1[31] = WRAPLOW(dct_const_round_shift(temp2));
841 :
842 0 : temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
843 0 : temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
844 0 : step1[17] = WRAPLOW(dct_const_round_shift(temp1));
845 0 : step1[30] = WRAPLOW(dct_const_round_shift(temp2));
846 :
847 0 : temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
848 0 : temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
849 0 : step1[18] = WRAPLOW(dct_const_round_shift(temp1));
850 0 : step1[29] = WRAPLOW(dct_const_round_shift(temp2));
851 :
852 0 : temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
853 0 : temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
854 0 : step1[19] = WRAPLOW(dct_const_round_shift(temp1));
855 0 : step1[28] = WRAPLOW(dct_const_round_shift(temp2));
856 :
857 0 : temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
858 0 : temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
859 0 : step1[20] = WRAPLOW(dct_const_round_shift(temp1));
860 0 : step1[27] = WRAPLOW(dct_const_round_shift(temp2));
861 :
862 0 : temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
863 0 : temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
864 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
865 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
866 :
867 0 : temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
868 0 : temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
869 0 : step1[22] = WRAPLOW(dct_const_round_shift(temp1));
870 0 : step1[25] = WRAPLOW(dct_const_round_shift(temp2));
871 :
872 0 : temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
873 0 : temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
874 0 : step1[23] = WRAPLOW(dct_const_round_shift(temp1));
875 0 : step1[24] = WRAPLOW(dct_const_round_shift(temp2));
876 :
877 : // stage 2
// step2[0..7] are copied through unchanged, step2[8..15] are weighted pairs
// of step1[8..15], and step2[16..31] are sums/differences of neighbouring
// step1 entries.
878 0 : step2[0] = step1[0];
879 0 : step2[1] = step1[1];
880 0 : step2[2] = step1[2];
881 0 : step2[3] = step1[3];
882 0 : step2[4] = step1[4];
883 0 : step2[5] = step1[5];
884 0 : step2[6] = step1[6];
885 0 : step2[7] = step1[7];
886 :
887 0 : temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
888 0 : temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
889 0 : step2[8] = WRAPLOW(dct_const_round_shift(temp1));
890 0 : step2[15] = WRAPLOW(dct_const_round_shift(temp2));
891 :
892 0 : temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
893 0 : temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
894 0 : step2[9] = WRAPLOW(dct_const_round_shift(temp1));
895 0 : step2[14] = WRAPLOW(dct_const_round_shift(temp2));
896 :
897 0 : temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
898 0 : temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
899 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
900 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
901 :
902 0 : temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
903 0 : temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
904 0 : step2[11] = WRAPLOW(dct_const_round_shift(temp1));
905 0 : step2[12] = WRAPLOW(dct_const_round_shift(temp2));
906 :
907 0 : step2[16] = WRAPLOW(step1[16] + step1[17]);
908 0 : step2[17] = WRAPLOW(step1[16] - step1[17]);
909 0 : step2[18] = WRAPLOW(-step1[18] + step1[19]);
910 0 : step2[19] = WRAPLOW(step1[18] + step1[19]);
911 0 : step2[20] = WRAPLOW(step1[20] + step1[21]);
912 0 : step2[21] = WRAPLOW(step1[20] - step1[21]);
913 0 : step2[22] = WRAPLOW(-step1[22] + step1[23]);
914 0 : step2[23] = WRAPLOW(step1[22] + step1[23]);
915 0 : step2[24] = WRAPLOW(step1[24] + step1[25]);
916 0 : step2[25] = WRAPLOW(step1[24] - step1[25]);
917 0 : step2[26] = WRAPLOW(-step1[26] + step1[27]);
918 0 : step2[27] = WRAPLOW(step1[26] + step1[27]);
919 0 : step2[28] = WRAPLOW(step1[28] + step1[29]);
920 0 : step2[29] = WRAPLOW(step1[28] - step1[29]);
921 0 : step2[30] = WRAPLOW(-step1[30] + step1[31]);
922 0 : step2[31] = WRAPLOW(step1[30] + step1[31]);
923 :
924 : // stage 3
925 0 : step1[0] = step2[0];
926 0 : step1[1] = step2[1];
927 0 : step1[2] = step2[2];
928 0 : step1[3] = step2[3];
929 :
930 0 : temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
931 0 : temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
932 0 : step1[4] = WRAPLOW(dct_const_round_shift(temp1));
933 0 : step1[7] = WRAPLOW(dct_const_round_shift(temp2));
934 0 : temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
935 0 : temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
936 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
937 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
938 :
939 0 : step1[8] = WRAPLOW(step2[8] + step2[9]);
940 0 : step1[9] = WRAPLOW(step2[8] - step2[9]);
941 0 : step1[10] = WRAPLOW(-step2[10] + step2[11]);
942 0 : step1[11] = WRAPLOW(step2[10] + step2[11]);
943 0 : step1[12] = WRAPLOW(step2[12] + step2[13]);
944 0 : step1[13] = WRAPLOW(step2[12] - step2[13]);
945 0 : step1[14] = WRAPLOW(-step2[14] + step2[15]);
946 0 : step1[15] = WRAPLOW(step2[14] + step2[15]);
947 :
948 0 : step1[16] = step2[16];
949 0 : step1[31] = step2[31];
950 0 : temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
951 0 : temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
952 0 : step1[17] = WRAPLOW(dct_const_round_shift(temp1));
953 0 : step1[30] = WRAPLOW(dct_const_round_shift(temp2));
954 0 : temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
955 0 : temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
956 0 : step1[18] = WRAPLOW(dct_const_round_shift(temp1));
957 0 : step1[29] = WRAPLOW(dct_const_round_shift(temp2));
958 0 : step1[19] = step2[19];
959 0 : step1[20] = step2[20];
960 0 : temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
961 0 : temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
962 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
963 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
964 0 : temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
965 0 : temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
966 0 : step1[22] = WRAPLOW(dct_const_round_shift(temp1));
967 0 : step1[25] = WRAPLOW(dct_const_round_shift(temp2));
968 0 : step1[23] = step2[23];
969 0 : step1[24] = step2[24];
970 0 : step1[27] = step2[27];
971 0 : step1[28] = step2[28];
972 :
973 : // stage 4
974 0 : temp1 = (step1[0] + step1[1]) * cospi_16_64;
975 0 : temp2 = (step1[0] - step1[1]) * cospi_16_64;
976 0 : step2[0] = WRAPLOW(dct_const_round_shift(temp1));
977 0 : step2[1] = WRAPLOW(dct_const_round_shift(temp2));
978 0 : temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
979 0 : temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
980 0 : step2[2] = WRAPLOW(dct_const_round_shift(temp1));
981 0 : step2[3] = WRAPLOW(dct_const_round_shift(temp2));
982 0 : step2[4] = WRAPLOW(step1[4] + step1[5]);
983 0 : step2[5] = WRAPLOW(step1[4] - step1[5]);
984 0 : step2[6] = WRAPLOW(-step1[6] + step1[7]);
985 0 : step2[7] = WRAPLOW(step1[6] + step1[7]);
986 :
987 0 : step2[8] = step1[8];
988 0 : step2[15] = step1[15];
989 0 : temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
990 0 : temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
991 0 : step2[9] = WRAPLOW(dct_const_round_shift(temp1));
992 0 : step2[14] = WRAPLOW(dct_const_round_shift(temp2));
993 0 : temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
994 0 : temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
995 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
996 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
997 0 : step2[11] = step1[11];
998 0 : step2[12] = step1[12];
999 :
1000 0 : step2[16] = WRAPLOW(step1[16] + step1[19]);
1001 0 : step2[17] = WRAPLOW(step1[17] + step1[18]);
1002 0 : step2[18] = WRAPLOW(step1[17] - step1[18]);
1003 0 : step2[19] = WRAPLOW(step1[16] - step1[19]);
1004 0 : step2[20] = WRAPLOW(-step1[20] + step1[23]);
1005 0 : step2[21] = WRAPLOW(-step1[21] + step1[22]);
1006 0 : step2[22] = WRAPLOW(step1[21] + step1[22]);
1007 0 : step2[23] = WRAPLOW(step1[20] + step1[23]);
1008 :
1009 0 : step2[24] = WRAPLOW(step1[24] + step1[27]);
1010 0 : step2[25] = WRAPLOW(step1[25] + step1[26]);
1011 0 : step2[26] = WRAPLOW(step1[25] - step1[26]);
1012 0 : step2[27] = WRAPLOW(step1[24] - step1[27]);
1013 0 : step2[28] = WRAPLOW(-step1[28] + step1[31]);
1014 0 : step2[29] = WRAPLOW(-step1[29] + step1[30]);
1015 0 : step2[30] = WRAPLOW(step1[29] + step1[30]);
1016 0 : step2[31] = WRAPLOW(step1[28] + step1[31]);
1017 :
1018 : // stage 5
1019 0 : step1[0] = WRAPLOW(step2[0] + step2[3]);
1020 0 : step1[1] = WRAPLOW(step2[1] + step2[2]);
1021 0 : step1[2] = WRAPLOW(step2[1] - step2[2]);
1022 0 : step1[3] = WRAPLOW(step2[0] - step2[3]);
1023 0 : step1[4] = step2[4];
1024 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
1025 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
1026 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1027 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1028 0 : step1[7] = step2[7];
1029 :
1030 0 : step1[8] = WRAPLOW(step2[8] + step2[11]);
1031 0 : step1[9] = WRAPLOW(step2[9] + step2[10]);
1032 0 : step1[10] = WRAPLOW(step2[9] - step2[10]);
1033 0 : step1[11] = WRAPLOW(step2[8] - step2[11]);
1034 0 : step1[12] = WRAPLOW(-step2[12] + step2[15]);
1035 0 : step1[13] = WRAPLOW(-step2[13] + step2[14]);
1036 0 : step1[14] = WRAPLOW(step2[13] + step2[14]);
1037 0 : step1[15] = WRAPLOW(step2[12] + step2[15]);
1038 :
1039 0 : step1[16] = step2[16];
1040 0 : step1[17] = step2[17];
1041 0 : temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1042 0 : temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1043 0 : step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1044 0 : step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1045 0 : temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1046 0 : temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1047 0 : step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1048 0 : step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1049 0 : temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1050 0 : temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1051 0 : step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1052 0 : step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1053 0 : temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1054 0 : temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1055 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1056 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1057 0 : step1[22] = step2[22];
1058 0 : step1[23] = step2[23];
1059 0 : step1[24] = step2[24];
1060 0 : step1[25] = step2[25];
1061 0 : step1[30] = step2[30];
1062 0 : step1[31] = step2[31];
1063 :
1064 : // stage 6
1065 0 : step2[0] = WRAPLOW(step1[0] + step1[7]);
1066 0 : step2[1] = WRAPLOW(step1[1] + step1[6]);
1067 0 : step2[2] = WRAPLOW(step1[2] + step1[5]);
1068 0 : step2[3] = WRAPLOW(step1[3] + step1[4]);
1069 0 : step2[4] = WRAPLOW(step1[3] - step1[4]);
1070 0 : step2[5] = WRAPLOW(step1[2] - step1[5]);
1071 0 : step2[6] = WRAPLOW(step1[1] - step1[6]);
1072 0 : step2[7] = WRAPLOW(step1[0] - step1[7]);
1073 0 : step2[8] = step1[8];
1074 0 : step2[9] = step1[9];
1075 0 : temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1076 0 : temp2 = (step1[10] + step1[13]) * cospi_16_64;
1077 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1078 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1079 0 : temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1080 0 : temp2 = (step1[11] + step1[12]) * cospi_16_64;
1081 0 : step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1082 0 : step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1083 0 : step2[14] = step1[14];
1084 0 : step2[15] = step1[15];
1085 :
1086 0 : step2[16] = WRAPLOW(step1[16] + step1[23]);
1087 0 : step2[17] = WRAPLOW(step1[17] + step1[22]);
1088 0 : step2[18] = WRAPLOW(step1[18] + step1[21]);
1089 0 : step2[19] = WRAPLOW(step1[19] + step1[20]);
1090 0 : step2[20] = WRAPLOW(step1[19] - step1[20]);
1091 0 : step2[21] = WRAPLOW(step1[18] - step1[21]);
1092 0 : step2[22] = WRAPLOW(step1[17] - step1[22]);
1093 0 : step2[23] = WRAPLOW(step1[16] - step1[23]);
1094 :
1095 0 : step2[24] = WRAPLOW(-step1[24] + step1[31]);
1096 0 : step2[25] = WRAPLOW(-step1[25] + step1[30]);
1097 0 : step2[26] = WRAPLOW(-step1[26] + step1[29]);
1098 0 : step2[27] = WRAPLOW(-step1[27] + step1[28]);
1099 0 : step2[28] = WRAPLOW(step1[27] + step1[28]);
1100 0 : step2[29] = WRAPLOW(step1[26] + step1[29]);
1101 0 : step2[30] = WRAPLOW(step1[25] + step1[30]);
1102 0 : step2[31] = WRAPLOW(step1[24] + step1[31]);
1103 :
1104 : // stage 7
// The first 16 entries are mirrored sums/differences:
// step1[k] = step2[k] + step2[15 - k], step1[15 - k] = step2[k] - step2[15 - k].
1105 0 : step1[0] = WRAPLOW(step2[0] + step2[15]);
1106 0 : step1[1] = WRAPLOW(step2[1] + step2[14]);
1107 0 : step1[2] = WRAPLOW(step2[2] + step2[13]);
1108 0 : step1[3] = WRAPLOW(step2[3] + step2[12]);
1109 0 : step1[4] = WRAPLOW(step2[4] + step2[11]);
1110 0 : step1[5] = WRAPLOW(step2[5] + step2[10]);
1111 0 : step1[6] = WRAPLOW(step2[6] + step2[9]);
1112 0 : step1[7] = WRAPLOW(step2[7] + step2[8]);
1113 0 : step1[8] = WRAPLOW(step2[7] - step2[8]);
1114 0 : step1[9] = WRAPLOW(step2[6] - step2[9]);
1115 0 : step1[10] = WRAPLOW(step2[5] - step2[10]);
1116 0 : step1[11] = WRAPLOW(step2[4] - step2[11]);
1117 0 : step1[12] = WRAPLOW(step2[3] - step2[12]);
1118 0 : step1[13] = WRAPLOW(step2[2] - step2[13]);
1119 0 : step1[14] = WRAPLOW(step2[1] - step2[14]);
1120 0 : step1[15] = WRAPLOW(step2[0] - step2[15]);
1121 :
1122 0 : step1[16] = step2[16];
1123 0 : step1[17] = step2[17];
1124 0 : step1[18] = step2[18];
1125 0 : step1[19] = step2[19];
1126 0 : temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1127 0 : temp2 = (step2[20] + step2[27]) * cospi_16_64;
1128 0 : step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1129 0 : step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1130 0 : temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1131 0 : temp2 = (step2[21] + step2[26]) * cospi_16_64;
1132 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1133 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1134 0 : temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1135 0 : temp2 = (step2[22] + step2[25]) * cospi_16_64;
1136 0 : step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1137 0 : step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1138 0 : temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1139 0 : temp2 = (step2[23] + step2[24]) * cospi_16_64;
1140 0 : step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1141 0 : step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1142 0 : step1[28] = step2[28];
1143 0 : step1[29] = step2[29];
1144 0 : step1[30] = step2[30];
1145 0 : step1[31] = step2[31];
1146 :
1147 : // final stage
// For k = 0..15: output[k] = step1[k] + step1[31 - k] and
// output[31 - k] = step1[k] - step1[31 - k].
1148 0 : output[0] = WRAPLOW(step1[0] + step1[31]);
1149 0 : output[1] = WRAPLOW(step1[1] + step1[30]);
1150 0 : output[2] = WRAPLOW(step1[2] + step1[29]);
1151 0 : output[3] = WRAPLOW(step1[3] + step1[28]);
1152 0 : output[4] = WRAPLOW(step1[4] + step1[27]);
1153 0 : output[5] = WRAPLOW(step1[5] + step1[26]);
1154 0 : output[6] = WRAPLOW(step1[6] + step1[25]);
1155 0 : output[7] = WRAPLOW(step1[7] + step1[24]);
1156 0 : output[8] = WRAPLOW(step1[8] + step1[23]);
1157 0 : output[9] = WRAPLOW(step1[9] + step1[22]);
1158 0 : output[10] = WRAPLOW(step1[10] + step1[21]);
1159 0 : output[11] = WRAPLOW(step1[11] + step1[20]);
1160 0 : output[12] = WRAPLOW(step1[12] + step1[19]);
1161 0 : output[13] = WRAPLOW(step1[13] + step1[18]);
1162 0 : output[14] = WRAPLOW(step1[14] + step1[17]);
1163 0 : output[15] = WRAPLOW(step1[15] + step1[16]);
1164 0 : output[16] = WRAPLOW(step1[15] - step1[16]);
1165 0 : output[17] = WRAPLOW(step1[14] - step1[17]);
1166 0 : output[18] = WRAPLOW(step1[13] - step1[18]);
1167 0 : output[19] = WRAPLOW(step1[12] - step1[19]);
1168 0 : output[20] = WRAPLOW(step1[11] - step1[20]);
1169 0 : output[21] = WRAPLOW(step1[10] - step1[21]);
1170 0 : output[22] = WRAPLOW(step1[9] - step1[22]);
1171 0 : output[23] = WRAPLOW(step1[8] - step1[23]);
1172 0 : output[24] = WRAPLOW(step1[7] - step1[24]);
1173 0 : output[25] = WRAPLOW(step1[6] - step1[25]);
1174 0 : output[26] = WRAPLOW(step1[5] - step1[26]);
1175 0 : output[27] = WRAPLOW(step1[4] - step1[27]);
1176 0 : output[28] = WRAPLOW(step1[3] - step1[28]);
1177 0 : output[29] = WRAPLOW(step1[2] - step1[29]);
1178 0 : output[30] = WRAPLOW(step1[1] - step1[30]);
1179 0 : output[31] = WRAPLOW(step1[0] - step1[31]);
1180 0 : }
1181 :
1182 0 : void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1183 : int stride) {
1184 : tran_low_t out[32 * 32];
1185 0 : tran_low_t *outptr = out;
1186 : int i, j;
1187 : tran_low_t temp_in[32], temp_out[32];
1188 :
1189 : // Rows
1190 0 : for (i = 0; i < 32; ++i) {
1191 : int16_t zero_coeff[16];
1192 0 : for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1193 0 : for (j = 0; j < 8; ++j)
1194 0 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1195 0 : for (j = 0; j < 4; ++j)
1196 0 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1197 0 : for (j = 0; j < 2; ++j)
1198 0 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1199 :
1200 0 : if (zero_coeff[0] | zero_coeff[1])
1201 0 : aom_idct32_c(input, outptr);
1202 : else
1203 0 : memset(outptr, 0, sizeof(tran_low_t) * 32);
1204 0 : input += 32;
1205 0 : outptr += 32;
1206 : }
1207 :
1208 : // Columns
1209 0 : for (i = 0; i < 32; ++i) {
1210 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1211 0 : aom_idct32_c(temp_in, temp_out);
1212 0 : for (j = 0; j < 32; ++j) {
1213 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1214 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
1215 : }
1216 : }
1217 0 : }
1218 :
1219 0 : void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
1220 : int stride) {
1221 0 : tran_low_t out[32 * 32] = { 0 };
1222 0 : tran_low_t *outptr = out;
1223 : int i, j;
1224 : tran_low_t temp_in[32], temp_out[32];
1225 :
1226 : // Rows
1227 : // only upper-left 16x16 has non-zero coeff
1228 0 : for (i = 0; i < 16; ++i) {
1229 0 : aom_idct32_c(input, outptr);
1230 0 : input += 32;
1231 0 : outptr += 32;
1232 : }
1233 :
1234 : // Columns
1235 0 : for (i = 0; i < 32; ++i) {
1236 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1237 0 : aom_idct32_c(temp_in, temp_out);
1238 0 : for (j = 0; j < 32; ++j) {
1239 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1240 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
1241 : }
1242 : }
1243 0 : }
1244 :
1245 0 : void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1246 : int stride) {
1247 0 : tran_low_t out[32 * 32] = { 0 };
1248 0 : tran_low_t *outptr = out;
1249 : int i, j;
1250 : tran_low_t temp_in[32], temp_out[32];
1251 :
1252 : // Rows
1253 : // only upper-left 8x8 has non-zero coeff
1254 0 : for (i = 0; i < 8; ++i) {
1255 0 : aom_idct32_c(input, outptr);
1256 0 : input += 32;
1257 0 : outptr += 32;
1258 : }
1259 :
1260 : // Columns
1261 0 : for (i = 0; i < 32; ++i) {
1262 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1263 0 : aom_idct32_c(temp_in, temp_out);
1264 0 : for (j = 0; j < 32; ++j) {
1265 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1266 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
1267 : }
1268 : }
1269 0 : }
1270 :
1271 0 : void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1272 : int i, j;
1273 : tran_high_t a1;
1274 :
1275 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1276 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1277 0 : a1 = ROUND_POWER_OF_TWO(out, 6);
1278 0 : if (a1 == 0) return;
1279 :
1280 0 : for (j = 0; j < 32; ++j) {
1281 0 : for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1282 0 : dest += stride;
1283 : }
1284 : }
1285 :
1286 : #if CONFIG_HIGHBITDEPTH
1287 0 : void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1288 : int stride, int bd) {
1289 : /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1290 : 0.5 shifts per pixel. */
1291 : int i;
1292 : tran_low_t output[16];
1293 : tran_high_t a1, b1, c1, d1, e1;
1294 0 : const tran_low_t *ip = input;
1295 0 : tran_low_t *op = output;
1296 0 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1297 :
1298 0 : for (i = 0; i < 4; i++) {
1299 0 : a1 = ip[0] >> UNIT_QUANT_SHIFT;
1300 0 : c1 = ip[1] >> UNIT_QUANT_SHIFT;
1301 0 : d1 = ip[2] >> UNIT_QUANT_SHIFT;
1302 0 : b1 = ip[3] >> UNIT_QUANT_SHIFT;
1303 0 : a1 += c1;
1304 0 : d1 -= b1;
1305 0 : e1 = (a1 - d1) >> 1;
1306 0 : b1 = e1 - b1;
1307 0 : c1 = e1 - c1;
1308 0 : a1 -= b1;
1309 0 : d1 += c1;
1310 0 : op[0] = HIGHBD_WRAPLOW(a1, bd);
1311 0 : op[1] = HIGHBD_WRAPLOW(b1, bd);
1312 0 : op[2] = HIGHBD_WRAPLOW(c1, bd);
1313 0 : op[3] = HIGHBD_WRAPLOW(d1, bd);
1314 0 : ip += 4;
1315 0 : op += 4;
1316 : }
1317 :
1318 0 : ip = output;
1319 0 : for (i = 0; i < 4; i++) {
1320 0 : a1 = ip[4 * 0];
1321 0 : c1 = ip[4 * 1];
1322 0 : d1 = ip[4 * 2];
1323 0 : b1 = ip[4 * 3];
1324 0 : a1 += c1;
1325 0 : d1 -= b1;
1326 0 : e1 = (a1 - d1) >> 1;
1327 0 : b1 = e1 - b1;
1328 0 : c1 = e1 - c1;
1329 0 : a1 -= b1;
1330 0 : d1 += c1;
1331 0 : dest[stride * 0] =
1332 0 : highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1333 0 : dest[stride * 1] =
1334 0 : highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1335 0 : dest[stride * 2] =
1336 0 : highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1337 0 : dest[stride * 3] =
1338 0 : highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1339 :
1340 0 : ip++;
1341 0 : dest++;
1342 : }
1343 0 : }
1344 :
1345 0 : void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1346 : int dest_stride, int bd) {
1347 : int i;
1348 : tran_high_t a1, e1;
1349 : tran_low_t tmp[4];
1350 0 : const tran_low_t *ip = in;
1351 0 : tran_low_t *op = tmp;
1352 0 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1353 : (void)bd;
1354 :
1355 0 : a1 = ip[0] >> UNIT_QUANT_SHIFT;
1356 0 : e1 = a1 >> 1;
1357 0 : a1 -= e1;
1358 0 : op[0] = HIGHBD_WRAPLOW(a1, bd);
1359 0 : op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1360 :
1361 0 : ip = tmp;
1362 0 : for (i = 0; i < 4; i++) {
1363 0 : e1 = ip[0] >> 1;
1364 0 : a1 = ip[0] - e1;
1365 0 : dest[dest_stride * 0] =
1366 0 : highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
1367 0 : dest[dest_stride * 1] =
1368 0 : highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
1369 0 : dest[dest_stride * 2] =
1370 0 : highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
1371 0 : dest[dest_stride * 3] =
1372 0 : highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
1373 0 : ip++;
1374 0 : dest++;
1375 : }
1376 0 : }
1377 :
1378 0 : void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1379 : tran_low_t step[4];
1380 : tran_high_t temp1, temp2;
1381 : (void)bd;
1382 : // stage 1
1383 0 : temp1 = (input[0] + input[2]) * cospi_16_64;
1384 0 : temp2 = (input[0] - input[2]) * cospi_16_64;
1385 0 : step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1386 0 : step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1387 0 : temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1388 0 : temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1389 0 : step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1390 0 : step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1391 :
1392 : // stage 2
1393 0 : output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
1394 0 : output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
1395 0 : output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
1396 0 : output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
1397 0 : }
1398 :
1399 0 : void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1400 : int stride, int bd) {
1401 : tran_low_t out[4 * 4];
1402 0 : tran_low_t *outptr = out;
1403 : int i, j;
1404 : tran_low_t temp_in[4], temp_out[4];
1405 0 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1406 :
1407 : // Rows
1408 0 : for (i = 0; i < 4; ++i) {
1409 0 : aom_highbd_idct4_c(input, outptr, bd);
1410 0 : input += 4;
1411 0 : outptr += 4;
1412 : }
1413 :
1414 : // Columns
1415 0 : for (i = 0; i < 4; ++i) {
1416 0 : for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
1417 0 : aom_highbd_idct4_c(temp_in, temp_out, bd);
1418 0 : for (j = 0; j < 4; ++j) {
1419 0 : dest[j * stride + i] = highbd_clip_pixel_add(
1420 0 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1421 : }
1422 : }
1423 0 : }
1424 :
1425 0 : void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1426 : int dest_stride, int bd) {
1427 : int i;
1428 : tran_high_t a1;
1429 0 : tran_low_t out =
1430 0 : HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1431 0 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1432 :
1433 0 : out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1434 0 : a1 = ROUND_POWER_OF_TWO(out, 4);
1435 :
1436 0 : for (i = 0; i < 4; i++) {
1437 0 : dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1438 0 : dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1439 0 : dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1440 0 : dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1441 0 : dest += dest_stride;
1442 : }
1443 0 : }
1444 :
1445 0 : void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1446 : tran_low_t step1[8], step2[8];
1447 : tran_high_t temp1, temp2;
1448 : // stage 1
1449 0 : step1[0] = input[0];
1450 0 : step1[2] = input[4];
1451 0 : step1[1] = input[2];
1452 0 : step1[3] = input[6];
1453 0 : temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1454 0 : temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1455 0 : step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1456 0 : step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1457 0 : temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1458 0 : temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1459 0 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1460 0 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1461 :
1462 : // stage 2 & stage 3 - even half
1463 0 : aom_highbd_idct4_c(step1, step1, bd);
1464 :
1465 : // stage 2 - odd half
1466 0 : step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1467 0 : step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1468 0 : step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1469 0 : step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1470 :
1471 : // stage 3 - odd half
1472 0 : step1[4] = step2[4];
1473 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
1474 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
1475 0 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1476 0 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1477 0 : step1[7] = step2[7];
1478 :
1479 : // stage 4
1480 0 : output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1481 0 : output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1482 0 : output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1483 0 : output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1484 0 : output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1485 0 : output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1486 0 : output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1487 0 : output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1488 0 : }
1489 :
1490 0 : void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1491 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1492 :
1493 0 : tran_low_t x0 = input[0];
1494 0 : tran_low_t x1 = input[1];
1495 0 : tran_low_t x2 = input[2];
1496 0 : tran_low_t x3 = input[3];
1497 : (void)bd;
1498 :
1499 0 : if (!(x0 | x1 | x2 | x3)) {
1500 0 : memset(output, 0, 4 * sizeof(*output));
1501 0 : return;
1502 : }
1503 :
1504 0 : s0 = sinpi_1_9 * x0;
1505 0 : s1 = sinpi_2_9 * x0;
1506 0 : s2 = sinpi_3_9 * x1;
1507 0 : s3 = sinpi_4_9 * x2;
1508 0 : s4 = sinpi_1_9 * x2;
1509 0 : s5 = sinpi_2_9 * x3;
1510 0 : s6 = sinpi_4_9 * x3;
1511 0 : s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
1512 :
1513 0 : s0 = s0 + s3 + s5;
1514 0 : s1 = s1 - s4 - s6;
1515 0 : s3 = s2;
1516 0 : s2 = sinpi_3_9 * s7;
1517 :
1518 : // 1-D transform scaling factor is sqrt(2).
1519 : // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1520 : // + 1b (addition) = 29b.
1521 : // Hence the output bit depth is 15b.
1522 0 : output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
1523 0 : output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
1524 0 : output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1525 0 : output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
1526 : }
1527 :
1528 0 : void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1529 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1530 :
1531 0 : tran_low_t x0 = input[7];
1532 0 : tran_low_t x1 = input[0];
1533 0 : tran_low_t x2 = input[5];
1534 0 : tran_low_t x3 = input[2];
1535 0 : tran_low_t x4 = input[3];
1536 0 : tran_low_t x5 = input[4];
1537 0 : tran_low_t x6 = input[1];
1538 0 : tran_low_t x7 = input[6];
1539 : (void)bd;
1540 :
1541 0 : if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1542 0 : memset(output, 0, 8 * sizeof(*output));
1543 0 : return;
1544 : }
1545 :
1546 : // stage 1
1547 0 : s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1548 0 : s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1549 0 : s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1550 0 : s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1551 0 : s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1552 0 : s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1553 0 : s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1554 0 : s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1555 :
1556 0 : x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1557 0 : x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1558 0 : x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1559 0 : x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1560 0 : x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1561 0 : x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1562 0 : x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1563 0 : x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1564 :
1565 : // stage 2
1566 0 : s0 = x0;
1567 0 : s1 = x1;
1568 0 : s2 = x2;
1569 0 : s3 = x3;
1570 0 : s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1571 0 : s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1572 0 : s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1573 0 : s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1574 :
1575 0 : x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1576 0 : x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1577 0 : x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1578 0 : x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1579 0 : x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1580 0 : x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1581 0 : x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1582 0 : x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1583 :
1584 : // stage 3
1585 0 : s2 = cospi_16_64 * (x2 + x3);
1586 0 : s3 = cospi_16_64 * (x2 - x3);
1587 0 : s6 = cospi_16_64 * (x6 + x7);
1588 0 : s7 = cospi_16_64 * (x6 - x7);
1589 :
1590 0 : x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1591 0 : x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1592 0 : x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1593 0 : x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1594 :
1595 0 : output[0] = HIGHBD_WRAPLOW(x0, bd);
1596 0 : output[1] = HIGHBD_WRAPLOW(-x4, bd);
1597 0 : output[2] = HIGHBD_WRAPLOW(x6, bd);
1598 0 : output[3] = HIGHBD_WRAPLOW(-x2, bd);
1599 0 : output[4] = HIGHBD_WRAPLOW(x3, bd);
1600 0 : output[5] = HIGHBD_WRAPLOW(-x7, bd);
1601 0 : output[6] = HIGHBD_WRAPLOW(x5, bd);
1602 0 : output[7] = HIGHBD_WRAPLOW(-x1, bd);
1603 : }
1604 :
1605 0 : void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 16-point 1-D inverse DCT (per the name): reads input[0..15], writes output[0..15]; bd is only handed to HIGHBD_WRAPLOW.
1606 : tran_low_t step1[16], step2[16];  // two work buffers; successive stages alternate which one is read and which one is written
1607 : tran_high_t temp1, temp2;  // unrounded products/sums, always passed through dct_const_round_shift() before being stored
1608 : (void)bd;  // NOTE(review): presumably silences an unused-parameter warning in configurations where HIGHBD_WRAPLOW ignores bd -- confirm against the macro definition
1609 :
1610 : // stage 1: load the inputs in permuted order (0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); each index is spelled as k / 2
1611 0 : step1[0] = input[0 / 2];
1612 0 : step1[1] = input[16 / 2];
1613 0 : step1[2] = input[8 / 2];
1614 0 : step1[3] = input[24 / 2];
1615 0 : step1[4] = input[4 / 2];
1616 0 : step1[5] = input[20 / 2];
1617 0 : step1[6] = input[12 / 2];
1618 0 : step1[7] = input[28 / 2];
1619 0 : step1[8] = input[2 / 2];
1620 0 : step1[9] = input[18 / 2];
1621 0 : step1[10] = input[10 / 2];
1622 0 : step1[11] = input[26 / 2];
1623 0 : step1[12] = input[6 / 2];
1624 0 : step1[13] = input[22 / 2];
1625 0 : step1[14] = input[14 / 2];
1626 0 : step1[15] = input[30 / 2];
1627 :
1628 : // stage 2: elements 0-7 are copied through unchanged
1629 0 : step2[0] = step1[0];
1630 0 : step2[1] = step1[1];
1631 0 : step2[2] = step1[2];
1632 0 : step2[3] = step1[3];
1633 0 : step2[4] = step1[4];
1634 0 : step2[5] = step1[5];
1635 0 : step2[6] = step1[6];
1636 0 : step2[7] = step1[7];
1637 : // pairs (8,15), (9,14), (10,13), (11,12): each pair (a, b) becomes (a*c1 - b*c2, a*c2 + b*c1), then is rounded
1638 0 : temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1639 0 : temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1640 0 : step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1641 0 : step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1642 :
1643 0 : temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1644 0 : temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1645 0 : step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1646 0 : step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1647 :
1648 0 : temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1649 0 : temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1650 0 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1651 0 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1652 :
1653 0 : temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1654 0 : temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1655 0 : step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1656 0 : step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1657 :
1658 : // stage 3: elements 0-3 are copied through; pairs (4,7) and (5,6) get the two-constant multiply and rounding
1659 0 : step1[0] = step2[0];
1660 0 : step1[1] = step2[1];
1661 0 : step1[2] = step2[2];
1662 0 : step1[3] = step2[3];
1663 :
1664 0 : temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1665 0 : temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1666 0 : step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1667 0 : step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1668 0 : temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1669 0 : temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1670 0 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1671 0 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1672 : // elements 8-15: add/subtract butterflies on adjacent pairs (8,9), (10,11), (12,13), (14,15); no multiply, so no rounding shift
1673 0 : step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
1674 0 : step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
1675 0 : step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
1676 0 : step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
1677 0 : step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
1678 0 : step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
1679 0 : step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
1680 0 : step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
1681 :
1682 : // stage 4: (0,1) sum/difference scaled by cospi_16_64; (2,3) two-constant multiply; 4-7 add/subtract on adjacent pairs
1683 0 : temp1 = (step1[0] + step1[1]) * cospi_16_64;
1684 0 : temp2 = (step1[0] - step1[1]) * cospi_16_64;
1685 0 : step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1686 0 : step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1687 0 : temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1688 0 : temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1689 0 : step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1690 0 : step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1691 0 : step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1692 0 : step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1693 0 : step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1694 0 : step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1695 : // elements 8, 11, 12, 15 are copied through; pairs (9,14) and (10,13) get a two-constant multiply (cospi_8_64 / cospi_24_64) and rounding
1696 0 : step2[8] = step1[8];
1697 0 : step2[15] = step1[15];
1698 0 : temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1699 0 : temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1700 0 : step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1701 0 : step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1702 0 : temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1703 0 : temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1704 0 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1705 0 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1706 0 : step2[11] = step1[11];
1707 0 : step2[12] = step1[12];
1708 :
1709 : // stage 5: 0-3 add/subtract butterflies (0 with 3, 1 with 2); 4 and 7 are copied through; (5,6) difference/sum scaled by cospi_16_64
1710 0 : step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
1711 0 : step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
1712 0 : step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
1713 0 : step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
1714 0 : step1[4] = step2[4];
1715 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
1716 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
1717 0 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1718 0 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1719 0 : step1[7] = step2[7];
1720 : // elements 8-11 and 12-15: add/subtract butterflies within each group of four (outer pair and inner pair)
1721 0 : step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
1722 0 : step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
1723 0 : step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
1724 0 : step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
1725 0 : step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
1726 0 : step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
1727 0 : step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
1728 0 : step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
1729 :
1730 : // stage 6: 0-7 add/subtract butterflies (i with 7 - i); 8, 9, 14, 15 are copied through; (10,13) and (11,12) difference/sum scaled by cospi_16_64
1731 0 : step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1732 0 : step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1733 0 : step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1734 0 : step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1735 0 : step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1736 0 : step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1737 0 : step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1738 0 : step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1739 0 : step2[8] = step1[8];
1740 0 : step2[9] = step1[9];
1741 0 : temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1742 0 : temp2 = (step1[10] + step1[13]) * cospi_16_64;
1743 0 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1744 0 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1745 0 : temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1746 0 : temp2 = (step1[11] + step1[12]) * cospi_16_64;
1747 0 : step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1748 0 : step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1749 0 : step2[14] = step1[14];
1750 0 : step2[15] = step1[15];
1751 :
1752 : // stage 7: output[i] = step2[i] + step2[15 - i] and output[15 - i] = step2[i] - step2[15 - i], for i = 0..7
1753 0 : output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
1754 0 : output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
1755 0 : output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
1756 0 : output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
1757 0 : output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
1758 0 : output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
1759 0 : output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
1760 0 : output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
1761 0 : output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
1762 0 : output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
1763 0 : output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
1764 0 : output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
1765 0 : output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
1766 0 : output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
1767 0 : output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
1768 0 : output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
1769 0 : }
1770 :
1771 0 : void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 16-point 1-D inverse ADST (per the name): reads input[0..15], writes output[0..15]; bd is only handed to HIGHBD_WRAPLOW.
1772 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;  // s0..s15: per-stage intermediates, recomputed from x0..x15 in each stage
1773 : tran_high_t s9, s10, s11, s12, s13, s14, s15;
1774 : // x0..x15: working values, loaded interleaved from the input: x[2j] = input[15 - 2j], x[2j + 1] = input[2j]
1775 0 : tran_low_t x0 = input[15];
1776 0 : tran_low_t x1 = input[0];
1777 0 : tran_low_t x2 = input[13];
1778 0 : tran_low_t x3 = input[2];
1779 0 : tran_low_t x4 = input[11];
1780 0 : tran_low_t x5 = input[4];
1781 0 : tran_low_t x6 = input[9];
1782 0 : tran_low_t x7 = input[6];
1783 0 : tran_low_t x8 = input[7];
1784 0 : tran_low_t x9 = input[8];
1785 0 : tran_low_t x10 = input[5];
1786 0 : tran_low_t x11 = input[10];
1787 0 : tran_low_t x12 = input[3];
1788 0 : tran_low_t x13 = input[12];
1789 0 : tran_low_t x14 = input[1];
1790 0 : tran_low_t x15 = input[14];
1791 : (void)bd;  // NOTE(review): presumably silences an unused-parameter warning in configurations where HIGHBD_WRAPLOW ignores bd -- confirm against the macro definition
1792 : // All-zero shortcut: the bitwise OR of the 16 inputs is 0 only when every input is 0; the 16 outputs are then zeroed and the stages are skipped.
1793 0 : if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1794 0 : x13 | x14 | x15)) {
1795 0 : memset(output, 0, 16 * sizeof(*output));
1796 0 : return;
1797 : }
1798 :
1799 : // stage 1: two-constant multiply on each pair (x0,x1), (x2,x3), ..., (x14,x15), using the odd-numbered cospi constants
1800 0 : s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1801 0 : s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1802 0 : s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1803 0 : s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1804 0 : s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1805 0 : s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1806 0 : s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1807 0 : s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1808 0 : s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1809 0 : s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1810 0 : s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1811 0 : s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1812 0 : s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1813 0 : s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1814 0 : s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1815 0 : s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1816 : // x[k] = rounded (s[k] + s[k + 8]) and x[k + 8] = rounded (s[k] - s[k + 8]), for k = 0..7
1817 0 : x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
1818 0 : x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
1819 0 : x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
1820 0 : x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
1821 0 : x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
1822 0 : x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
1823 0 : x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
1824 0 : x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
1825 0 : x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
1826 0 : x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
1827 0 : x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
1828 0 : x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
1829 0 : x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
1830 0 : x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
1831 0 : x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
1832 0 : x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
1833 :
1834 : // stage 2: s0..s7 copy x0..x7; x8..x15 get two-constant multiplies (cospi_4_64 / cospi_28_64 and cospi_12_64 / cospi_20_64)
1835 0 : s0 = x0;
1836 0 : s1 = x1;
1837 0 : s2 = x2;
1838 0 : s3 = x3;
1839 0 : s4 = x4;
1840 0 : s5 = x5;
1841 0 : s6 = x6;
1842 0 : s7 = x7;
1843 0 : s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1844 0 : s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1845 0 : s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1846 0 : s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1847 0 : s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1848 0 : s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1849 0 : s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1850 0 : s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1851 : // combine s[k] with s[k + 4]: x0..x7 are plain sums/differences (nothing was multiplied, so no rounding shift); x8..x15 are rounded
1852 0 : x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
1853 0 : x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
1854 0 : x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
1855 0 : x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
1856 0 : x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
1857 0 : x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
1858 0 : x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
1859 0 : x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
1860 0 : x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
1861 0 : x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
1862 0 : x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
1863 0 : x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
1864 0 : x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
1865 0 : x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
1866 0 : x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
1867 0 : x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
1868 :
1869 : // stage 3: s0..s3 and s8..s11 copy x; x4..x7 and x12..x15 get two-constant multiplies (cospi_8_64 / cospi_24_64)
1870 0 : s0 = x0;
1871 0 : s1 = x1;
1872 0 : s2 = x2;
1873 0 : s3 = x3;
1874 0 : s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1875 0 : s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1876 0 : s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1877 0 : s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1878 0 : s8 = x8;
1879 0 : s9 = x9;
1880 0 : s10 = x10;
1881 0 : s11 = x11;
1882 0 : s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1883 0 : s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1884 0 : s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
1885 0 : s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1886 : // within each group of four, combine s[k] with s[k + 2]; only the multiplied groups (4-7 and 12-15) go through the rounding shift
1887 0 : x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1888 0 : x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1889 0 : x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1890 0 : x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1891 0 : x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1892 0 : x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1893 0 : x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1894 0 : x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1895 0 : x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
1896 0 : x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
1897 0 : x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
1898 0 : x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
1899 0 : x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
1900 0 : x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
1901 0 : x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
1902 0 : x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
1903 :
1904 : // stage 4: scale the sum and difference of pairs (2,3), (6,7), (10,11), (14,15) by +/-cospi_16_64; the signs differ per pair
1905 0 : s2 = (-cospi_16_64) * (x2 + x3);
1906 0 : s3 = cospi_16_64 * (x2 - x3);
1907 0 : s6 = cospi_16_64 * (x6 + x7);
1908 0 : s7 = cospi_16_64 * (-x6 + x7);
1909 0 : s10 = cospi_16_64 * (x10 + x11);
1910 0 : s11 = cospi_16_64 * (-x10 + x11);
1911 0 : s14 = (-cospi_16_64) * (x14 + x15);
1912 0 : s15 = cospi_16_64 * (x14 - x15);
1913 : // round the eight scaled values; x0, x1, x4, x5, x8, x9, x12, x13 keep their stage 3 values
1914 0 : x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1915 0 : x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1916 0 : x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1917 0 : x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1918 0 : x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
1919 0 : x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
1920 0 : x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
1921 0 : x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
1922 : // final permutation into output order; x8, x4, x13 and x1 are negated on the way out
1923 0 : output[0] = HIGHBD_WRAPLOW(x0, bd);
1924 0 : output[1] = HIGHBD_WRAPLOW(-x8, bd);
1925 0 : output[2] = HIGHBD_WRAPLOW(x12, bd);
1926 0 : output[3] = HIGHBD_WRAPLOW(-x4, bd);
1927 0 : output[4] = HIGHBD_WRAPLOW(x6, bd);
1928 0 : output[5] = HIGHBD_WRAPLOW(x14, bd);
1929 0 : output[6] = HIGHBD_WRAPLOW(x10, bd);
1930 0 : output[7] = HIGHBD_WRAPLOW(x2, bd);
1931 0 : output[8] = HIGHBD_WRAPLOW(x3, bd);
1932 0 : output[9] = HIGHBD_WRAPLOW(x11, bd);
1933 0 : output[10] = HIGHBD_WRAPLOW(x15, bd);
1934 0 : output[11] = HIGHBD_WRAPLOW(x7, bd);
1935 0 : output[12] = HIGHBD_WRAPLOW(x5, bd);
1936 0 : output[13] = HIGHBD_WRAPLOW(-x13, bd);
1937 0 : output[14] = HIGHBD_WRAPLOW(x9, bd);
1938 0 : output[15] = HIGHBD_WRAPLOW(-x1, bd);
1939 : }
1940 :
1941 0 : void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) {
1942 : tran_low_t step1[32], step2[32];
1943 : tran_high_t temp1, temp2;
1944 : (void)bd;
1945 :
1946 : // stage 1
1947 0 : step1[0] = input[0];
1948 0 : step1[1] = input[16];
1949 0 : step1[2] = input[8];
1950 0 : step1[3] = input[24];
1951 0 : step1[4] = input[4];
1952 0 : step1[5] = input[20];
1953 0 : step1[6] = input[12];
1954 0 : step1[7] = input[28];
1955 0 : step1[8] = input[2];
1956 0 : step1[9] = input[18];
1957 0 : step1[10] = input[10];
1958 0 : step1[11] = input[26];
1959 0 : step1[12] = input[6];
1960 0 : step1[13] = input[22];
1961 0 : step1[14] = input[14];
1962 0 : step1[15] = input[30];
1963 :
1964 0 : temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
1965 0 : temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
1966 0 : step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1967 0 : step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1968 :
1969 0 : temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
1970 0 : temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
1971 0 : step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1972 0 : step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1973 :
1974 0 : temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
1975 0 : temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
1976 0 : step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1977 0 : step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1978 :
1979 0 : temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
1980 0 : temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
1981 0 : step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1982 0 : step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1983 :
1984 0 : temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
1985 0 : temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
1986 0 : step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1987 0 : step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1988 :
1989 0 : temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
1990 0 : temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
1991 0 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1992 0 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1993 :
1994 0 : temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
1995 0 : temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
1996 0 : step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1997 0 : step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1998 :
1999 0 : temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2000 0 : temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2001 0 : step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2002 0 : step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2003 :
2004 : // stage 2
2005 0 : step2[0] = step1[0];
2006 0 : step2[1] = step1[1];
2007 0 : step2[2] = step1[2];
2008 0 : step2[3] = step1[3];
2009 0 : step2[4] = step1[4];
2010 0 : step2[5] = step1[5];
2011 0 : step2[6] = step1[6];
2012 0 : step2[7] = step1[7];
2013 :
2014 0 : temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2015 0 : temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2016 0 : step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2017 0 : step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2018 :
2019 0 : temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2020 0 : temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2021 0 : step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2022 0 : step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2023 :
2024 0 : temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2025 0 : temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2026 0 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2027 0 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2028 :
2029 0 : temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2030 0 : temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2031 0 : step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2032 0 : step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2033 :
2034 0 : step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2035 0 : step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2036 0 : step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2037 0 : step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2038 0 : step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2039 0 : step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2040 0 : step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2041 0 : step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2042 0 : step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2043 0 : step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2044 0 : step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2045 0 : step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2046 0 : step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2047 0 : step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2048 0 : step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2049 0 : step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2050 :
2051 : // stage 3
2052 0 : step1[0] = step2[0];
2053 0 : step1[1] = step2[1];
2054 0 : step1[2] = step2[2];
2055 0 : step1[3] = step2[3];
2056 :
2057 0 : temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2058 0 : temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2059 0 : step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2060 0 : step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2061 0 : temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2062 0 : temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2063 0 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2064 0 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2065 :
2066 0 : step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2067 0 : step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2068 0 : step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2069 0 : step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2070 0 : step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2071 0 : step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2072 0 : step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2073 0 : step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2074 :
2075 0 : step1[16] = step2[16];
2076 0 : step1[31] = step2[31];
2077 0 : temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2078 0 : temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2079 0 : step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2080 0 : step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2081 0 : temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2082 0 : temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2083 0 : step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2084 0 : step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2085 0 : step1[19] = step2[19];
2086 0 : step1[20] = step2[20];
2087 0 : temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2088 0 : temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2089 0 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2090 0 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2091 0 : temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2092 0 : temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2093 0 : step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2094 0 : step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2095 0 : step1[23] = step2[23];
2096 0 : step1[24] = step2[24];
2097 0 : step1[27] = step2[27];
2098 0 : step1[28] = step2[28];
2099 :
2100 : // stage 4
2101 0 : temp1 = (step1[0] + step1[1]) * cospi_16_64;
2102 0 : temp2 = (step1[0] - step1[1]) * cospi_16_64;
2103 0 : step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2104 0 : step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2105 0 : temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2106 0 : temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2107 0 : step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2108 0 : step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2109 0 : step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2110 0 : step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2111 0 : step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2112 0 : step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2113 :
2114 0 : step2[8] = step1[8];
2115 0 : step2[15] = step1[15];
2116 0 : temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2117 0 : temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2118 0 : step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2119 0 : step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2120 0 : temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2121 0 : temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2122 0 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2123 0 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2124 0 : step2[11] = step1[11];
2125 0 : step2[12] = step1[12];
2126 :
2127 0 : step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2128 0 : step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2129 0 : step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2130 0 : step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2131 0 : step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2132 0 : step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2133 0 : step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2134 0 : step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2135 :
2136 0 : step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2137 0 : step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2138 0 : step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2139 0 : step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2140 0 : step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2141 0 : step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2142 0 : step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2143 0 : step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2144 :
2145 : // stage 5
2146 0 : step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2147 0 : step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2148 0 : step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2149 0 : step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2150 0 : step1[4] = step2[4];
2151 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
2152 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
2153 0 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2154 0 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2155 0 : step1[7] = step2[7];
2156 :
2157 0 : step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2158 0 : step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2159 0 : step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2160 0 : step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2161 0 : step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2162 0 : step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2163 0 : step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2164 0 : step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2165 :
2166 0 : step1[16] = step2[16];
2167 0 : step1[17] = step2[17];
2168 0 : temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2169 0 : temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2170 0 : step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2171 0 : step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2172 0 : temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2173 0 : temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2174 0 : step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2175 0 : step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2176 0 : temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2177 0 : temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2178 0 : step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2179 0 : step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2180 0 : temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2181 0 : temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2182 0 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2183 0 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2184 0 : step1[22] = step2[22];
2185 0 : step1[23] = step2[23];
2186 0 : step1[24] = step2[24];
2187 0 : step1[25] = step2[25];
2188 0 : step1[30] = step2[30];
2189 0 : step1[31] = step2[31];
2190 :
2191 : // stage 6
2192 0 : step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2193 0 : step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2194 0 : step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2195 0 : step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2196 0 : step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2197 0 : step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2198 0 : step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2199 0 : step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2200 0 : step2[8] = step1[8];
2201 0 : step2[9] = step1[9];
2202 0 : temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2203 0 : temp2 = (step1[10] + step1[13]) * cospi_16_64;
2204 0 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2205 0 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2206 0 : temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2207 0 : temp2 = (step1[11] + step1[12]) * cospi_16_64;
2208 0 : step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2209 0 : step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2210 0 : step2[14] = step1[14];
2211 0 : step2[15] = step1[15];
2212 :
2213 0 : step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2214 0 : step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2215 0 : step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2216 0 : step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2217 0 : step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2218 0 : step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2219 0 : step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2220 0 : step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2221 :
2222 0 : step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2223 0 : step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2224 0 : step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2225 0 : step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2226 0 : step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2227 0 : step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2228 0 : step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2229 0 : step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2230 :
2231 : // stage 7
2232 0 : step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2233 0 : step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2234 0 : step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2235 0 : step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2236 0 : step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2237 0 : step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2238 0 : step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2239 0 : step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2240 0 : step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2241 0 : step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2242 0 : step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2243 0 : step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2244 0 : step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2245 0 : step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2246 0 : step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2247 0 : step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2248 :
2249 0 : step1[16] = step2[16];
2250 0 : step1[17] = step2[17];
2251 0 : step1[18] = step2[18];
2252 0 : step1[19] = step2[19];
2253 0 : temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2254 0 : temp2 = (step2[20] + step2[27]) * cospi_16_64;
2255 0 : step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2256 0 : step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2257 0 : temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2258 0 : temp2 = (step2[21] + step2[26]) * cospi_16_64;
2259 0 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2260 0 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2261 0 : temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2262 0 : temp2 = (step2[22] + step2[25]) * cospi_16_64;
2263 0 : step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2264 0 : step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2265 0 : temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2266 0 : temp2 = (step2[23] + step2[24]) * cospi_16_64;
2267 0 : step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2268 0 : step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2269 0 : step1[28] = step2[28];
2270 0 : step1[29] = step2[29];
2271 0 : step1[30] = step2[30];
2272 0 : step1[31] = step2[31];
2273 :
2274 : // final stage
2275 0 : output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2276 0 : output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2277 0 : output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2278 0 : output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2279 0 : output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2280 0 : output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2281 0 : output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2282 0 : output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2283 0 : output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2284 0 : output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2285 0 : output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2286 0 : output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2287 0 : output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2288 0 : output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2289 0 : output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2290 0 : output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2291 0 : output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2292 0 : output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2293 0 : output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2294 0 : output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2295 0 : output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2296 0 : output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2297 0 : output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2298 0 : output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2299 0 : output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2300 0 : output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2301 0 : output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2302 0 : output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2303 0 : output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2304 0 : output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2305 0 : output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2306 0 : output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2307 0 : }
2308 :
2309 : #endif // CONFIG_HIGHBITDEPTH
|