Line data Source code
1 : /*
2 : * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <math.h>
12 : #include <stdlib.h>
13 : #include <string.h>
14 :
15 : #include "./vpx_dsp_rtcd.h"
16 : #include "vpx_dsp/inv_txfm.h"
17 :
// Inverse 4x4 Walsh-Hadamard transform of all 16 coefficients; the result
// is added to the 4x4 pixel block at `dest` (row pitch `stride`) with
// clipping to the valid pixel range.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // intermediate result of the row pass
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Row pass: 1-D inverse WHT on each of the 4 rows. Coefficients are
  // pre-scaled, so undo that with UNIT_QUANT_SHIFT first.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    // Lifting-style butterfly; the statement order is essential for the
    // transform to remain reversible.
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same butterfly down each column (no quant shift here),
  // then add the residual into the destination pixels.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}
69 :
70 0 : void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
71 : int i;
72 : tran_high_t a1, e1;
73 : tran_low_t tmp[4];
74 0 : const tran_low_t *ip = in;
75 0 : tran_low_t *op = tmp;
76 :
77 0 : a1 = ip[0] >> UNIT_QUANT_SHIFT;
78 0 : e1 = a1 >> 1;
79 0 : a1 -= e1;
80 0 : op[0] = WRAPLOW(a1);
81 0 : op[1] = op[2] = op[3] = WRAPLOW(e1);
82 :
83 0 : ip = tmp;
84 0 : for (i = 0; i < 4; i++) {
85 0 : e1 = ip[0] >> 1;
86 0 : a1 = ip[0] - e1;
87 0 : dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
88 0 : dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
89 0 : dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
90 0 : dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
91 0 : ip++;
92 0 : dest++;
93 : }
94 0 : }
95 :
// 1-D 4-point inverse DCT on `input`, writing 4 values to `output`.
// Fixed-point butterfly using the cospi_* constants; each multiply is
// renormalized with dct_const_round_shift and wrapped to tran_low_t range.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;

  // stage 1: even part (inputs 0,2) and odd part (inputs 1,3) rotations.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
116 :
117 0 : void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
118 : int i, j;
119 : tran_low_t out[4 * 4];
120 0 : tran_low_t *outptr = out;
121 : tran_low_t temp_in[4], temp_out[4];
122 :
123 : // Rows
124 0 : for (i = 0; i < 4; ++i) {
125 0 : idct4_c(input, outptr);
126 0 : input += 4;
127 0 : outptr += 4;
128 : }
129 :
130 : // Columns
131 0 : for (i = 0; i < 4; ++i) {
132 0 : for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
133 0 : idct4_c(temp_in, temp_out);
134 0 : for (j = 0; j < 4; ++j) {
135 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
136 0 : ROUND_POWER_OF_TWO(temp_out[j], 4));
137 : }
138 : }
139 0 : }
140 :
141 0 : void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
142 : int i;
143 : tran_high_t a1;
144 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
145 :
146 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
147 0 : a1 = ROUND_POWER_OF_TWO(out, 4);
148 :
149 0 : for (i = 0; i < 4; i++) {
150 0 : dest[0] = clip_pixel_add(dest[0], a1);
151 0 : dest[1] = clip_pixel_add(dest[1], a1);
152 0 : dest[2] = clip_pixel_add(dest[2], a1);
153 0 : dest[3] = clip_pixel_add(dest[3], a1);
154 0 : dest += stride;
155 : }
156 0 : }
157 :
// 1-D 8-point inverse DCT on `input`, writing 8 values to `output`.
// Four-stage fixed-point butterfly network; multiplies are renormalized
// with dct_const_round_shift and wrapped to tran_low_t range.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: reorder even inputs; rotate the odd inputs (1,7) and (5,3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: 4-point even part; butterfly the odd part.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: combine the even part; rotate the middle odd pair by pi/4.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly merging even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
212 :
213 0 : void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
214 : int i, j;
215 : tran_low_t out[8 * 8];
216 0 : tran_low_t *outptr = out;
217 : tran_low_t temp_in[8], temp_out[8];
218 :
219 : // First transform rows
220 0 : for (i = 0; i < 8; ++i) {
221 0 : idct8_c(input, outptr);
222 0 : input += 8;
223 0 : outptr += 8;
224 : }
225 :
226 : // Then transform columns
227 0 : for (i = 0; i < 8; ++i) {
228 0 : for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
229 0 : idct8_c(temp_in, temp_out);
230 0 : for (j = 0; j < 8; ++j) {
231 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
232 0 : ROUND_POWER_OF_TWO(temp_out[j], 5));
233 : }
234 : }
235 0 : }
236 :
237 0 : void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
238 : int i, j;
239 : tran_high_t a1;
240 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
241 :
242 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
243 0 : a1 = ROUND_POWER_OF_TWO(out, 5);
244 0 : for (j = 0; j < 8; ++j) {
245 0 : for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
246 0 : dest += stride;
247 : }
248 0 : }
249 :
// 1-D 4-point inverse ADST on `input`, writing 4 values to `output`.
// Uses the sinpi_* fixed-point constants; short-circuits to zeros when
// all inputs are zero.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
285 :
// 1-D 8-point inverse ADST on `input`, writing 8 values to `output`.
// Three-stage fixed-point network; inputs are consumed in a permuted
// order and several outputs are negated per the transform definition.
// Stage products are truncated to int (the casts below) before the
// rounded shift.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  // Permuted input order required by this ADST formulation.
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: four rotations by the odd cospi constants.
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: pass the first half through; rotate the second half by pi/8.
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: scale four intermediate pairs by cospi_16_64 (cos(pi/4)).
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output permutation with sign flips on odd positions.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
360 :
361 0 : void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
362 : int i, j;
363 0 : tran_low_t out[8 * 8] = { 0 };
364 0 : tran_low_t *outptr = out;
365 : tran_low_t temp_in[8], temp_out[8];
366 :
367 : // First transform rows
368 : // Only first 4 row has non-zero coefs
369 0 : for (i = 0; i < 4; ++i) {
370 0 : idct8_c(input, outptr);
371 0 : input += 8;
372 0 : outptr += 8;
373 : }
374 :
375 : // Then transform columns
376 0 : for (i = 0; i < 8; ++i) {
377 0 : for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
378 0 : idct8_c(temp_in, temp_out);
379 0 : for (j = 0; j < 8; ++j) {
380 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
381 0 : ROUND_POWER_OF_TWO(temp_out[j], 5));
382 : }
383 : }
384 0 : }
385 :
// 1-D 16-point inverse DCT on `input`, writing 16 values to `output`.
// Seven-stage fixed-point butterfly network; multiplies are renormalized
// with dct_const_round_shift and wrapped to tran_low_t range.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: bit-reversal-style input reordering (indices written as
  // halved even offsets to mirror the 32-point layout).
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: even half passes through; odd half gets four rotations.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotate (4,7) and (5,6); butterfly the 8..15 half.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: 4-point even core; butterflies and rotations elsewhere.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: combine even core; pi/4 rotation on (5,6); butterflies on
  // the 8..15 half.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6: 8-point even half complete; pi/4 rotations on (10,13) and
  // (11,12).
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly merging the two halves.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
550 :
551 0 : void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
552 : int stride) {
553 : int i, j;
554 : tran_low_t out[16 * 16];
555 0 : tran_low_t *outptr = out;
556 : tran_low_t temp_in[16], temp_out[16];
557 :
558 : // First transform rows
559 0 : for (i = 0; i < 16; ++i) {
560 0 : idct16_c(input, outptr);
561 0 : input += 16;
562 0 : outptr += 16;
563 : }
564 :
565 : // Then transform columns
566 0 : for (i = 0; i < 16; ++i) {
567 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
568 0 : idct16_c(temp_in, temp_out);
569 0 : for (j = 0; j < 16; ++j) {
570 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
571 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
572 : }
573 : }
574 0 : }
575 :
// 1-D 16-point inverse ADST on `input`, writing 16 values to `output`.
// Four-stage fixed-point network; inputs are consumed in a permuted order
// and several outputs are negated per the transform definition.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Permuted input order required by this ADST formulation.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: eight rotations by the odd cospi constants.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: pass the first half through; rotate the second half.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: pi/8 rotations on the quarters that still need mixing.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: pi/4 scalings (note the negated constants on s2 and s14).
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Final output permutation with sign flips on selected positions.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
743 :
744 0 : void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
745 : int stride) {
746 : int i, j;
747 0 : tran_low_t out[16 * 16] = { 0 };
748 0 : tran_low_t *outptr = out;
749 : tran_low_t temp_in[16], temp_out[16];
750 :
751 : // First transform rows. Since all non-zero dct coefficients are in
752 : // upper-left 4x4 area, we only need to calculate first 4 rows here.
753 0 : for (i = 0; i < 4; ++i) {
754 0 : idct16_c(input, outptr);
755 0 : input += 16;
756 0 : outptr += 16;
757 : }
758 :
759 : // Then transform columns
760 0 : for (i = 0; i < 16; ++i) {
761 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
762 0 : idct16_c(temp_in, temp_out);
763 0 : for (j = 0; j < 16; ++j) {
764 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
765 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
766 : }
767 : }
768 0 : }
769 :
770 0 : void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
771 : int i, j;
772 : tran_high_t a1;
773 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
774 :
775 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
776 0 : a1 = ROUND_POWER_OF_TWO(out, 6);
777 0 : for (j = 0; j < 16; ++j) {
778 0 : for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
779 0 : dest += stride;
780 : }
781 0 : }
782 :
783 0 : void idct32_c(const tran_low_t *input, tran_low_t *output) {
784 : tran_low_t step1[32], step2[32];
785 : tran_high_t temp1, temp2;
786 :
787 : // stage 1
788 0 : step1[0] = input[0];
789 0 : step1[1] = input[16];
790 0 : step1[2] = input[8];
791 0 : step1[3] = input[24];
792 0 : step1[4] = input[4];
793 0 : step1[5] = input[20];
794 0 : step1[6] = input[12];
795 0 : step1[7] = input[28];
796 0 : step1[8] = input[2];
797 0 : step1[9] = input[18];
798 0 : step1[10] = input[10];
799 0 : step1[11] = input[26];
800 0 : step1[12] = input[6];
801 0 : step1[13] = input[22];
802 0 : step1[14] = input[14];
803 0 : step1[15] = input[30];
804 :
805 0 : temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
806 0 : temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
807 0 : step1[16] = WRAPLOW(dct_const_round_shift(temp1));
808 0 : step1[31] = WRAPLOW(dct_const_round_shift(temp2));
809 :
810 0 : temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
811 0 : temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
812 0 : step1[17] = WRAPLOW(dct_const_round_shift(temp1));
813 0 : step1[30] = WRAPLOW(dct_const_round_shift(temp2));
814 :
815 0 : temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
816 0 : temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
817 0 : step1[18] = WRAPLOW(dct_const_round_shift(temp1));
818 0 : step1[29] = WRAPLOW(dct_const_round_shift(temp2));
819 :
820 0 : temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
821 0 : temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
822 0 : step1[19] = WRAPLOW(dct_const_round_shift(temp1));
823 0 : step1[28] = WRAPLOW(dct_const_round_shift(temp2));
824 :
825 0 : temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
826 0 : temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
827 0 : step1[20] = WRAPLOW(dct_const_round_shift(temp1));
828 0 : step1[27] = WRAPLOW(dct_const_round_shift(temp2));
829 :
830 0 : temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
831 0 : temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
832 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
833 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
834 :
835 0 : temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
836 0 : temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
837 0 : step1[22] = WRAPLOW(dct_const_round_shift(temp1));
838 0 : step1[25] = WRAPLOW(dct_const_round_shift(temp2));
839 :
840 0 : temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
841 0 : temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
842 0 : step1[23] = WRAPLOW(dct_const_round_shift(temp1));
843 0 : step1[24] = WRAPLOW(dct_const_round_shift(temp2));
844 :
845 : // stage 2
846 0 : step2[0] = step1[0];
847 0 : step2[1] = step1[1];
848 0 : step2[2] = step1[2];
849 0 : step2[3] = step1[3];
850 0 : step2[4] = step1[4];
851 0 : step2[5] = step1[5];
852 0 : step2[6] = step1[6];
853 0 : step2[7] = step1[7];
854 :
855 0 : temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
856 0 : temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
857 0 : step2[8] = WRAPLOW(dct_const_round_shift(temp1));
858 0 : step2[15] = WRAPLOW(dct_const_round_shift(temp2));
859 :
860 0 : temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
861 0 : temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
862 0 : step2[9] = WRAPLOW(dct_const_round_shift(temp1));
863 0 : step2[14] = WRAPLOW(dct_const_round_shift(temp2));
864 :
865 0 : temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
866 0 : temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
867 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
868 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
869 :
870 0 : temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
871 0 : temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
872 0 : step2[11] = WRAPLOW(dct_const_round_shift(temp1));
873 0 : step2[12] = WRAPLOW(dct_const_round_shift(temp2));
874 :
875 0 : step2[16] = WRAPLOW(step1[16] + step1[17]);
876 0 : step2[17] = WRAPLOW(step1[16] - step1[17]);
877 0 : step2[18] = WRAPLOW(-step1[18] + step1[19]);
878 0 : step2[19] = WRAPLOW(step1[18] + step1[19]);
879 0 : step2[20] = WRAPLOW(step1[20] + step1[21]);
880 0 : step2[21] = WRAPLOW(step1[20] - step1[21]);
881 0 : step2[22] = WRAPLOW(-step1[22] + step1[23]);
882 0 : step2[23] = WRAPLOW(step1[22] + step1[23]);
883 0 : step2[24] = WRAPLOW(step1[24] + step1[25]);
884 0 : step2[25] = WRAPLOW(step1[24] - step1[25]);
885 0 : step2[26] = WRAPLOW(-step1[26] + step1[27]);
886 0 : step2[27] = WRAPLOW(step1[26] + step1[27]);
887 0 : step2[28] = WRAPLOW(step1[28] + step1[29]);
888 0 : step2[29] = WRAPLOW(step1[28] - step1[29]);
889 0 : step2[30] = WRAPLOW(-step1[30] + step1[31]);
890 0 : step2[31] = WRAPLOW(step1[30] + step1[31]);
891 :
892 : // stage 3
893 0 : step1[0] = step2[0];
894 0 : step1[1] = step2[1];
895 0 : step1[2] = step2[2];
896 0 : step1[3] = step2[3];
897 :
898 0 : temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
899 0 : temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
900 0 : step1[4] = WRAPLOW(dct_const_round_shift(temp1));
901 0 : step1[7] = WRAPLOW(dct_const_round_shift(temp2));
902 0 : temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
903 0 : temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
904 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
905 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
906 :
907 0 : step1[8] = WRAPLOW(step2[8] + step2[9]);
908 0 : step1[9] = WRAPLOW(step2[8] - step2[9]);
909 0 : step1[10] = WRAPLOW(-step2[10] + step2[11]);
910 0 : step1[11] = WRAPLOW(step2[10] + step2[11]);
911 0 : step1[12] = WRAPLOW(step2[12] + step2[13]);
912 0 : step1[13] = WRAPLOW(step2[12] - step2[13]);
913 0 : step1[14] = WRAPLOW(-step2[14] + step2[15]);
914 0 : step1[15] = WRAPLOW(step2[14] + step2[15]);
915 :
916 0 : step1[16] = step2[16];
917 0 : step1[31] = step2[31];
918 0 : temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
919 0 : temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
920 0 : step1[17] = WRAPLOW(dct_const_round_shift(temp1));
921 0 : step1[30] = WRAPLOW(dct_const_round_shift(temp2));
922 0 : temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
923 0 : temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
924 0 : step1[18] = WRAPLOW(dct_const_round_shift(temp1));
925 0 : step1[29] = WRAPLOW(dct_const_round_shift(temp2));
926 0 : step1[19] = step2[19];
927 0 : step1[20] = step2[20];
928 0 : temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
929 0 : temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
930 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
931 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
932 0 : temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
933 0 : temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
934 0 : step1[22] = WRAPLOW(dct_const_round_shift(temp1));
935 0 : step1[25] = WRAPLOW(dct_const_round_shift(temp2));
936 0 : step1[23] = step2[23];
937 0 : step1[24] = step2[24];
938 0 : step1[27] = step2[27];
939 0 : step1[28] = step2[28];
940 :
941 : // stage 4
942 0 : temp1 = (step1[0] + step1[1]) * cospi_16_64;
943 0 : temp2 = (step1[0] - step1[1]) * cospi_16_64;
944 0 : step2[0] = WRAPLOW(dct_const_round_shift(temp1));
945 0 : step2[1] = WRAPLOW(dct_const_round_shift(temp2));
946 0 : temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
947 0 : temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
948 0 : step2[2] = WRAPLOW(dct_const_round_shift(temp1));
949 0 : step2[3] = WRAPLOW(dct_const_round_shift(temp2));
950 0 : step2[4] = WRAPLOW(step1[4] + step1[5]);
951 0 : step2[5] = WRAPLOW(step1[4] - step1[5]);
952 0 : step2[6] = WRAPLOW(-step1[6] + step1[7]);
953 0 : step2[7] = WRAPLOW(step1[6] + step1[7]);
954 :
955 0 : step2[8] = step1[8];
956 0 : step2[15] = step1[15];
957 0 : temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
958 0 : temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
959 0 : step2[9] = WRAPLOW(dct_const_round_shift(temp1));
960 0 : step2[14] = WRAPLOW(dct_const_round_shift(temp2));
961 0 : temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
962 0 : temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
963 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
964 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
965 0 : step2[11] = step1[11];
966 0 : step2[12] = step1[12];
967 :
968 0 : step2[16] = WRAPLOW(step1[16] + step1[19]);
969 0 : step2[17] = WRAPLOW(step1[17] + step1[18]);
970 0 : step2[18] = WRAPLOW(step1[17] - step1[18]);
971 0 : step2[19] = WRAPLOW(step1[16] - step1[19]);
972 0 : step2[20] = WRAPLOW(-step1[20] + step1[23]);
973 0 : step2[21] = WRAPLOW(-step1[21] + step1[22]);
974 0 : step2[22] = WRAPLOW(step1[21] + step1[22]);
975 0 : step2[23] = WRAPLOW(step1[20] + step1[23]);
976 :
977 0 : step2[24] = WRAPLOW(step1[24] + step1[27]);
978 0 : step2[25] = WRAPLOW(step1[25] + step1[26]);
979 0 : step2[26] = WRAPLOW(step1[25] - step1[26]);
980 0 : step2[27] = WRAPLOW(step1[24] - step1[27]);
981 0 : step2[28] = WRAPLOW(-step1[28] + step1[31]);
982 0 : step2[29] = WRAPLOW(-step1[29] + step1[30]);
983 0 : step2[30] = WRAPLOW(step1[29] + step1[30]);
984 0 : step2[31] = WRAPLOW(step1[28] + step1[31]);
985 :
986 : // stage 5
987 0 : step1[0] = WRAPLOW(step2[0] + step2[3]);
988 0 : step1[1] = WRAPLOW(step2[1] + step2[2]);
989 0 : step1[2] = WRAPLOW(step2[1] - step2[2]);
990 0 : step1[3] = WRAPLOW(step2[0] - step2[3]);
991 0 : step1[4] = step2[4];
992 0 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
993 0 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
994 0 : step1[5] = WRAPLOW(dct_const_round_shift(temp1));
995 0 : step1[6] = WRAPLOW(dct_const_round_shift(temp2));
996 0 : step1[7] = step2[7];
997 :
998 0 : step1[8] = WRAPLOW(step2[8] + step2[11]);
999 0 : step1[9] = WRAPLOW(step2[9] + step2[10]);
1000 0 : step1[10] = WRAPLOW(step2[9] - step2[10]);
1001 0 : step1[11] = WRAPLOW(step2[8] - step2[11]);
1002 0 : step1[12] = WRAPLOW(-step2[12] + step2[15]);
1003 0 : step1[13] = WRAPLOW(-step2[13] + step2[14]);
1004 0 : step1[14] = WRAPLOW(step2[13] + step2[14]);
1005 0 : step1[15] = WRAPLOW(step2[12] + step2[15]);
1006 :
1007 0 : step1[16] = step2[16];
1008 0 : step1[17] = step2[17];
1009 0 : temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1010 0 : temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1011 0 : step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1012 0 : step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1013 0 : temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1014 0 : temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1015 0 : step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1016 0 : step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1017 0 : temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1018 0 : temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1019 0 : step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1020 0 : step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1021 0 : temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1022 0 : temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1023 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1024 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1025 0 : step1[22] = step2[22];
1026 0 : step1[23] = step2[23];
1027 0 : step1[24] = step2[24];
1028 0 : step1[25] = step2[25];
1029 0 : step1[30] = step2[30];
1030 0 : step1[31] = step2[31];
1031 :
1032 : // stage 6
1033 0 : step2[0] = WRAPLOW(step1[0] + step1[7]);
1034 0 : step2[1] = WRAPLOW(step1[1] + step1[6]);
1035 0 : step2[2] = WRAPLOW(step1[2] + step1[5]);
1036 0 : step2[3] = WRAPLOW(step1[3] + step1[4]);
1037 0 : step2[4] = WRAPLOW(step1[3] - step1[4]);
1038 0 : step2[5] = WRAPLOW(step1[2] - step1[5]);
1039 0 : step2[6] = WRAPLOW(step1[1] - step1[6]);
1040 0 : step2[7] = WRAPLOW(step1[0] - step1[7]);
1041 0 : step2[8] = step1[8];
1042 0 : step2[9] = step1[9];
1043 0 : temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1044 0 : temp2 = (step1[10] + step1[13]) * cospi_16_64;
1045 0 : step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1046 0 : step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1047 0 : temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1048 0 : temp2 = (step1[11] + step1[12]) * cospi_16_64;
1049 0 : step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1050 0 : step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1051 0 : step2[14] = step1[14];
1052 0 : step2[15] = step1[15];
1053 :
1054 0 : step2[16] = WRAPLOW(step1[16] + step1[23]);
1055 0 : step2[17] = WRAPLOW(step1[17] + step1[22]);
1056 0 : step2[18] = WRAPLOW(step1[18] + step1[21]);
1057 0 : step2[19] = WRAPLOW(step1[19] + step1[20]);
1058 0 : step2[20] = WRAPLOW(step1[19] - step1[20]);
1059 0 : step2[21] = WRAPLOW(step1[18] - step1[21]);
1060 0 : step2[22] = WRAPLOW(step1[17] - step1[22]);
1061 0 : step2[23] = WRAPLOW(step1[16] - step1[23]);
1062 :
1063 0 : step2[24] = WRAPLOW(-step1[24] + step1[31]);
1064 0 : step2[25] = WRAPLOW(-step1[25] + step1[30]);
1065 0 : step2[26] = WRAPLOW(-step1[26] + step1[29]);
1066 0 : step2[27] = WRAPLOW(-step1[27] + step1[28]);
1067 0 : step2[28] = WRAPLOW(step1[27] + step1[28]);
1068 0 : step2[29] = WRAPLOW(step1[26] + step1[29]);
1069 0 : step2[30] = WRAPLOW(step1[25] + step1[30]);
1070 0 : step2[31] = WRAPLOW(step1[24] + step1[31]);
1071 :
1072 : // stage 7
1073 0 : step1[0] = WRAPLOW(step2[0] + step2[15]);
1074 0 : step1[1] = WRAPLOW(step2[1] + step2[14]);
1075 0 : step1[2] = WRAPLOW(step2[2] + step2[13]);
1076 0 : step1[3] = WRAPLOW(step2[3] + step2[12]);
1077 0 : step1[4] = WRAPLOW(step2[4] + step2[11]);
1078 0 : step1[5] = WRAPLOW(step2[5] + step2[10]);
1079 0 : step1[6] = WRAPLOW(step2[6] + step2[9]);
1080 0 : step1[7] = WRAPLOW(step2[7] + step2[8]);
1081 0 : step1[8] = WRAPLOW(step2[7] - step2[8]);
1082 0 : step1[9] = WRAPLOW(step2[6] - step2[9]);
1083 0 : step1[10] = WRAPLOW(step2[5] - step2[10]);
1084 0 : step1[11] = WRAPLOW(step2[4] - step2[11]);
1085 0 : step1[12] = WRAPLOW(step2[3] - step2[12]);
1086 0 : step1[13] = WRAPLOW(step2[2] - step2[13]);
1087 0 : step1[14] = WRAPLOW(step2[1] - step2[14]);
1088 0 : step1[15] = WRAPLOW(step2[0] - step2[15]);
1089 :
1090 0 : step1[16] = step2[16];
1091 0 : step1[17] = step2[17];
1092 0 : step1[18] = step2[18];
1093 0 : step1[19] = step2[19];
1094 0 : temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1095 0 : temp2 = (step2[20] + step2[27]) * cospi_16_64;
1096 0 : step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1097 0 : step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1098 0 : temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1099 0 : temp2 = (step2[21] + step2[26]) * cospi_16_64;
1100 0 : step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1101 0 : step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1102 0 : temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1103 0 : temp2 = (step2[22] + step2[25]) * cospi_16_64;
1104 0 : step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1105 0 : step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1106 0 : temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1107 0 : temp2 = (step2[23] + step2[24]) * cospi_16_64;
1108 0 : step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1109 0 : step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1110 0 : step1[28] = step2[28];
1111 0 : step1[29] = step2[29];
1112 0 : step1[30] = step2[30];
1113 0 : step1[31] = step2[31];
1114 :
1115 : // final stage
1116 0 : output[0] = WRAPLOW(step1[0] + step1[31]);
1117 0 : output[1] = WRAPLOW(step1[1] + step1[30]);
1118 0 : output[2] = WRAPLOW(step1[2] + step1[29]);
1119 0 : output[3] = WRAPLOW(step1[3] + step1[28]);
1120 0 : output[4] = WRAPLOW(step1[4] + step1[27]);
1121 0 : output[5] = WRAPLOW(step1[5] + step1[26]);
1122 0 : output[6] = WRAPLOW(step1[6] + step1[25]);
1123 0 : output[7] = WRAPLOW(step1[7] + step1[24]);
1124 0 : output[8] = WRAPLOW(step1[8] + step1[23]);
1125 0 : output[9] = WRAPLOW(step1[9] + step1[22]);
1126 0 : output[10] = WRAPLOW(step1[10] + step1[21]);
1127 0 : output[11] = WRAPLOW(step1[11] + step1[20]);
1128 0 : output[12] = WRAPLOW(step1[12] + step1[19]);
1129 0 : output[13] = WRAPLOW(step1[13] + step1[18]);
1130 0 : output[14] = WRAPLOW(step1[14] + step1[17]);
1131 0 : output[15] = WRAPLOW(step1[15] + step1[16]);
1132 0 : output[16] = WRAPLOW(step1[15] - step1[16]);
1133 0 : output[17] = WRAPLOW(step1[14] - step1[17]);
1134 0 : output[18] = WRAPLOW(step1[13] - step1[18]);
1135 0 : output[19] = WRAPLOW(step1[12] - step1[19]);
1136 0 : output[20] = WRAPLOW(step1[11] - step1[20]);
1137 0 : output[21] = WRAPLOW(step1[10] - step1[21]);
1138 0 : output[22] = WRAPLOW(step1[9] - step1[22]);
1139 0 : output[23] = WRAPLOW(step1[8] - step1[23]);
1140 0 : output[24] = WRAPLOW(step1[7] - step1[24]);
1141 0 : output[25] = WRAPLOW(step1[6] - step1[25]);
1142 0 : output[26] = WRAPLOW(step1[5] - step1[26]);
1143 0 : output[27] = WRAPLOW(step1[4] - step1[27]);
1144 0 : output[28] = WRAPLOW(step1[3] - step1[28]);
1145 0 : output[29] = WRAPLOW(step1[2] - step1[29]);
1146 0 : output[30] = WRAPLOW(step1[1] - step1[30]);
1147 0 : output[31] = WRAPLOW(step1[0] - step1[31]);
1148 0 : }
1149 :
1150 0 : void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1151 : int stride) {
1152 : int i, j;
1153 : tran_low_t out[32 * 32];
1154 0 : tran_low_t *outptr = out;
1155 : tran_low_t temp_in[32], temp_out[32];
1156 :
1157 : // Rows
1158 0 : for (i = 0; i < 32; ++i) {
1159 : int16_t zero_coeff[16];
1160 0 : for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1161 0 : for (j = 0; j < 8; ++j)
1162 0 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1163 0 : for (j = 0; j < 4; ++j)
1164 0 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1165 0 : for (j = 0; j < 2; ++j)
1166 0 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1167 :
1168 0 : if (zero_coeff[0] | zero_coeff[1])
1169 0 : idct32_c(input, outptr);
1170 : else
1171 0 : memset(outptr, 0, sizeof(tran_low_t) * 32);
1172 0 : input += 32;
1173 0 : outptr += 32;
1174 : }
1175 :
1176 : // Columns
1177 0 : for (i = 0; i < 32; ++i) {
1178 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1179 0 : idct32_c(temp_in, temp_out);
1180 0 : for (j = 0; j < 32; ++j) {
1181 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1182 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
1183 : }
1184 : }
1185 0 : }
1186 :
1187 0 : void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
1188 : int stride) {
1189 : int i, j;
1190 0 : tran_low_t out[32 * 32] = { 0 };
1191 0 : tran_low_t *outptr = out;
1192 : tran_low_t temp_in[32], temp_out[32];
1193 :
1194 : // Rows
1195 : // Only upper-left 16x16 has non-zero coeff
1196 0 : for (i = 0; i < 16; ++i) {
1197 0 : idct32_c(input, outptr);
1198 0 : input += 32;
1199 0 : outptr += 32;
1200 : }
1201 :
1202 : // Columns
1203 0 : for (i = 0; i < 32; ++i) {
1204 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1205 0 : idct32_c(temp_in, temp_out);
1206 0 : for (j = 0; j < 32; ++j) {
1207 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1208 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
1209 : }
1210 : }
1211 0 : }
1212 :
1213 0 : void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1214 : int stride) {
1215 : int i, j;
1216 0 : tran_low_t out[32 * 32] = { 0 };
1217 0 : tran_low_t *outptr = out;
1218 : tran_low_t temp_in[32], temp_out[32];
1219 :
1220 : // Rows
1221 : // Only upper-left 8x8 has non-zero coeff
1222 0 : for (i = 0; i < 8; ++i) {
1223 0 : idct32_c(input, outptr);
1224 0 : input += 32;
1225 0 : outptr += 32;
1226 : }
1227 :
1228 : // Columns
1229 0 : for (i = 0; i < 32; ++i) {
1230 0 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1231 0 : idct32_c(temp_in, temp_out);
1232 0 : for (j = 0; j < 32; ++j) {
1233 0 : dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1234 0 : ROUND_POWER_OF_TWO(temp_out[j], 6));
1235 : }
1236 : }
1237 0 : }
1238 :
1239 0 : void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1240 : int i, j;
1241 : tran_high_t a1;
1242 0 : tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1243 :
1244 0 : out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1245 0 : a1 = ROUND_POWER_OF_TWO(out, 6);
1246 :
1247 0 : for (j = 0; j < 32; ++j) {
1248 0 : for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1249 0 : dest += stride;
1250 : }
1251 0 : }
1252 :
1253 : #if CONFIG_VP9_HIGHBITDEPTH
1254 :
1255 : // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1256 : // transform amplify bits + 1 bit for contingency in rounding and quantizing
1257 : #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1258 :
1259 : static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1260 : int size) {
1261 : int i;
1262 : for (i = 0; i < size; ++i)
1263 : if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1264 : return 0;
1265 : }
1266 :
// High-bitdepth 4x4 inverse Walsh-Hadamard transform (all 16 coefficients),
// with the reconstructed residual added into the 16-bit destination and
// clipped to the pixel range for bit depth `bd`. `dest8` is a CONVERT_TO_*
// wrapped pointer to uint16_t pixels; `stride` is in pixels.
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // row-transformed coefficients, 4x4 row-major
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // real 16-bit pixel pointer

  // Pass 1: inverse WHT on each of the 4 rows into output[].
  // Note the load order (0,1,2,3) -> (a1,c1,d1,b1) is part of the lifting
  // scheme, not an accident.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared half-sum used by both lifting steps
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Pass 2: inverse WHT down each of the 4 columns of output[], adding the
  // result into the destination with clipping.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);

    ip++;
    dest++;
  }
}
1324 :
1325 : void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1326 : int stride, int bd) {
1327 : int i;
1328 : tran_high_t a1, e1;
1329 : tran_low_t tmp[4];
1330 : const tran_low_t *ip = in;
1331 : tran_low_t *op = tmp;
1332 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1333 : (void)bd;
1334 :
1335 : a1 = ip[0] >> UNIT_QUANT_SHIFT;
1336 : e1 = a1 >> 1;
1337 : a1 -= e1;
1338 : op[0] = HIGHBD_WRAPLOW(a1, bd);
1339 : op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1340 :
1341 : ip = tmp;
1342 : for (i = 0; i < 4; i++) {
1343 : e1 = ip[0] >> 1;
1344 : a1 = ip[0] - e1;
1345 : dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1346 : dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
1347 : dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
1348 : dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
1349 : ip++;
1350 : dest++;
1351 : }
1352 : }
1353 :
// 1-D 4-point inverse DCT for high-bitdepth coefficients. Reads 4
// coefficients from `input` and writes 4 values to `output`; coefficients
// outside the valid transform magnitude range yield an all-zero output
// instead of propagating overflow.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void)bd;  // bd may be unused depending on how HIGHBD_WRAPLOW is defined

  // Range-check the input; zero the output on invalid data (and assert in
  // range-checking builds).
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // stage 1: butterfly on the even pair (input[0], input[2]) and rotation
  // on the odd pair (input[1], input[3]).
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2: final butterfly combining the even and odd halves.
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
1383 :
1384 : void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1385 : int stride, int bd) {
1386 : int i, j;
1387 : tran_low_t out[4 * 4];
1388 : tran_low_t *outptr = out;
1389 : tran_low_t temp_in[4], temp_out[4];
1390 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1391 :
1392 : // Rows
1393 : for (i = 0; i < 4; ++i) {
1394 : vpx_highbd_idct4_c(input, outptr, bd);
1395 : input += 4;
1396 : outptr += 4;
1397 : }
1398 :
1399 : // Columns
1400 : for (i = 0; i < 4; ++i) {
1401 : for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
1402 : vpx_highbd_idct4_c(temp_in, temp_out, bd);
1403 : for (j = 0; j < 4; ++j) {
1404 : dest[j * stride + i] = highbd_clip_pixel_add(
1405 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1406 : }
1407 : }
1408 : }
1409 :
1410 : void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1411 : int stride, int bd) {
1412 : int i;
1413 : tran_high_t a1;
1414 : tran_low_t out =
1415 : HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1416 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1417 :
1418 : out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1419 : a1 = ROUND_POWER_OF_TWO(out, 4);
1420 :
1421 : for (i = 0; i < 4; i++) {
1422 : dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1423 : dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1424 : dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1425 : dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1426 : dest += stride;
1427 : }
1428 : }
1429 :
// 1-D 8-point inverse DCT for high-bitdepth coefficients. Reads 8
// coefficients from `input` and writes 8 values to `output`; coefficients
// outside the valid transform magnitude range yield an all-zero output.
// The even half (inputs 0, 2, 4, 6) is handled by reusing the 4-point
// transform; the odd half is computed as a separate butterfly network.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // Range-check the input; zero the output on invalid data (and assert in
  // range-checking builds).
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1
  // Even inputs are reordered into step1[0..3] as expected by
  // vpx_highbd_idct4_c below; odd inputs feed two rotations.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  // In-place 4-point inverse DCT on step1[0..3].
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4: final butterfly combining the even and odd halves.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
1483 :
1484 : void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1485 : int stride, int bd) {
1486 : int i, j;
1487 : tran_low_t out[8 * 8];
1488 : tran_low_t *outptr = out;
1489 : tran_low_t temp_in[8], temp_out[8];
1490 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1491 :
1492 : // First transform rows
1493 : for (i = 0; i < 8; ++i) {
1494 : vpx_highbd_idct8_c(input, outptr, bd);
1495 : input += 8;
1496 : outptr += 8;
1497 : }
1498 :
1499 : // Then transform columns
1500 : for (i = 0; i < 8; ++i) {
1501 : for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1502 : vpx_highbd_idct8_c(temp_in, temp_out, bd);
1503 : for (j = 0; j < 8; ++j) {
1504 : dest[j * stride + i] = highbd_clip_pixel_add(
1505 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1506 : }
1507 : }
1508 : }
1509 :
1510 : void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1511 : int stride, int bd) {
1512 : int i, j;
1513 : tran_high_t a1;
1514 : tran_low_t out =
1515 : HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1516 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1517 :
1518 : out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1519 : a1 = ROUND_POWER_OF_TWO(out, 5);
1520 : for (j = 0; j < 8; ++j) {
1521 : for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1522 : dest += stride;
1523 : }
1524 : }
1525 :
// 1-D 4-point inverse ADST (asymmetric discrete sine transform) for
// high-bitdepth coefficients. Reads 4 coefficients from `input` and writes
// 4 values to `output`; out-of-range or all-zero input short-circuits to an
// all-zero output.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void)bd;  // bd may be unused depending on how HIGHBD_WRAPLOW is defined

  // Range-check the input; zero the output on invalid data.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Fast path: all-zero input produces all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Products of inputs with the sinpi basis constants.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
1570 :
// 1-D 8-point inverse ADST for high-bitdepth coefficients, computed as a
// three-stage butterfly/rotation network. Reads 8 coefficients from `input`
// and writes 8 values to `output`; out-of-range or all-zero input
// short-circuits to an all-zero output.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // Inputs are consumed in the ADST's permuted order, not sequentially.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;  // bd may be unused depending on how HIGHBD_WRAPLOW is defined

  // Range-check the input; zero the output on invalid data.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // Fast path: all-zero input produces all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: four rotations pairing (x0,x1), (x2,x3), (x4,x5), (x6,x7).
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2: the first four terms pass through; the last four are rotated.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3: final cospi_16_64 scalings of the middle pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output in permuted order with alternating sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1654 :
1655 : void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
1656 : int stride, int bd) {
1657 : int i, j;
1658 : tran_low_t out[8 * 8] = { 0 };
1659 : tran_low_t *outptr = out;
1660 : tran_low_t temp_in[8], temp_out[8];
1661 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1662 :
1663 : // First transform rows
1664 : // Only first 4 row has non-zero coefs
1665 : for (i = 0; i < 4; ++i) {
1666 : vpx_highbd_idct8_c(input, outptr, bd);
1667 : input += 8;
1668 : outptr += 8;
1669 : }
1670 :
1671 : // Then transform columns
1672 : for (i = 0; i < 8; ++i) {
1673 : for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1674 : vpx_highbd_idct8_c(temp_in, temp_out, bd);
1675 : for (j = 0; j < 8; ++j) {
1676 : dest[j * stride + i] = highbd_clip_pixel_add(
1677 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1678 : }
1679 : }
1680 : }
1681 :
// Performs a 16-point 1-D inverse DCT on one row or column of
// coefficients, writing 16 reconstructed values to |output|. The
// transform is a seven-stage butterfly network; cross-multiplied terms
// are normalized with dct_const_round_shift() and every intermediate
// result is kept in range with HIGHBD_WRAPLOW().
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  // bd is only consumed through the HIGHBD_WRAPLOW() macro; this cast
  // silences the unused-parameter warning in builds where the macro
  // ignores it.
  (void)bd;

  // Out-of-range coefficients would overflow the fixed-point arithmetic
  // below, so emit an all-zero output instead of computing with them.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1
  // Gather inputs in butterfly (bit-reversed) order; the indices are
  // written as the corresponding 32-point positions divided by two.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2
  // Lower half passes through; upper half is rotated pairwise by the
  // odd cospi constants.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // Butterfly add/subtract pairs on the upper half.
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // Final butterfly: sums in the first half, mirrored differences in
  // the second half.
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
1855 :
1856 : void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
1857 : int stride, int bd) {
1858 : int i, j;
1859 : tran_low_t out[16 * 16];
1860 : tran_low_t *outptr = out;
1861 : tran_low_t temp_in[16], temp_out[16];
1862 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1863 :
1864 : // First transform rows
1865 : for (i = 0; i < 16; ++i) {
1866 : vpx_highbd_idct16_c(input, outptr, bd);
1867 : input += 16;
1868 : outptr += 16;
1869 : }
1870 :
1871 : // Then transform columns
1872 : for (i = 0; i < 16; ++i) {
1873 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
1874 : vpx_highbd_idct16_c(temp_in, temp_out, bd);
1875 : for (j = 0; j < 16; ++j) {
1876 : dest[j * stride + i] = highbd_clip_pixel_add(
1877 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
1878 : }
1879 : }
1880 : }
1881 :
// Performs a 16-point 1-D inverse ADST (asymmetric discrete sine
// transform) on one row or column of coefficients. Four butterfly
// stages of pairwise rotations by cospi constants; products are
// normalized with dct_const_round_shift() and intermediates clamped
// with HIGHBD_WRAPLOW().
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Gather the inputs in the interleaved order required by the ADST
  // flow graph (alternating high/low indices).
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // bd is only consumed through the HIGHBD_WRAPLOW() macro; this cast
  // silences the unused-parameter warning in builds where the macro
  // ignores it.
  (void)bd;

  // Out-of-range coefficients would overflow the fixed-point arithmetic
  // below, so emit an all-zero output instead of computing with them.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // Shortcut: an all-zero input transforms to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2
  // First half passes through; second half is rotated pairwise.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4
  // Final +/-45-degree rotations (scaled by cospi_16_64).
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  // Scatter the results into output order, negating alternate terms.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
2058 :
2059 : void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2060 : int stride, int bd) {
2061 : int i, j;
2062 : tran_low_t out[16 * 16] = { 0 };
2063 : tran_low_t *outptr = out;
2064 : tran_low_t temp_in[16], temp_out[16];
2065 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2066 :
2067 : // First transform rows. Since all non-zero dct coefficients are in
2068 : // upper-left 4x4 area, we only need to calculate first 4 rows here.
2069 : for (i = 0; i < 4; ++i) {
2070 : vpx_highbd_idct16_c(input, outptr, bd);
2071 : input += 16;
2072 : outptr += 16;
2073 : }
2074 :
2075 : // Then transform columns
2076 : for (i = 0; i < 16; ++i) {
2077 : for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2078 : vpx_highbd_idct16_c(temp_in, temp_out, bd);
2079 : for (j = 0; j < 16; ++j) {
2080 : dest[j * stride + i] = highbd_clip_pixel_add(
2081 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2082 : }
2083 : }
2084 : }
2085 :
2086 : void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2087 : int stride, int bd) {
2088 : int i, j;
2089 : tran_high_t a1;
2090 : tran_low_t out =
2091 : HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2092 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2093 :
2094 : out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2095 : a1 = ROUND_POWER_OF_TWO(out, 6);
2096 : for (j = 0; j < 16; ++j) {
2097 : for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2098 : dest += stride;
2099 : }
2100 : }
2101 :
2102 : static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2103 : int bd) {
2104 : tran_low_t step1[32], step2[32];
2105 : tran_high_t temp1, temp2;
2106 : (void)bd;
2107 :
2108 : if (detect_invalid_highbd_input(input, 32)) {
2109 : #if CONFIG_COEFFICIENT_RANGE_CHECKING
2110 : assert(0 && "invalid highbd txfm input");
2111 : #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
2112 : memset(output, 0, sizeof(*output) * 32);
2113 : return;
2114 : }
2115 :
2116 : // stage 1
2117 : step1[0] = input[0];
2118 : step1[1] = input[16];
2119 : step1[2] = input[8];
2120 : step1[3] = input[24];
2121 : step1[4] = input[4];
2122 : step1[5] = input[20];
2123 : step1[6] = input[12];
2124 : step1[7] = input[28];
2125 : step1[8] = input[2];
2126 : step1[9] = input[18];
2127 : step1[10] = input[10];
2128 : step1[11] = input[26];
2129 : step1[12] = input[6];
2130 : step1[13] = input[22];
2131 : step1[14] = input[14];
2132 : step1[15] = input[30];
2133 :
2134 : temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2135 : temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2136 : step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2137 : step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2138 :
2139 : temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2140 : temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2141 : step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2142 : step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2143 :
2144 : temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2145 : temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2146 : step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2147 : step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2148 :
2149 : temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2150 : temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2151 : step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2152 : step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2153 :
2154 : temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2155 : temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2156 : step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2157 : step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2158 :
2159 : temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2160 : temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2161 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2162 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2163 :
2164 : temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2165 : temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2166 : step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2167 : step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2168 :
2169 : temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2170 : temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2171 : step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2172 : step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2173 :
2174 : // stage 2
2175 : step2[0] = step1[0];
2176 : step2[1] = step1[1];
2177 : step2[2] = step1[2];
2178 : step2[3] = step1[3];
2179 : step2[4] = step1[4];
2180 : step2[5] = step1[5];
2181 : step2[6] = step1[6];
2182 : step2[7] = step1[7];
2183 :
2184 : temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2185 : temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2186 : step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2187 : step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2188 :
2189 : temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2190 : temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2191 : step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2192 : step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2193 :
2194 : temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2195 : temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2196 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2197 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2198 :
2199 : temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2200 : temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2201 : step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2202 : step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2203 :
2204 : step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2205 : step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2206 : step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2207 : step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2208 : step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2209 : step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2210 : step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2211 : step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2212 : step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2213 : step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2214 : step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2215 : step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2216 : step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2217 : step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2218 : step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2219 : step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2220 :
2221 : // stage 3
2222 : step1[0] = step2[0];
2223 : step1[1] = step2[1];
2224 : step1[2] = step2[2];
2225 : step1[3] = step2[3];
2226 :
2227 : temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2228 : temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2229 : step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230 : step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231 : temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2232 : temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2233 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2234 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2235 :
2236 : step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2237 : step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2238 : step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2239 : step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2240 : step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2241 : step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2242 : step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2243 : step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2244 :
2245 : step1[16] = step2[16];
2246 : step1[31] = step2[31];
2247 : temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2248 : temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2249 : step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2250 : step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2251 : temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2252 : temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2253 : step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2254 : step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2255 : step1[19] = step2[19];
2256 : step1[20] = step2[20];
2257 : temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2258 : temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2259 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2260 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2261 : temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2262 : temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2263 : step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2264 : step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2265 : step1[23] = step2[23];
2266 : step1[24] = step2[24];
2267 : step1[27] = step2[27];
2268 : step1[28] = step2[28];
2269 :
2270 : // stage 4
2271 : temp1 = (step1[0] + step1[1]) * cospi_16_64;
2272 : temp2 = (step1[0] - step1[1]) * cospi_16_64;
2273 : step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2274 : step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2275 : temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2276 : temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2277 : step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2278 : step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2279 : step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2280 : step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2281 : step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2282 : step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2283 :
2284 : step2[8] = step1[8];
2285 : step2[15] = step1[15];
2286 : temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2287 : temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2288 : step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289 : step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290 : temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2291 : temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2292 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2293 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2294 : step2[11] = step1[11];
2295 : step2[12] = step1[12];
2296 :
2297 : step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2298 : step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2299 : step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2300 : step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2301 : step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2302 : step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2303 : step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2304 : step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2305 :
2306 : step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2307 : step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2308 : step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2309 : step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2310 : step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2311 : step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2312 : step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2313 : step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2314 :
2315 : // stage 5
2316 : step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2317 : step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2318 : step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2319 : step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2320 : step1[4] = step2[4];
2321 : temp1 = (step2[6] - step2[5]) * cospi_16_64;
2322 : temp2 = (step2[5] + step2[6]) * cospi_16_64;
2323 : step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2324 : step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2325 : step1[7] = step2[7];
2326 :
2327 : step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2328 : step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2329 : step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2330 : step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2331 : step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2332 : step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2333 : step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2334 : step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2335 :
2336 : step1[16] = step2[16];
2337 : step1[17] = step2[17];
2338 : temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2339 : temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2340 : step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2341 : step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2342 : temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2343 : temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2344 : step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2345 : step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2346 : temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2347 : temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2348 : step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2349 : step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2350 : temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2351 : temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2352 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2353 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2354 : step1[22] = step2[22];
2355 : step1[23] = step2[23];
2356 : step1[24] = step2[24];
2357 : step1[25] = step2[25];
2358 : step1[30] = step2[30];
2359 : step1[31] = step2[31];
2360 :
2361 : // stage 6
2362 : step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2363 : step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2364 : step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2365 : step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2366 : step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2367 : step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2368 : step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2369 : step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2370 : step2[8] = step1[8];
2371 : step2[9] = step1[9];
2372 : temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2373 : temp2 = (step1[10] + step1[13]) * cospi_16_64;
2374 : step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2375 : step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2376 : temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2377 : temp2 = (step1[11] + step1[12]) * cospi_16_64;
2378 : step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379 : step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380 : step2[14] = step1[14];
2381 : step2[15] = step1[15];
2382 :
2383 : step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2384 : step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2385 : step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2386 : step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2387 : step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2388 : step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2389 : step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2390 : step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2391 :
2392 : step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2393 : step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2394 : step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2395 : step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2396 : step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2397 : step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2398 : step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2399 : step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2400 :
2401 : // stage 7
2402 : step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2403 : step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2404 : step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2405 : step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2406 : step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2407 : step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2408 : step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2409 : step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2410 : step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2411 : step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2412 : step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2413 : step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2414 : step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2415 : step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2416 : step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2417 : step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2418 :
2419 : step1[16] = step2[16];
2420 : step1[17] = step2[17];
2421 : step1[18] = step2[18];
2422 : step1[19] = step2[19];
2423 : temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2424 : temp2 = (step2[20] + step2[27]) * cospi_16_64;
2425 : step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2426 : step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2427 : temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2428 : temp2 = (step2[21] + step2[26]) * cospi_16_64;
2429 : step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2430 : step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2431 : temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2432 : temp2 = (step2[22] + step2[25]) * cospi_16_64;
2433 : step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2434 : step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2435 : temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2436 : temp2 = (step2[23] + step2[24]) * cospi_16_64;
2437 : step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2438 : step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2439 : step1[28] = step2[28];
2440 : step1[29] = step2[29];
2441 : step1[30] = step2[30];
2442 : step1[31] = step2[31];
2443 :
2444 : // final stage
2445 : output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2446 : output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2447 : output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2448 : output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2449 : output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2450 : output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2451 : output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2452 : output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2453 : output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2454 : output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2455 : output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2456 : output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2457 : output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2458 : output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2459 : output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2460 : output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2461 : output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2462 : output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2463 : output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2464 : output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2465 : output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2466 : output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2467 : output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2468 : output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2469 : output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2470 : output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2471 : output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2472 : output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2473 : output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2474 : output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2475 : output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2476 : output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2477 : }
2478 :
2479 : void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2480 : int stride, int bd) {
2481 : int i, j;
2482 : tran_low_t out[32 * 32];
2483 : tran_low_t *outptr = out;
2484 : tran_low_t temp_in[32], temp_out[32];
2485 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2486 :
2487 : // Rows
2488 : for (i = 0; i < 32; ++i) {
2489 : tran_low_t zero_coeff[16];
2490 : for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2491 : for (j = 0; j < 8; ++j)
2492 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2493 : for (j = 0; j < 4; ++j)
2494 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2495 : for (j = 0; j < 2; ++j)
2496 : zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2497 :
2498 : if (zero_coeff[0] | zero_coeff[1])
2499 : highbd_idct32_c(input, outptr, bd);
2500 : else
2501 : memset(outptr, 0, sizeof(tran_low_t) * 32);
2502 : input += 32;
2503 : outptr += 32;
2504 : }
2505 :
2506 : // Columns
2507 : for (i = 0; i < 32; ++i) {
2508 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2509 : highbd_idct32_c(temp_in, temp_out, bd);
2510 : for (j = 0; j < 32; ++j) {
2511 : dest[j * stride + i] = highbd_clip_pixel_add(
2512 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2513 : }
2514 : }
2515 : }
2516 :
2517 : void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2518 : int stride, int bd) {
2519 : int i, j;
2520 : tran_low_t out[32 * 32] = { 0 };
2521 : tran_low_t *outptr = out;
2522 : tran_low_t temp_in[32], temp_out[32];
2523 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2524 :
2525 : // Rows
2526 : // Only upper-left 8x8 has non-zero coeff
2527 : for (i = 0; i < 8; ++i) {
2528 : highbd_idct32_c(input, outptr, bd);
2529 : input += 32;
2530 : outptr += 32;
2531 : }
2532 :
2533 : // Columns
2534 : for (i = 0; i < 32; ++i) {
2535 : for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2536 : highbd_idct32_c(temp_in, temp_out, bd);
2537 : for (j = 0; j < 32; ++j) {
2538 : dest[j * stride + i] = highbd_clip_pixel_add(
2539 : dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2540 : }
2541 : }
2542 : }
2543 :
2544 : void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2545 : int stride, int bd) {
2546 : int i, j;
2547 : int a1;
2548 : uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2549 : tran_low_t out =
2550 : HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2551 :
2552 : out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2553 : a1 = ROUND_POWER_OF_TWO(out, 6);
2554 :
2555 : for (j = 0; j < 32; ++j) {
2556 : for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2557 : dest += stride;
2558 : }
2559 : }
2560 :
2561 : #endif // CONFIG_VP9_HIGHBITDEPTH
|