LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp - inv_txfm.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 1825 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 33 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <math.h>
      13             : #include <string.h>
      14             : 
      15             : #include "./aom_dsp_rtcd.h"
      16             : #include "aom_dsp/inv_txfm.h"
      17             : 
      18           0 : void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
      19             :   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
      20             :      0.5 shifts per pixel. */
      21             :   int i;
      22             :   tran_low_t output[16];
      23             :   tran_high_t a1, b1, c1, d1, e1;
      24           0 :   const tran_low_t *ip = input;
      25           0 :   tran_low_t *op = output;
      26             : 
      27           0 :   for (i = 0; i < 4; i++) {
      28           0 :     a1 = ip[0] >> UNIT_QUANT_SHIFT;
      29           0 :     c1 = ip[1] >> UNIT_QUANT_SHIFT;
      30           0 :     d1 = ip[2] >> UNIT_QUANT_SHIFT;
      31           0 :     b1 = ip[3] >> UNIT_QUANT_SHIFT;
      32           0 :     a1 += c1;
      33           0 :     d1 -= b1;
      34           0 :     e1 = (a1 - d1) >> 1;
      35           0 :     b1 = e1 - b1;
      36           0 :     c1 = e1 - c1;
      37           0 :     a1 -= b1;
      38           0 :     d1 += c1;
      39           0 :     op[0] = WRAPLOW(a1);
      40           0 :     op[1] = WRAPLOW(b1);
      41           0 :     op[2] = WRAPLOW(c1);
      42           0 :     op[3] = WRAPLOW(d1);
      43           0 :     ip += 4;
      44           0 :     op += 4;
      45             :   }
      46             : 
      47           0 :   ip = output;
      48           0 :   for (i = 0; i < 4; i++) {
      49           0 :     a1 = ip[4 * 0];
      50           0 :     c1 = ip[4 * 1];
      51           0 :     d1 = ip[4 * 2];
      52           0 :     b1 = ip[4 * 3];
      53           0 :     a1 += c1;
      54           0 :     d1 -= b1;
      55           0 :     e1 = (a1 - d1) >> 1;
      56           0 :     b1 = e1 - b1;
      57           0 :     c1 = e1 - c1;
      58           0 :     a1 -= b1;
      59           0 :     d1 += c1;
      60           0 :     dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
      61           0 :     dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
      62           0 :     dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
      63           0 :     dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
      64             : 
      65           0 :     ip++;
      66           0 :     dest++;
      67             :   }
      68           0 : }
      69             : 
      70           0 : void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
      71             :   int i;
      72             :   tran_high_t a1, e1;
      73             :   tran_low_t tmp[4];
      74           0 :   const tran_low_t *ip = in;
      75           0 :   tran_low_t *op = tmp;
      76             : 
      77           0 :   a1 = ip[0] >> UNIT_QUANT_SHIFT;
      78           0 :   e1 = a1 >> 1;
      79           0 :   a1 -= e1;
      80           0 :   op[0] = WRAPLOW(a1);
      81           0 :   op[1] = op[2] = op[3] = WRAPLOW(e1);
      82             : 
      83           0 :   ip = tmp;
      84           0 :   for (i = 0; i < 4; i++) {
      85           0 :     e1 = ip[0] >> 1;
      86           0 :     a1 = ip[0] - e1;
      87           0 :     dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
      88           0 :     dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
      89           0 :     dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
      90           0 :     dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
      91           0 :     ip++;
      92           0 :     dest++;
      93             :   }
      94           0 : }
      95             : 
      96           0 : void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
      97             :   tran_low_t step[4];
      98             :   tran_high_t temp1, temp2;
      99             :   // stage 1
     100           0 :   temp1 = (input[0] + input[2]) * cospi_16_64;
     101           0 :   temp2 = (input[0] - input[2]) * cospi_16_64;
     102           0 :   step[0] = WRAPLOW(dct_const_round_shift(temp1));
     103           0 :   step[1] = WRAPLOW(dct_const_round_shift(temp2));
     104           0 :   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
     105           0 :   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
     106           0 :   step[2] = WRAPLOW(dct_const_round_shift(temp1));
     107           0 :   step[3] = WRAPLOW(dct_const_round_shift(temp2));
     108             : 
     109             :   // stage 2
     110           0 :   output[0] = WRAPLOW(step[0] + step[3]);
     111           0 :   output[1] = WRAPLOW(step[1] + step[2]);
     112           0 :   output[2] = WRAPLOW(step[1] - step[2]);
     113           0 :   output[3] = WRAPLOW(step[0] - step[3]);
     114           0 : }
     115             : 
     116           0 : void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     117             :   tran_low_t out[4 * 4];
     118           0 :   tran_low_t *outptr = out;
     119             :   int i, j;
     120             :   tran_low_t temp_in[4], temp_out[4];
     121             : 
     122             :   // Rows
     123           0 :   for (i = 0; i < 4; ++i) {
     124           0 :     aom_idct4_c(input, outptr);
     125           0 :     input += 4;
     126           0 :     outptr += 4;
     127             :   }
     128             : 
     129             :   // Columns
     130           0 :   for (i = 0; i < 4; ++i) {
     131           0 :     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
     132           0 :     aom_idct4_c(temp_in, temp_out);
     133           0 :     for (j = 0; j < 4; ++j) {
     134           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     135           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 4));
     136             :     }
     137             :   }
     138           0 : }
     139             : 
     140           0 : void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
     141             :                          int dest_stride) {
     142             :   int i;
     143             :   tran_high_t a1;
     144           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
     145           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
     146           0 :   a1 = ROUND_POWER_OF_TWO(out, 4);
     147             : 
     148           0 :   if (a1 == 0) return;
     149             : 
     150           0 :   for (i = 0; i < 4; i++) {
     151           0 :     dest[0] = clip_pixel_add(dest[0], a1);
     152           0 :     dest[1] = clip_pixel_add(dest[1], a1);
     153           0 :     dest[2] = clip_pixel_add(dest[2], a1);
     154           0 :     dest[3] = clip_pixel_add(dest[3], a1);
     155           0 :     dest += dest_stride;
     156             :   }
     157             : }
     158             : 
     159           0 : void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
     160             :   tran_low_t step1[8], step2[8];
     161             :   tran_high_t temp1, temp2;
     162             :   // stage 1
     163           0 :   step1[0] = input[0];
     164           0 :   step1[2] = input[4];
     165           0 :   step1[1] = input[2];
     166           0 :   step1[3] = input[6];
     167           0 :   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
     168           0 :   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
     169           0 :   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
     170           0 :   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
     171           0 :   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
     172           0 :   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
     173           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     174           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     175             : 
     176             :   // stage 2
     177           0 :   temp1 = (step1[0] + step1[2]) * cospi_16_64;
     178           0 :   temp2 = (step1[0] - step1[2]) * cospi_16_64;
     179           0 :   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
     180           0 :   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
     181           0 :   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
     182           0 :   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
     183           0 :   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
     184           0 :   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
     185           0 :   step2[4] = WRAPLOW(step1[4] + step1[5]);
     186           0 :   step2[5] = WRAPLOW(step1[4] - step1[5]);
     187           0 :   step2[6] = WRAPLOW(-step1[6] + step1[7]);
     188           0 :   step2[7] = WRAPLOW(step1[6] + step1[7]);
     189             : 
     190             :   // stage 3
     191           0 :   step1[0] = WRAPLOW(step2[0] + step2[3]);
     192           0 :   step1[1] = WRAPLOW(step2[1] + step2[2]);
     193           0 :   step1[2] = WRAPLOW(step2[1] - step2[2]);
     194           0 :   step1[3] = WRAPLOW(step2[0] - step2[3]);
     195           0 :   step1[4] = step2[4];
     196           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
     197           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
     198           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     199           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     200           0 :   step1[7] = step2[7];
     201             : 
     202             :   // stage 4
     203           0 :   output[0] = WRAPLOW(step1[0] + step1[7]);
     204           0 :   output[1] = WRAPLOW(step1[1] + step1[6]);
     205           0 :   output[2] = WRAPLOW(step1[2] + step1[5]);
     206           0 :   output[3] = WRAPLOW(step1[3] + step1[4]);
     207           0 :   output[4] = WRAPLOW(step1[3] - step1[4]);
     208           0 :   output[5] = WRAPLOW(step1[2] - step1[5]);
     209           0 :   output[6] = WRAPLOW(step1[1] - step1[6]);
     210           0 :   output[7] = WRAPLOW(step1[0] - step1[7]);
     211           0 : }
     212             : 
     213           0 : void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     214             :   tran_low_t out[8 * 8];
     215           0 :   tran_low_t *outptr = out;
     216             :   int i, j;
     217             :   tran_low_t temp_in[8], temp_out[8];
     218             : 
     219             :   // First transform rows
     220           0 :   for (i = 0; i < 8; ++i) {
     221           0 :     aom_idct8_c(input, outptr);
     222           0 :     input += 8;
     223           0 :     outptr += 8;
     224             :   }
     225             : 
     226             :   // Then transform columns
     227           0 :   for (i = 0; i < 8; ++i) {
     228           0 :     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
     229           0 :     aom_idct8_c(temp_in, temp_out);
     230           0 :     for (j = 0; j < 8; ++j) {
     231           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     232           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
     233             :     }
     234             :   }
     235           0 : }
     236             : 
     237           0 : void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     238             :   int i, j;
     239             :   tran_high_t a1;
     240           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
     241           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
     242           0 :   a1 = ROUND_POWER_OF_TWO(out, 5);
     243           0 :   if (a1 == 0) return;
     244           0 :   for (j = 0; j < 8; ++j) {
     245           0 :     for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
     246           0 :     dest += stride;
     247             :   }
     248             : }
     249             : 
     250           0 : void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
     251             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
     252             : 
     253           0 :   tran_low_t x0 = input[0];
     254           0 :   tran_low_t x1 = input[1];
     255           0 :   tran_low_t x2 = input[2];
     256           0 :   tran_low_t x3 = input[3];
     257             : 
     258           0 :   if (!(x0 | x1 | x2 | x3)) {
     259           0 :     output[0] = output[1] = output[2] = output[3] = 0;
     260           0 :     return;
     261             :   }
     262             : 
     263           0 :   s0 = sinpi_1_9 * x0;
     264           0 :   s1 = sinpi_2_9 * x0;
     265           0 :   s2 = sinpi_3_9 * x1;
     266           0 :   s3 = sinpi_4_9 * x2;
     267           0 :   s4 = sinpi_1_9 * x2;
     268           0 :   s5 = sinpi_2_9 * x3;
     269           0 :   s6 = sinpi_4_9 * x3;
     270           0 :   s7 = WRAPLOW(x0 - x2 + x3);
     271             : 
     272           0 :   s0 = s0 + s3 + s5;
     273           0 :   s1 = s1 - s4 - s6;
     274           0 :   s3 = s2;
     275           0 :   s2 = sinpi_3_9 * s7;
     276             : 
     277             :   // 1-D transform scaling factor is sqrt(2).
     278             :   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
     279             :   // + 1b (addition) = 29b.
     280             :   // Hence, after the rounding shift removes the 14b scaling, the output bit depth is 15b.
     281           0 :   output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
     282           0 :   output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
     283           0 :   output[2] = WRAPLOW(dct_const_round_shift(s2));
     284           0 :   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
     285             : }
     286             : 
     287           0 : void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
     288             :   int s0, s1, s2, s3, s4, s5, s6, s7;
     289             : 
     290           0 :   tran_high_t x0 = input[7];
     291           0 :   tran_high_t x1 = input[0];
     292           0 :   tran_high_t x2 = input[5];
     293           0 :   tran_high_t x3 = input[2];
     294           0 :   tran_high_t x4 = input[3];
     295           0 :   tran_high_t x5 = input[4];
     296           0 :   tran_high_t x6 = input[1];
     297           0 :   tran_high_t x7 = input[6];
     298             : 
     299           0 :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
     300           0 :     output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
     301           0 :         output[6] = output[7] = 0;
     302           0 :     return;
     303             :   }
     304             : 
     305             :   // stage 1
     306           0 :   s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
     307           0 :   s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
     308           0 :   s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
     309           0 :   s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
     310           0 :   s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
     311           0 :   s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
     312           0 :   s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
     313           0 :   s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
     314             : 
     315           0 :   x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
     316           0 :   x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
     317           0 :   x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
     318           0 :   x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
     319           0 :   x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
     320           0 :   x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
     321           0 :   x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
     322           0 :   x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
     323             : 
     324             :   // stage 2
     325           0 :   s0 = (int)x0;
     326           0 :   s1 = (int)x1;
     327           0 :   s2 = (int)x2;
     328           0 :   s3 = (int)x3;
     329           0 :   s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
     330           0 :   s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
     331           0 :   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
     332           0 :   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
     333             : 
     334           0 :   x0 = WRAPLOW(s0 + s2);
     335           0 :   x1 = WRAPLOW(s1 + s3);
     336           0 :   x2 = WRAPLOW(s0 - s2);
     337           0 :   x3 = WRAPLOW(s1 - s3);
     338           0 :   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
     339           0 :   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
     340           0 :   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
     341           0 :   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
     342             : 
     343             :   // stage 3
     344           0 :   s2 = (int)(cospi_16_64 * (x2 + x3));
     345           0 :   s3 = (int)(cospi_16_64 * (x2 - x3));
     346           0 :   s6 = (int)(cospi_16_64 * (x6 + x7));
     347           0 :   s7 = (int)(cospi_16_64 * (x6 - x7));
     348             : 
     349           0 :   x2 = WRAPLOW(dct_const_round_shift(s2));
     350           0 :   x3 = WRAPLOW(dct_const_round_shift(s3));
     351           0 :   x6 = WRAPLOW(dct_const_round_shift(s6));
     352           0 :   x7 = WRAPLOW(dct_const_round_shift(s7));
     353             : 
     354           0 :   output[0] = WRAPLOW(x0);
     355           0 :   output[1] = WRAPLOW(-x4);
     356           0 :   output[2] = WRAPLOW(x6);
     357           0 :   output[3] = WRAPLOW(-x2);
     358           0 :   output[4] = WRAPLOW(x3);
     359           0 :   output[5] = WRAPLOW(-x7);
     360           0 :   output[6] = WRAPLOW(x5);
     361           0 :   output[7] = WRAPLOW(-x1);
     362             : }
     363             : 
     364           0 : void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     365           0 :   tran_low_t out[8 * 8] = { 0 };
     366           0 :   tran_low_t *outptr = out;
     367             :   int i, j;
     368             :   tran_low_t temp_in[8], temp_out[8];
     369             : 
     370             :   // First transform rows
     371             :   // only the first 4 rows have non-zero coefficients
     372           0 :   for (i = 0; i < 4; ++i) {
     373           0 :     aom_idct8_c(input, outptr);
     374           0 :     input += 8;
     375           0 :     outptr += 8;
     376             :   }
     377             : 
     378             :   // Then transform columns
     379           0 :   for (i = 0; i < 8; ++i) {
     380           0 :     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
     381           0 :     aom_idct8_c(temp_in, temp_out);
     382           0 :     for (j = 0; j < 8; ++j) {
     383           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     384           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
     385             :     }
     386             :   }
     387           0 : }
     388             : 
     389           0 : void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
     390             :   tran_low_t step1[16], step2[16];
     391             :   tran_high_t temp1, temp2;
     392             : 
     393             :   // stage 1
     394           0 :   step1[0] = input[0 / 2];
     395           0 :   step1[1] = input[16 / 2];
     396           0 :   step1[2] = input[8 / 2];
     397           0 :   step1[3] = input[24 / 2];
     398           0 :   step1[4] = input[4 / 2];
     399           0 :   step1[5] = input[20 / 2];
     400           0 :   step1[6] = input[12 / 2];
     401           0 :   step1[7] = input[28 / 2];
     402           0 :   step1[8] = input[2 / 2];
     403           0 :   step1[9] = input[18 / 2];
     404           0 :   step1[10] = input[10 / 2];
     405           0 :   step1[11] = input[26 / 2];
     406           0 :   step1[12] = input[6 / 2];
     407           0 :   step1[13] = input[22 / 2];
     408           0 :   step1[14] = input[14 / 2];
     409           0 :   step1[15] = input[30 / 2];
     410             : 
     411             :   // stage 2
     412           0 :   step2[0] = step1[0];
     413           0 :   step2[1] = step1[1];
     414           0 :   step2[2] = step1[2];
     415           0 :   step2[3] = step1[3];
     416           0 :   step2[4] = step1[4];
     417           0 :   step2[5] = step1[5];
     418           0 :   step2[6] = step1[6];
     419           0 :   step2[7] = step1[7];
     420             : 
     421           0 :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
     422           0 :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
     423           0 :   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
     424           0 :   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
     425             : 
     426           0 :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
     427           0 :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
     428           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     429           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     430             : 
     431           0 :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
     432           0 :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
     433           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     434           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     435             : 
     436           0 :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
     437           0 :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
     438           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
     439           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
     440             : 
     441             :   // stage 3
     442           0 :   step1[0] = step2[0];
     443           0 :   step1[1] = step2[1];
     444           0 :   step1[2] = step2[2];
     445           0 :   step1[3] = step2[3];
     446             : 
     447           0 :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
     448           0 :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
     449           0 :   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
     450           0 :   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
     451           0 :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
     452           0 :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
     453           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     454           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     455             : 
     456           0 :   step1[8] = WRAPLOW(step2[8] + step2[9]);
     457           0 :   step1[9] = WRAPLOW(step2[8] - step2[9]);
     458           0 :   step1[10] = WRAPLOW(-step2[10] + step2[11]);
     459           0 :   step1[11] = WRAPLOW(step2[10] + step2[11]);
     460           0 :   step1[12] = WRAPLOW(step2[12] + step2[13]);
     461           0 :   step1[13] = WRAPLOW(step2[12] - step2[13]);
     462           0 :   step1[14] = WRAPLOW(-step2[14] + step2[15]);
     463           0 :   step1[15] = WRAPLOW(step2[14] + step2[15]);
     464             : 
     465             :   // stage 4
     466           0 :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
     467           0 :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
     468           0 :   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
     469           0 :   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
     470           0 :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
     471           0 :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
     472           0 :   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
     473           0 :   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
     474           0 :   step2[4] = WRAPLOW(step1[4] + step1[5]);
     475           0 :   step2[5] = WRAPLOW(step1[4] - step1[5]);
     476           0 :   step2[6] = WRAPLOW(-step1[6] + step1[7]);
     477           0 :   step2[7] = WRAPLOW(step1[6] + step1[7]);
     478             : 
     479           0 :   step2[8] = step1[8];
     480           0 :   step2[15] = step1[15];
     481           0 :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
     482           0 :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
     483           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     484           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     485           0 :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
     486           0 :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
     487           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     488           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     489           0 :   step2[11] = step1[11];
     490           0 :   step2[12] = step1[12];
     491             : 
     492             :   // stage 5
     493           0 :   step1[0] = WRAPLOW(step2[0] + step2[3]);
     494           0 :   step1[1] = WRAPLOW(step2[1] + step2[2]);
     495           0 :   step1[2] = WRAPLOW(step2[1] - step2[2]);
     496           0 :   step1[3] = WRAPLOW(step2[0] - step2[3]);
     497           0 :   step1[4] = step2[4];
     498           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
     499           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
     500           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     501           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     502           0 :   step1[7] = step2[7];
     503             : 
     504           0 :   step1[8] = WRAPLOW(step2[8] + step2[11]);
     505           0 :   step1[9] = WRAPLOW(step2[9] + step2[10]);
     506           0 :   step1[10] = WRAPLOW(step2[9] - step2[10]);
     507           0 :   step1[11] = WRAPLOW(step2[8] - step2[11]);
     508           0 :   step1[12] = WRAPLOW(-step2[12] + step2[15]);
     509           0 :   step1[13] = WRAPLOW(-step2[13] + step2[14]);
     510           0 :   step1[14] = WRAPLOW(step2[13] + step2[14]);
     511           0 :   step1[15] = WRAPLOW(step2[12] + step2[15]);
     512             : 
     513             :   // stage 6
     514           0 :   step2[0] = WRAPLOW(step1[0] + step1[7]);
     515           0 :   step2[1] = WRAPLOW(step1[1] + step1[6]);
     516           0 :   step2[2] = WRAPLOW(step1[2] + step1[5]);
     517           0 :   step2[3] = WRAPLOW(step1[3] + step1[4]);
     518           0 :   step2[4] = WRAPLOW(step1[3] - step1[4]);
     519           0 :   step2[5] = WRAPLOW(step1[2] - step1[5]);
     520           0 :   step2[6] = WRAPLOW(step1[1] - step1[6]);
     521           0 :   step2[7] = WRAPLOW(step1[0] - step1[7]);
     522           0 :   step2[8] = step1[8];
     523           0 :   step2[9] = step1[9];
     524           0 :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
     525           0 :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
     526           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     527           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     528           0 :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
     529           0 :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
     530           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
     531           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
     532           0 :   step2[14] = step1[14];
     533           0 :   step2[15] = step1[15];
     534             : 
     535             :   // stage 7
     536           0 :   output[0] = WRAPLOW(step2[0] + step2[15]);
     537           0 :   output[1] = WRAPLOW(step2[1] + step2[14]);
     538           0 :   output[2] = WRAPLOW(step2[2] + step2[13]);
     539           0 :   output[3] = WRAPLOW(step2[3] + step2[12]);
     540           0 :   output[4] = WRAPLOW(step2[4] + step2[11]);
     541           0 :   output[5] = WRAPLOW(step2[5] + step2[10]);
     542           0 :   output[6] = WRAPLOW(step2[6] + step2[9]);
     543           0 :   output[7] = WRAPLOW(step2[7] + step2[8]);
     544           0 :   output[8] = WRAPLOW(step2[7] - step2[8]);
     545           0 :   output[9] = WRAPLOW(step2[6] - step2[9]);
     546           0 :   output[10] = WRAPLOW(step2[5] - step2[10]);
     547           0 :   output[11] = WRAPLOW(step2[4] - step2[11]);
     548           0 :   output[12] = WRAPLOW(step2[3] - step2[12]);
     549           0 :   output[13] = WRAPLOW(step2[2] - step2[13]);
     550           0 :   output[14] = WRAPLOW(step2[1] - step2[14]);
     551           0 :   output[15] = WRAPLOW(step2[0] - step2[15]);
     552           0 : }
     553             : 
                       // 16x16 inverse DCT over all 16 input rows, accumulated into dest.
                       //
                       // Pass 1 runs aom_idct16_c on each of the 16 consecutive 16-element rows
                       // of input and stores the results row by row in the local buffer out.
                       // Pass 2 gathers each column of out, runs aom_idct16_c on it, rounds
                       // every result with ROUND_POWER_OF_TWO(x, 6) and merges it into the
                       // matching dest element through clip_pixel_add.
                       //
                       //   input  - 256 coefficients, read as 16 rows of 16.
                       //   dest   - buffer that is read and updated in place; the element for
                       //            row j, column i is dest[j * stride + i].
                       //   stride - element distance between consecutive rows of dest.
     554           0 : void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
     555             :                              int stride) {
     556             :   tran_low_t out[16 * 16];
     557           0 :   tran_low_t *outptr = out;
     558             :   int i, j;
     559             :   tran_low_t temp_in[16], temp_out[16];
     560             : 
     561             :   // First transform rows: every row is written, so out needs no initializer.
     562           0 :   for (i = 0; i < 16; ++i) {
     563           0 :     aom_idct16_c(input, outptr);
     564           0 :     input += 16;
     565           0 :     outptr += 16;
     566             :   }
     567             : 
     568             :   // Then transform columns: column i of out is copied into temp_in so the
                         // 1-D transform can read it as a contiguous array.
     569           0 :   for (i = 0; i < 16; ++i) {
     570           0 :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
     571           0 :     aom_idct16_c(temp_in, temp_out);
     572           0 :     for (j = 0; j < 16; ++j) {
     573           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     574           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
     575             :     }
     576             :   }
     577           0 : }
     578             : 
                       // 16-point 1-D inverse transform (ADST, going by the function name).
                       //
                       // Reads input[0..15] in a fixed permuted order into x0..x15, then runs
                       // four stages. Each stage computes intermediate products s* (inputs
                       // multiplied by cospi_* constants) and folds them back into x* with
                       // add/subtract butterflies. Values that were multiplied go through
                       // dct_const_round_shift before WRAPLOW; pass-through values only go
                       // through WRAPLOW. The x*/s* variables are reused across stages, so the
                       // statement order matters.
                       //
                       //   input  - 16 coefficients (read only).
                       //   output - 16 results, written in a fixed permuted order with four of
                       //            them negated (see the final assignments).
     579           0 : void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
     580             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
     581             :   tran_high_t s9, s10, s11, s12, s13, s14, s15;
     582             : 
                         // Input permutation: even x indices take input[15], [13], ... [1];
                         // odd x indices take input[0], [2], ... [14].
     583           0 :   tran_high_t x0 = input[15];
     584           0 :   tran_high_t x1 = input[0];
     585           0 :   tran_high_t x2 = input[13];
     586           0 :   tran_high_t x3 = input[2];
     587           0 :   tran_high_t x4 = input[11];
     588           0 :   tran_high_t x5 = input[4];
     589           0 :   tran_high_t x6 = input[9];
     590           0 :   tran_high_t x7 = input[6];
     591           0 :   tran_high_t x8 = input[7];
     592           0 :   tran_high_t x9 = input[8];
     593           0 :   tran_high_t x10 = input[5];
     594           0 :   tran_high_t x11 = input[10];
     595           0 :   tran_high_t x12 = input[3];
     596           0 :   tran_high_t x13 = input[12];
     597           0 :   tran_high_t x14 = input[1];
     598           0 :   tran_high_t x15 = input[14];
     599             : 
                         // Shortcut: the bitwise OR is zero only when every input is zero, in
                         // which case all 16 outputs are set to zero and the stages are skipped.
     600           0 :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
     601           0 :         x13 | x14 | x15)) {
     602           0 :     output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
     603           0 :         output[6] = output[7] = output[8] = output[9] = output[10] =
     604           0 :             output[11] = output[12] = output[13] = output[14] = output[15] = 0;
     605           0 :     return;
     606             :   }
     607             : 
     608             :   // stage 1: rotate each adjacent pair (x0,x1), (x2,x3), ... (x14,x15) with
                         // its own pair of cospi constants, then butterfly s[k] with s[k + 8].
     609           0 :   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
     610           0 :   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
     611           0 :   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
     612           0 :   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
     613           0 :   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
     614           0 :   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
     615           0 :   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
     616           0 :   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
     617           0 :   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
     618           0 :   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
     619           0 :   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
     620           0 :   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
     621           0 :   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
     622           0 :   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
     623           0 :   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
     624           0 :   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
     625             : 
     626           0 :   x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
     627           0 :   x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
     628           0 :   x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
     629           0 :   x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
     630           0 :   x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
     631           0 :   x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
     632           0 :   x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
     633           0 :   x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
     634           0 :   x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
     635           0 :   x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
     636           0 :   x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
     637           0 :   x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
     638           0 :   x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
     639           0 :   x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
     640           0 :   x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
     641           0 :   x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
     642             : 
     643             :   // stage 2: x0..x7 pass through unscaled; x8..x15 are rotated in pairs.
                         // The butterfly then combines s[k] with s[k + 4] inside each half.
     644           0 :   s0 = x0;
     645           0 :   s1 = x1;
     646           0 :   s2 = x2;
     647           0 :   s3 = x3;
     648           0 :   s4 = x4;
     649           0 :   s5 = x5;
     650           0 :   s6 = x6;
     651           0 :   s7 = x7;
     652           0 :   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
     653           0 :   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
     654           0 :   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
     655           0 :   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
     656           0 :   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
     657           0 :   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
     658           0 :   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
     659           0 :   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
     660             : 
                         // s0..s7 were not multiplied in this stage, so no round shift is
                         // applied to the first eight results.
     661           0 :   x0 = WRAPLOW(s0 + s4);
     662           0 :   x1 = WRAPLOW(s1 + s5);
     663           0 :   x2 = WRAPLOW(s2 + s6);
     664           0 :   x3 = WRAPLOW(s3 + s7);
     665           0 :   x4 = WRAPLOW(s0 - s4);
     666           0 :   x5 = WRAPLOW(s1 - s5);
     667           0 :   x6 = WRAPLOW(s2 - s6);
     668           0 :   x7 = WRAPLOW(s3 - s7);
     669           0 :   x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
     670           0 :   x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
     671           0 :   x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
     672           0 :   x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
     673           0 :   x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
     674           0 :   x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
     675           0 :   x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
     676           0 :   x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
     677             : 
     678             :   // stage 3: within each group of eight, the lower four pass through and
                         // the upper four are rotated with cospi_8_64 / cospi_24_64. The
                         // butterfly combines s[k] with s[k + 2].
     679           0 :   s0 = x0;
     680           0 :   s1 = x1;
     681           0 :   s2 = x2;
     682           0 :   s3 = x3;
     683           0 :   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
     684           0 :   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
     685           0 :   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
     686           0 :   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
     687           0 :   s8 = x8;
     688           0 :   s9 = x9;
     689           0 :   s10 = x10;
     690           0 :   s11 = x11;
     691           0 :   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
     692           0 :   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
     693           0 :   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
     694           0 :   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
     695             : 
     696           0 :   x0 = WRAPLOW(s0 + s2);
     697           0 :   x1 = WRAPLOW(s1 + s3);
     698           0 :   x2 = WRAPLOW(s0 - s2);
     699           0 :   x3 = WRAPLOW(s1 - s3);
     700           0 :   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
     701           0 :   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
     702           0 :   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
     703           0 :   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
     704           0 :   x8 = WRAPLOW(s8 + s10);
     705           0 :   x9 = WRAPLOW(s9 + s11);
     706           0 :   x10 = WRAPLOW(s8 - s10);
     707           0 :   x11 = WRAPLOW(s9 - s11);
     708           0 :   x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
     709           0 :   x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
     710           0 :   x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
     711           0 :   x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
     712             : 
     713             :   // stage 4: only the pairs (x2,x3), (x6,x7), (x10,x11), (x14,x15) change;
                         // each becomes a cospi_16_64-scaled sum and difference (note the
                         // per-pair sign differences). The remaining x values carry over as is.
     714           0 :   s2 = (-cospi_16_64) * (x2 + x3);
     715           0 :   s3 = cospi_16_64 * (x2 - x3);
     716           0 :   s6 = cospi_16_64 * (x6 + x7);
     717           0 :   s7 = cospi_16_64 * (-x6 + x7);
     718           0 :   s10 = cospi_16_64 * (x10 + x11);
     719           0 :   s11 = cospi_16_64 * (-x10 + x11);
     720           0 :   s14 = (-cospi_16_64) * (x14 + x15);
     721           0 :   s15 = cospi_16_64 * (x14 - x15);
     722             : 
     723           0 :   x2 = WRAPLOW(dct_const_round_shift(s2));
     724           0 :   x3 = WRAPLOW(dct_const_round_shift(s3));
     725           0 :   x6 = WRAPLOW(dct_const_round_shift(s6));
     726           0 :   x7 = WRAPLOW(dct_const_round_shift(s7));
     727           0 :   x10 = WRAPLOW(dct_const_round_shift(s10));
     728           0 :   x11 = WRAPLOW(dct_const_round_shift(s11));
     729           0 :   x14 = WRAPLOW(dct_const_round_shift(s14));
     730           0 :   x15 = WRAPLOW(dct_const_round_shift(s15));
     731             : 
                         // Output permutation: x8, x4, x13 and x1 are negated on the way out.
     732           0 :   output[0] = WRAPLOW(x0);
     733           0 :   output[1] = WRAPLOW(-x8);
     734           0 :   output[2] = WRAPLOW(x12);
     735           0 :   output[3] = WRAPLOW(-x4);
     736           0 :   output[4] = WRAPLOW(x6);
     737           0 :   output[5] = WRAPLOW(x14);
     738           0 :   output[6] = WRAPLOW(x10);
     739           0 :   output[7] = WRAPLOW(x2);
     740           0 :   output[8] = WRAPLOW(x3);
     741           0 :   output[9] = WRAPLOW(x11);
     742           0 :   output[10] = WRAPLOW(x15);
     743           0 :   output[11] = WRAPLOW(x7);
     744           0 :   output[12] = WRAPLOW(x5);
     745           0 :   output[13] = WRAPLOW(-x13);
     746           0 :   output[14] = WRAPLOW(x9);
     747           0 :   output[15] = WRAPLOW(-x1);
     748             : }
     749             : 
                       // Reduced 16x16 inverse DCT that only row-transforms the first 8 input
                       // rows, then accumulates the result into dest.
                       //
                       // out starts zero-filled, so rows 8..15 (which are never written by the
                       // row pass) feed zeros into the column pass. The column pass still
                       // covers all 16 columns and all 16 output rows.
                       //
                       //   input  - coefficients; only the first 8 rows of 16 are read. Per the
                       //            comment below, callers are expected to use this only when
                       //            the non-zero coefficients lie in the upper-left 8x8 area.
                       //   dest   - buffer that is read and updated in place; the element for
                       //            row j, column i is dest[j * stride + i].
                       //   stride - element distance between consecutive rows of dest.
     750           0 : void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
     751             :                             int stride) {
     752             :   int i, j;
     753           0 :   tran_low_t out[16 * 16] = { 0 };
     754           0 :   tran_low_t *outptr = out;
     755             :   tran_low_t temp_in[16], temp_out[16];
     756             : 
     757             :   // First transform rows. Since all non-zero dct coefficients are in
     758             :   // upper-left 8x8 area, we only need to calculate first 8 rows here.
     759           0 :   for (i = 0; i < 8; ++i) {
     760           0 :     aom_idct16_c(input, outptr);
     761           0 :     input += 16;
     762           0 :     outptr += 16;
     763             :   }
     764             : 
     765             :   // Then transform columns: gather column i of out, transform it, round by
                         // 6 bits and add into dest with clipping.
     766           0 :   for (i = 0; i < 16; ++i) {
     767           0 :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
     768           0 :     aom_idct16_c(temp_in, temp_out);
     769           0 :     for (j = 0; j < 16; ++j) {
     770           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     771           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
     772             :     }
     773             :   }
     774           0 : }
     775             : 
                       // Reduced 16x16 inverse DCT that only row-transforms the first 4 input
                       // rows, then accumulates the result into dest.
                       //
                       // out starts zero-filled, so rows 4..15 (which are never written by the
                       // row pass) feed zeros into the column pass. The column pass still
                       // covers all 16 columns and all 16 output rows.
                       //
                       //   input  - coefficients; only the first 4 rows of 16 are read. Per the
                       //            comment below, callers are expected to use this only when
                       //            the non-zero coefficients lie in the upper-left 4x4 area.
                       //   dest   - buffer that is read and updated in place; the element for
                       //            row j, column i is dest[j * stride + i].
                       //   stride - element distance between consecutive rows of dest.
     776           0 : void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
     777             :                             int stride) {
     778           0 :   tran_low_t out[16 * 16] = { 0 };
     779           0 :   tran_low_t *outptr = out;
     780             :   int i, j;
     781             :   tran_low_t temp_in[16], temp_out[16];
     782             : 
     783             :   // First transform rows. Since all non-zero dct coefficients are in
     784             :   // upper-left 4x4 area, we only need to calculate first 4 rows here.
     785           0 :   for (i = 0; i < 4; ++i) {
     786           0 :     aom_idct16_c(input, outptr);
     787           0 :     input += 16;
     788           0 :     outptr += 16;
     789             :   }
     790             : 
     791             :   // Then transform columns: gather column i of out, transform it, round by
                         // 6 bits and add into dest with clipping.
     792           0 :   for (i = 0; i < 16; ++i) {
     793           0 :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
     794           0 :     aom_idct16_c(temp_in, temp_out);
     795           0 :     for (j = 0; j < 16; ++j) {
     796           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     797           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
     798             :     }
     799             :   }
     800           0 : }
     801             : 
                       // 16x16 inverse transform for the case where only input[0] is used.
                       //
                       // input[0] is scaled by cospi_16_64 and round-shifted twice (once per
                       // transform pass), then rounded by 6 bits to give a single offset a1.
                       // That offset is added, with clipping, to every element of the 16x16
                       // area of dest. No other input element is read.
                       //
                       //   input  - coefficients; only input[0] is read.
                       //   dest   - buffer that is read and updated in place, 16 rows of 16.
                       //   stride - element distance between consecutive rows of dest.
     802           0 : void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     803             :   int i, j;
     804             :   tran_high_t a1;
     805           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
     806           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
     807           0 :   a1 = ROUND_POWER_OF_TWO(out, 6);
                         // Early out: when the rounded offset is zero the pixel loop is skipped.
     808           0 :   if (a1 == 0) return;
                         // dest is advanced by one row per outer iteration, so dest[i] always
                         // addresses column i of the current row.
     809           0 :   for (j = 0; j < 16; ++j) {
     810           0 :     for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
     811           0 :     dest += stride;
     812             :   }
     813             : }
     814             : 
     815           0 : void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
     816             :   tran_low_t step1[32], step2[32];
     817             :   tran_high_t temp1, temp2;
     818             : 
     819             :   // stage 1
     820           0 :   step1[0] = input[0];
     821           0 :   step1[1] = input[16];
     822           0 :   step1[2] = input[8];
     823           0 :   step1[3] = input[24];
     824           0 :   step1[4] = input[4];
     825           0 :   step1[5] = input[20];
     826           0 :   step1[6] = input[12];
     827           0 :   step1[7] = input[28];
     828           0 :   step1[8] = input[2];
     829           0 :   step1[9] = input[18];
     830           0 :   step1[10] = input[10];
     831           0 :   step1[11] = input[26];
     832           0 :   step1[12] = input[6];
     833           0 :   step1[13] = input[22];
     834           0 :   step1[14] = input[14];
     835           0 :   step1[15] = input[30];
     836             : 
     837           0 :   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
     838           0 :   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
     839           0 :   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
     840           0 :   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
     841             : 
     842           0 :   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
     843           0 :   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
     844           0 :   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
     845           0 :   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
     846             : 
     847           0 :   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
     848           0 :   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
     849           0 :   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
     850           0 :   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
     851             : 
     852           0 :   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
     853           0 :   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
     854           0 :   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
     855           0 :   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
     856             : 
     857           0 :   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
     858           0 :   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
     859           0 :   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
     860           0 :   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
     861             : 
     862           0 :   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
     863           0 :   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
     864           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
     865           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
     866             : 
     867           0 :   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
     868           0 :   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
     869           0 :   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
     870           0 :   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
     871             : 
     872           0 :   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
     873           0 :   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
     874           0 :   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
     875           0 :   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
     876             : 
     877             :   // stage 2
     878           0 :   step2[0] = step1[0];
     879           0 :   step2[1] = step1[1];
     880           0 :   step2[2] = step1[2];
     881           0 :   step2[3] = step1[3];
     882           0 :   step2[4] = step1[4];
     883           0 :   step2[5] = step1[5];
     884           0 :   step2[6] = step1[6];
     885           0 :   step2[7] = step1[7];
     886             : 
     887           0 :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
     888           0 :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
     889           0 :   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
     890           0 :   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
     891             : 
     892           0 :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
     893           0 :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
     894           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     895           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     896             : 
     897           0 :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
     898           0 :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
     899           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     900           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     901             : 
     902           0 :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
     903           0 :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
     904           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
     905           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
     906             : 
     907           0 :   step2[16] = WRAPLOW(step1[16] + step1[17]);
     908           0 :   step2[17] = WRAPLOW(step1[16] - step1[17]);
     909           0 :   step2[18] = WRAPLOW(-step1[18] + step1[19]);
     910           0 :   step2[19] = WRAPLOW(step1[18] + step1[19]);
     911           0 :   step2[20] = WRAPLOW(step1[20] + step1[21]);
     912           0 :   step2[21] = WRAPLOW(step1[20] - step1[21]);
     913           0 :   step2[22] = WRAPLOW(-step1[22] + step1[23]);
     914           0 :   step2[23] = WRAPLOW(step1[22] + step1[23]);
     915           0 :   step2[24] = WRAPLOW(step1[24] + step1[25]);
     916           0 :   step2[25] = WRAPLOW(step1[24] - step1[25]);
     917           0 :   step2[26] = WRAPLOW(-step1[26] + step1[27]);
     918           0 :   step2[27] = WRAPLOW(step1[26] + step1[27]);
     919           0 :   step2[28] = WRAPLOW(step1[28] + step1[29]);
     920           0 :   step2[29] = WRAPLOW(step1[28] - step1[29]);
     921           0 :   step2[30] = WRAPLOW(-step1[30] + step1[31]);
     922           0 :   step2[31] = WRAPLOW(step1[30] + step1[31]);
     923             : 
     924             :   // stage 3
     925           0 :   step1[0] = step2[0];
     926           0 :   step1[1] = step2[1];
     927           0 :   step1[2] = step2[2];
     928           0 :   step1[3] = step2[3];
     929             : 
     930           0 :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
     931           0 :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
     932           0 :   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
     933           0 :   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
     934           0 :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
     935           0 :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
     936           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     937           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     938             : 
     939           0 :   step1[8] = WRAPLOW(step2[8] + step2[9]);
     940           0 :   step1[9] = WRAPLOW(step2[8] - step2[9]);
     941           0 :   step1[10] = WRAPLOW(-step2[10] + step2[11]);
     942           0 :   step1[11] = WRAPLOW(step2[10] + step2[11]);
     943           0 :   step1[12] = WRAPLOW(step2[12] + step2[13]);
     944           0 :   step1[13] = WRAPLOW(step2[12] - step2[13]);
     945           0 :   step1[14] = WRAPLOW(-step2[14] + step2[15]);
     946           0 :   step1[15] = WRAPLOW(step2[14] + step2[15]);
     947             : 
     948           0 :   step1[16] = step2[16];
     949           0 :   step1[31] = step2[31];
     950           0 :   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
     951           0 :   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
     952           0 :   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
     953           0 :   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
     954           0 :   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
     955           0 :   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
     956           0 :   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
     957           0 :   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
     958           0 :   step1[19] = step2[19];
     959           0 :   step1[20] = step2[20];
     960           0 :   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
     961           0 :   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
     962           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
     963           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
     964           0 :   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
     965           0 :   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
     966           0 :   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
     967           0 :   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
     968           0 :   step1[23] = step2[23];
     969           0 :   step1[24] = step2[24];
     970           0 :   step1[27] = step2[27];
     971           0 :   step1[28] = step2[28];
     972             : 
     973             :   // stage 4
     974           0 :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
     975           0 :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
     976           0 :   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
     977           0 :   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
     978           0 :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
     979           0 :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
     980           0 :   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
     981           0 :   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
     982           0 :   step2[4] = WRAPLOW(step1[4] + step1[5]);
     983           0 :   step2[5] = WRAPLOW(step1[4] - step1[5]);
     984           0 :   step2[6] = WRAPLOW(-step1[6] + step1[7]);
     985           0 :   step2[7] = WRAPLOW(step1[6] + step1[7]);
     986             : 
     987           0 :   step2[8] = step1[8];
     988           0 :   step2[15] = step1[15];
     989           0 :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
     990           0 :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
     991           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     992           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     993           0 :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
     994           0 :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
     995           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     996           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     997           0 :   step2[11] = step1[11];
     998           0 :   step2[12] = step1[12];
     999             : 
    1000           0 :   step2[16] = WRAPLOW(step1[16] + step1[19]);
    1001           0 :   step2[17] = WRAPLOW(step1[17] + step1[18]);
    1002           0 :   step2[18] = WRAPLOW(step1[17] - step1[18]);
    1003           0 :   step2[19] = WRAPLOW(step1[16] - step1[19]);
    1004           0 :   step2[20] = WRAPLOW(-step1[20] + step1[23]);
    1005           0 :   step2[21] = WRAPLOW(-step1[21] + step1[22]);
    1006           0 :   step2[22] = WRAPLOW(step1[21] + step1[22]);
    1007           0 :   step2[23] = WRAPLOW(step1[20] + step1[23]);
    1008             : 
    1009           0 :   step2[24] = WRAPLOW(step1[24] + step1[27]);
    1010           0 :   step2[25] = WRAPLOW(step1[25] + step1[26]);
    1011           0 :   step2[26] = WRAPLOW(step1[25] - step1[26]);
    1012           0 :   step2[27] = WRAPLOW(step1[24] - step1[27]);
    1013           0 :   step2[28] = WRAPLOW(-step1[28] + step1[31]);
    1014           0 :   step2[29] = WRAPLOW(-step1[29] + step1[30]);
    1015           0 :   step2[30] = WRAPLOW(step1[29] + step1[30]);
    1016           0 :   step2[31] = WRAPLOW(step1[28] + step1[31]);
    1017             : 
    1018             :   // stage 5
    1019           0 :   step1[0] = WRAPLOW(step2[0] + step2[3]);
    1020           0 :   step1[1] = WRAPLOW(step2[1] + step2[2]);
    1021           0 :   step1[2] = WRAPLOW(step2[1] - step2[2]);
    1022           0 :   step1[3] = WRAPLOW(step2[0] - step2[3]);
    1023           0 :   step1[4] = step2[4];
    1024           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    1025           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    1026           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
    1027           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
    1028           0 :   step1[7] = step2[7];
    1029             : 
    1030           0 :   step1[8] = WRAPLOW(step2[8] + step2[11]);
    1031           0 :   step1[9] = WRAPLOW(step2[9] + step2[10]);
    1032           0 :   step1[10] = WRAPLOW(step2[9] - step2[10]);
    1033           0 :   step1[11] = WRAPLOW(step2[8] - step2[11]);
    1034           0 :   step1[12] = WRAPLOW(-step2[12] + step2[15]);
    1035           0 :   step1[13] = WRAPLOW(-step2[13] + step2[14]);
    1036           0 :   step1[14] = WRAPLOW(step2[13] + step2[14]);
    1037           0 :   step1[15] = WRAPLOW(step2[12] + step2[15]);
    1038             : 
    1039           0 :   step1[16] = step2[16];
    1040           0 :   step1[17] = step2[17];
    1041           0 :   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
    1042           0 :   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
    1043           0 :   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
    1044           0 :   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
    1045           0 :   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
    1046           0 :   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
    1047           0 :   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
    1048           0 :   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
    1049           0 :   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
    1050           0 :   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
    1051           0 :   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
    1052           0 :   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
    1053           0 :   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
    1054           0 :   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
    1055           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
    1056           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
    1057           0 :   step1[22] = step2[22];
    1058           0 :   step1[23] = step2[23];
    1059           0 :   step1[24] = step2[24];
    1060           0 :   step1[25] = step2[25];
    1061           0 :   step1[30] = step2[30];
    1062           0 :   step1[31] = step2[31];
    1063             : 
    1064             :   // stage 6
    1065           0 :   step2[0] = WRAPLOW(step1[0] + step1[7]);
    1066           0 :   step2[1] = WRAPLOW(step1[1] + step1[6]);
    1067           0 :   step2[2] = WRAPLOW(step1[2] + step1[5]);
    1068           0 :   step2[3] = WRAPLOW(step1[3] + step1[4]);
    1069           0 :   step2[4] = WRAPLOW(step1[3] - step1[4]);
    1070           0 :   step2[5] = WRAPLOW(step1[2] - step1[5]);
    1071           0 :   step2[6] = WRAPLOW(step1[1] - step1[6]);
    1072           0 :   step2[7] = WRAPLOW(step1[0] - step1[7]);
    1073           0 :   step2[8] = step1[8];
    1074           0 :   step2[9] = step1[9];
    1075           0 :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    1076           0 :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    1077           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    1078           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    1079           0 :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    1080           0 :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    1081           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
    1082           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
    1083           0 :   step2[14] = step1[14];
    1084           0 :   step2[15] = step1[15];
    1085             : 
    1086           0 :   step2[16] = WRAPLOW(step1[16] + step1[23]);
    1087           0 :   step2[17] = WRAPLOW(step1[17] + step1[22]);
    1088           0 :   step2[18] = WRAPLOW(step1[18] + step1[21]);
    1089           0 :   step2[19] = WRAPLOW(step1[19] + step1[20]);
    1090           0 :   step2[20] = WRAPLOW(step1[19] - step1[20]);
    1091           0 :   step2[21] = WRAPLOW(step1[18] - step1[21]);
    1092           0 :   step2[22] = WRAPLOW(step1[17] - step1[22]);
    1093           0 :   step2[23] = WRAPLOW(step1[16] - step1[23]);
    1094             : 
    1095           0 :   step2[24] = WRAPLOW(-step1[24] + step1[31]);
    1096           0 :   step2[25] = WRAPLOW(-step1[25] + step1[30]);
    1097           0 :   step2[26] = WRAPLOW(-step1[26] + step1[29]);
    1098           0 :   step2[27] = WRAPLOW(-step1[27] + step1[28]);
    1099           0 :   step2[28] = WRAPLOW(step1[27] + step1[28]);
    1100           0 :   step2[29] = WRAPLOW(step1[26] + step1[29]);
    1101           0 :   step2[30] = WRAPLOW(step1[25] + step1[30]);
    1102           0 :   step2[31] = WRAPLOW(step1[24] + step1[31]);
    1103             : 
    1104             :   // stage 7
    1105           0 :   step1[0] = WRAPLOW(step2[0] + step2[15]);
    1106           0 :   step1[1] = WRAPLOW(step2[1] + step2[14]);
    1107           0 :   step1[2] = WRAPLOW(step2[2] + step2[13]);
    1108           0 :   step1[3] = WRAPLOW(step2[3] + step2[12]);
    1109           0 :   step1[4] = WRAPLOW(step2[4] + step2[11]);
    1110           0 :   step1[5] = WRAPLOW(step2[5] + step2[10]);
    1111           0 :   step1[6] = WRAPLOW(step2[6] + step2[9]);
    1112           0 :   step1[7] = WRAPLOW(step2[7] + step2[8]);
    1113           0 :   step1[8] = WRAPLOW(step2[7] - step2[8]);
    1114           0 :   step1[9] = WRAPLOW(step2[6] - step2[9]);
    1115           0 :   step1[10] = WRAPLOW(step2[5] - step2[10]);
    1116           0 :   step1[11] = WRAPLOW(step2[4] - step2[11]);
    1117           0 :   step1[12] = WRAPLOW(step2[3] - step2[12]);
    1118           0 :   step1[13] = WRAPLOW(step2[2] - step2[13]);
    1119           0 :   step1[14] = WRAPLOW(step2[1] - step2[14]);
    1120           0 :   step1[15] = WRAPLOW(step2[0] - step2[15]);
    1121             : 
    1122           0 :   step1[16] = step2[16];
    1123           0 :   step1[17] = step2[17];
    1124           0 :   step1[18] = step2[18];
    1125           0 :   step1[19] = step2[19];
    1126           0 :   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
    1127           0 :   temp2 = (step2[20] + step2[27]) * cospi_16_64;
    1128           0 :   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
    1129           0 :   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
    1130           0 :   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
    1131           0 :   temp2 = (step2[21] + step2[26]) * cospi_16_64;
    1132           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
    1133           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
    1134           0 :   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
    1135           0 :   temp2 = (step2[22] + step2[25]) * cospi_16_64;
    1136           0 :   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
    1137           0 :   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
    1138           0 :   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
    1139           0 :   temp2 = (step2[23] + step2[24]) * cospi_16_64;
    1140           0 :   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
    1141           0 :   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
    1142           0 :   step1[28] = step2[28];
    1143           0 :   step1[29] = step2[29];
    1144           0 :   step1[30] = step2[30];
    1145           0 :   step1[31] = step2[31];
    1146             : 
    1147             :   // final stage
    1148           0 :   output[0] = WRAPLOW(step1[0] + step1[31]);
    1149           0 :   output[1] = WRAPLOW(step1[1] + step1[30]);
    1150           0 :   output[2] = WRAPLOW(step1[2] + step1[29]);
    1151           0 :   output[3] = WRAPLOW(step1[3] + step1[28]);
    1152           0 :   output[4] = WRAPLOW(step1[4] + step1[27]);
    1153           0 :   output[5] = WRAPLOW(step1[5] + step1[26]);
    1154           0 :   output[6] = WRAPLOW(step1[6] + step1[25]);
    1155           0 :   output[7] = WRAPLOW(step1[7] + step1[24]);
    1156           0 :   output[8] = WRAPLOW(step1[8] + step1[23]);
    1157           0 :   output[9] = WRAPLOW(step1[9] + step1[22]);
    1158           0 :   output[10] = WRAPLOW(step1[10] + step1[21]);
    1159           0 :   output[11] = WRAPLOW(step1[11] + step1[20]);
    1160           0 :   output[12] = WRAPLOW(step1[12] + step1[19]);
    1161           0 :   output[13] = WRAPLOW(step1[13] + step1[18]);
    1162           0 :   output[14] = WRAPLOW(step1[14] + step1[17]);
    1163           0 :   output[15] = WRAPLOW(step1[15] + step1[16]);
    1164           0 :   output[16] = WRAPLOW(step1[15] - step1[16]);
    1165           0 :   output[17] = WRAPLOW(step1[14] - step1[17]);
    1166           0 :   output[18] = WRAPLOW(step1[13] - step1[18]);
    1167           0 :   output[19] = WRAPLOW(step1[12] - step1[19]);
    1168           0 :   output[20] = WRAPLOW(step1[11] - step1[20]);
    1169           0 :   output[21] = WRAPLOW(step1[10] - step1[21]);
    1170           0 :   output[22] = WRAPLOW(step1[9] - step1[22]);
    1171           0 :   output[23] = WRAPLOW(step1[8] - step1[23]);
    1172           0 :   output[24] = WRAPLOW(step1[7] - step1[24]);
    1173           0 :   output[25] = WRAPLOW(step1[6] - step1[25]);
    1174           0 :   output[26] = WRAPLOW(step1[5] - step1[26]);
    1175           0 :   output[27] = WRAPLOW(step1[4] - step1[27]);
    1176           0 :   output[28] = WRAPLOW(step1[3] - step1[28]);
    1177           0 :   output[29] = WRAPLOW(step1[2] - step1[29]);
    1178           0 :   output[30] = WRAPLOW(step1[1] - step1[30]);
    1179           0 :   output[31] = WRAPLOW(step1[0] - step1[31]);
    1180           0 : }
    1181             : 
    1182           0 : void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
    1183             :                               int stride) {
                         // Full 32x32 inverse transform: a row pass over all 32 rows of `input`
                         // into the scratch buffer `out`, then a column pass whose results are
                         // rounded and combined into `dest` (rows of `dest` are `stride` apart).
    1184             :   tran_low_t out[32 * 32];
    1185           0 :   tran_low_t *outptr = out;
    1186             :   int i, j;
    1187             :   tran_low_t temp_in[32], temp_out[32];
    1188             : 
    1189             :   // Rows
    1190           0 :   for (i = 0; i < 32; ++i) {
                           // OR-reduce the 32 coefficients of this row (32 -> 16 -> 8 -> 4 -> 2)
                           // so an all-zero row can skip the 1-D transform.
                           // NOTE(review): the accumulator is int16_t while the inputs are
                           // tran_low_t; if tran_low_t is wider than 16 bits the OR is truncated
                           // on assignment -- confirm coefficients always fit in 16 bits here.
    1191             :     int16_t zero_coeff[16];
    1192           0 :     for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    1193           0 :     for (j = 0; j < 8; ++j)
    1194           0 :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    1195           0 :     for (j = 0; j < 4; ++j)
    1196           0 :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    1197           0 :     for (j = 0; j < 2; ++j)
    1198           0 :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    1199             : 
                           // Any non-zero bit: transform the row; otherwise write 32 zeros.
    1200           0 :     if (zero_coeff[0] | zero_coeff[1])
    1201           0 :       aom_idct32_c(input, outptr);
    1202             :     else
    1203           0 :       memset(outptr, 0, sizeof(tran_low_t) * 32);
    1204           0 :     input += 32;
    1205           0 :     outptr += 32;
    1206             :   }
    1207             : 
    1208             :   // Columns
    1209           0 :   for (i = 0; i < 32; ++i) {
                           // Gather column i of the row-pass result into a contiguous buffer.
    1210           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    1211           0 :     aom_idct32_c(temp_in, temp_out);
                           // Scale each result with ROUND_POWER_OF_TWO(., 6) and merge it into
                           // the existing pixel via clip_pixel_add (presumably a clamped add).
    1212           0 :     for (j = 0; j < 32; ++j) {
    1213           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    1214           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    1215             :     }
    1216             :   }
    1217           0 : }
    1218             : 
    1219           0 : void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
    1220             :                              int stride) {
                         // Reduced 32x32 inverse transform: only the first 16 rows of `input`
                         // go through the row pass; rows 16..31 of `out` keep the zeros from
                         // the initializer. The column pass still covers all 32 columns.
    1221           0 :   tran_low_t out[32 * 32] = { 0 };
    1222           0 :   tran_low_t *outptr = out;
    1223             :   int i, j;
    1224             :   tran_low_t temp_in[32], temp_out[32];
    1225             : 
    1226             :   // Rows
    1227             :   // only upper-left 16x16 has non-zero coeff
    1228           0 :   for (i = 0; i < 16; ++i) {
    1229           0 :     aom_idct32_c(input, outptr);
    1230           0 :     input += 32;
    1231           0 :     outptr += 32;
    1232             :   }
    1233             : 
    1234             :   // Columns
    1235           0 :   for (i = 0; i < 32; ++i) {
                           // Gather column i, transform it, then round with
                           // ROUND_POWER_OF_TWO(., 6) and merge into dest via clip_pixel_add.
    1236           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    1237           0 :     aom_idct32_c(temp_in, temp_out);
    1238           0 :     for (j = 0; j < 32; ++j) {
    1239           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    1240           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    1241             :     }
    1242             :   }
    1243           0 : }
    1244             : 
    1245           0 : void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
    1246             :                             int stride) {
                         // Reduced 32x32 inverse transform: only the first 8 rows of `input`
                         // go through the row pass; rows 8..31 of `out` keep the zeros from
                         // the initializer. The column pass still covers all 32 columns.
    1247           0 :   tran_low_t out[32 * 32] = { 0 };
    1248           0 :   tran_low_t *outptr = out;
    1249             :   int i, j;
    1250             :   tran_low_t temp_in[32], temp_out[32];
    1251             : 
    1252             :   // Rows
    1253             :   // only upper-left 8x8 has non-zero coeff
    1254           0 :   for (i = 0; i < 8; ++i) {
    1255           0 :     aom_idct32_c(input, outptr);
    1256           0 :     input += 32;
    1257           0 :     outptr += 32;
    1258             :   }
    1259             : 
    1260             :   // Columns
    1261           0 :   for (i = 0; i < 32; ++i) {
                           // Gather column i, transform it, then round with
                           // ROUND_POWER_OF_TWO(., 6) and merge into dest via clip_pixel_add.
    1262           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    1263           0 :     aom_idct32_c(temp_in, temp_out);
    1264           0 :     for (j = 0; j < 32; ++j) {
    1265           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    1266           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    1267             :     }
    1268             :   }
    1269           0 : }
    1270             : 
    1271           0 : void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
                         // DC-only 32x32 path: only input[0] is read. It is scaled by
                         // cospi_16_64 twice (one scaling per 1-D pass), rounded with
                         // ROUND_POWER_OF_TWO(., 6), and the resulting constant is applied to
                         // every pixel of the 32x32 block.
    1272             :   int i, j;
    1273             :   tran_high_t a1;
    1274             : 
    1275           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
    1276           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
    1277           0 :   a1 = ROUND_POWER_OF_TWO(out, 6);
                         // A zero offset would leave dest unchanged, so skip the pixel loop.
    1278           0 :   if (a1 == 0) return;
    1279             : 
                         // Outer loop walks 32 rows (dest advances by stride), inner loop the
                         // 32 pixels of the row.
    1280           0 :   for (j = 0; j < 32; ++j) {
    1281           0 :     for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    1282           0 :     dest += stride;
    1283             :   }
    1284             : }
    1285             : 
    1286             : #if CONFIG_HIGHBITDEPTH
    1287           0 : void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    1288             :                                  int stride, int bd) {
    1289             :   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    1290             :      0.5 shifts per pixel. */
                         /* High-bit-depth variant: dest8 is converted with CONVERT_TO_SHORTPTR
                            and accessed as uint16_t pixels; `bd` is forwarded to
                            HIGHBD_WRAPLOW and highbd_clip_pixel_add. The transform runs in
                            two passes over a 4x4 block: groups of four consecutive inputs
                            first, then stride-4 reads of the intermediate `output`. */
    1291             :   int i;
    1292             :   tran_low_t output[16];
    1293             :   tran_high_t a1, b1, c1, d1, e1;
    1294           0 :   const tran_low_t *ip = input;
    1295           0 :   tran_low_t *op = output;
    1296           0 :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    1297             : 
                         // Pass 1: four consecutive coefficients per iteration, each shifted
                         // right by UNIT_QUANT_SHIFT before the lifting steps. Note the load
                         // order: a1, c1, d1, b1 come from ip[0], ip[1], ip[2], ip[3].
    1298           0 :   for (i = 0; i < 4; i++) {
    1299           0 :     a1 = ip[0] >> UNIT_QUANT_SHIFT;
    1300           0 :     c1 = ip[1] >> UNIT_QUANT_SHIFT;
    1301           0 :     d1 = ip[2] >> UNIT_QUANT_SHIFT;
    1302           0 :     b1 = ip[3] >> UNIT_QUANT_SHIFT;
    1303           0 :     a1 += c1;
    1304           0 :     d1 -= b1;
    1305           0 :     e1 = (a1 - d1) >> 1;
    1306           0 :     b1 = e1 - b1;
    1307           0 :     c1 = e1 - c1;
    1308           0 :     a1 -= b1;
    1309           0 :     d1 += c1;
    1310           0 :     op[0] = HIGHBD_WRAPLOW(a1, bd);
    1311           0 :     op[1] = HIGHBD_WRAPLOW(b1, bd);
    1312           0 :     op[2] = HIGHBD_WRAPLOW(c1, bd);
    1313           0 :     op[3] = HIGHBD_WRAPLOW(d1, bd);
    1314           0 :     ip += 4;
    1315           0 :     op += 4;
    1316             :   }
    1317             : 
                         // Pass 2: same lifting steps on the intermediate block, read with a
                         // stride of 4 (ip advances by one element per iteration). Results go
                         // to four dest rows at the current position; dest advances by one
                         // pixel per iteration.
    1318           0 :   ip = output;
    1319           0 :   for (i = 0; i < 4; i++) {
    1320           0 :     a1 = ip[4 * 0];
    1321           0 :     c1 = ip[4 * 1];
    1322           0 :     d1 = ip[4 * 2];
    1323           0 :     b1 = ip[4 * 3];
    1324           0 :     a1 += c1;
    1325           0 :     d1 -= b1;
    1326           0 :     e1 = (a1 - d1) >> 1;
    1327           0 :     b1 = e1 - b1;
    1328           0 :     c1 = e1 - c1;
    1329           0 :     a1 -= b1;
    1330           0 :     d1 += c1;
    1331           0 :     dest[stride * 0] =
    1332           0 :         highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    1333           0 :     dest[stride * 1] =
    1334           0 :         highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    1335           0 :     dest[stride * 2] =
    1336           0 :         highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    1337           0 :     dest[stride * 3] =
    1338           0 :         highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
    1339             : 
    1340           0 :     ip++;
    1341           0 :     dest++;
    1342             :   }
    1343           0 : }
    1344             : 
    1345           0 : void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
    1346             :                                 int dest_stride, int bd) {
                         // Single-coefficient 4x4 inverse WHT for high bit depth: only in[0]
                         // is read. dest8 is converted with CONVERT_TO_SHORTPTR and accessed
                         // as uint16_t pixels.
    1347             :   int i;
    1348             :   tran_high_t a1, e1;
    1349             :   tran_low_t tmp[4];
    1350           0 :   const tran_low_t *ip = in;
    1351           0 :   tran_low_t *op = tmp;
    1352           0 :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
                         // NOTE(review): bd is also used below; this cast presumably silences
                         // an unused-parameter warning in builds where HIGHBD_WRAPLOW ignores
                         // its second argument -- confirm against the macro definition.
    1353             :   (void)bd;
    1354             : 
                         // First pass: split the shifted coefficient into e1 = a1 >> 1 and the
                         // remainder a1 - e1; tmp[0] gets the remainder, tmp[1..3] get e1.
    1355           0 :   a1 = ip[0] >> UNIT_QUANT_SHIFT;
    1356           0 :   e1 = a1 >> 1;
    1357           0 :   a1 -= e1;
    1358           0 :   op[0] = HIGHBD_WRAPLOW(a1, bd);
    1359           0 :   op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
    1360             : 
                         // Second pass: for each tmp element apply the same split; the
                         // remainder goes to dest row 0 and e1 to rows 1..3 at the current
                         // position, then ip and dest each advance by one element.
    1361           0 :   ip = tmp;
    1362           0 :   for (i = 0; i < 4; i++) {
    1363           0 :     e1 = ip[0] >> 1;
    1364           0 :     a1 = ip[0] - e1;
    1365           0 :     dest[dest_stride * 0] =
    1366           0 :         highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
    1367           0 :     dest[dest_stride * 1] =
    1368           0 :         highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
    1369           0 :     dest[dest_stride * 2] =
    1370           0 :         highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
    1371           0 :     dest[dest_stride * 3] =
    1372           0 :         highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
    1373           0 :     ip++;
    1374           0 :     dest++;
    1375             :   }
    1376           0 : }
    1377             : 
    1378           0 : void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
                         // 4-point 1-D inverse DCT (high bit depth). Reads input[0..3] and
                         // writes output[0..3]. Every input is consumed in stage 1 before any
                         // output element is written, so input and output may be the same
                         // array.
    1379             :   tran_low_t step[4];
    1380             :   tran_high_t temp1, temp2;
                         // NOTE(review): bd is used below; presumably kept for builds where
                         // HIGHBD_WRAPLOW ignores its second argument -- confirm.
    1381             :   (void)bd;
    1382             :   // stage 1
                         // Even pair (input[0], input[2]): sum/difference scaled by
                         // cospi_16_64. Odd pair (input[1], input[3]): rotation using
                         // cospi_24_64 and cospi_8_64.
    1383           0 :   temp1 = (input[0] + input[2]) * cospi_16_64;
    1384           0 :   temp2 = (input[0] - input[2]) * cospi_16_64;
    1385           0 :   step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1386           0 :   step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1387           0 :   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
    1388           0 :   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
    1389           0 :   step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1390           0 :   step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1391             : 
    1392             :   // stage 2
                         // Butterfly: outputs 0/3 combine step[0] with step[3], outputs 1/2
                         // combine step[1] with step[2].
    1393           0 :   output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
    1394           0 :   output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
    1395           0 :   output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
    1396           0 :   output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
    1397           0 : }
    1398             : 
    1399           0 : void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    1400             :                                  int stride, int bd) {
                         // Full 4x4 inverse DCT for high bit depth: row pass into `out`, then
                         // a column pass whose results are rounded and merged into the
                         // uint16_t pixels obtained from dest8 via CONVERT_TO_SHORTPTR.
    1401             :   tran_low_t out[4 * 4];
    1402           0 :   tran_low_t *outptr = out;
    1403             :   int i, j;
    1404             :   tran_low_t temp_in[4], temp_out[4];
    1405           0 :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    1406             : 
    1407             :   // Rows
    1408           0 :   for (i = 0; i < 4; ++i) {
    1409           0 :     aom_highbd_idct4_c(input, outptr, bd);
    1410           0 :     input += 4;
    1411           0 :     outptr += 4;
    1412             :   }
    1413             : 
    1414             :   // Columns
    1415           0 :   for (i = 0; i < 4; ++i) {
                           // Gather column i, transform it, then round with
                           // ROUND_POWER_OF_TWO(., 4) and merge into dest via
                           // highbd_clip_pixel_add.
    1416           0 :     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    1417           0 :     aom_highbd_idct4_c(temp_in, temp_out, bd);
    1418           0 :     for (j = 0; j < 4; ++j) {
    1419           0 :       dest[j * stride + i] = highbd_clip_pixel_add(
    1420           0 :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    1421             :     }
    1422             :   }
    1423           0 : }
    1424             : 
    1425           0 : void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
    1426             :                                 int dest_stride, int bd) {
                         // DC-only 4x4 path for high bit depth: only input[0] is read. It is
                         // scaled by cospi_16_64 twice (one scaling per 1-D pass), rounded
                         // with ROUND_POWER_OF_TWO(., 4), and applied to all 16 pixels.
    1427             :   int i;
    1428             :   tran_high_t a1;
    1429           0 :   tran_low_t out =
    1430           0 :       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
    1431           0 :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    1432             : 
    1433           0 :   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
    1434           0 :   a1 = ROUND_POWER_OF_TWO(out, 4);
    1435             : 
                         // One iteration per row: four pixels updated, then dest advances by
                         // dest_stride. There is no early exit when a1 is zero.
    1436           0 :   for (i = 0; i < 4; i++) {
    1437           0 :     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
    1438           0 :     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
    1439           0 :     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
    1440           0 :     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
    1441           0 :     dest += dest_stride;
    1442             :   }
    1443           0 : }
    1444             : 
    1445           0 : void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
                         // 8-point 1-D inverse DCT (high bit depth). Reads input[0..7] and
                         // writes output[0..7]. The even-indexed inputs are handled by the
                         // 4-point transform; the odd-indexed inputs go through the stages
                         // below and the two halves are merged in stage 4.
    1446             :   tran_low_t step1[8], step2[8];
    1447             :   tran_high_t temp1, temp2;
    1448             :   // stage 1
                         // Even inputs 0, 4, 2, 6 are placed in step1[0], [2], [1], [3] -- the
                         // slot order the 4-point transform expects. Odd inputs are rotated in
                         // pairs: (1, 7) with cospi_28_64/cospi_4_64 and (5, 3) with
                         // cospi_12_64/cospi_20_64.
    1449           0 :   step1[0] = input[0];
    1450           0 :   step1[2] = input[4];
    1451           0 :   step1[1] = input[2];
    1452           0 :   step1[3] = input[6];
    1453           0 :   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
    1454           0 :   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
    1455           0 :   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1456           0 :   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1457           0 :   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
    1458           0 :   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
    1459           0 :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1460           0 :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1461             : 
    1462             :   // stage 2 & stage 3 - even half
                         // In-place call: step1[0..3] is both source and destination.
    1463           0 :   aom_highbd_idct4_c(step1, step1, bd);
    1464             : 
    1465             :   // stage 2 - odd half
    1466           0 :   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
    1467           0 :   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
    1468           0 :   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
    1469           0 :   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
    1470             : 
    1471             :   // stage 3 - odd half
                         // Elements 4 and 7 pass through; 5 and 6 become the scaled
                         // difference/sum of step2[5] and step2[6].
    1472           0 :   step1[4] = step2[4];
    1473           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    1474           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    1475           0 :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1476           0 :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1477           0 :   step1[7] = step2[7];
    1478             : 
    1479             :   // stage 4
                         // Final butterfly: output[k] and output[7 - k] are the sum and
                         // difference of step1[k] and step1[7 - k] for k = 0..3.
    1480           0 :   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
    1481           0 :   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
    1482           0 :   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
    1483           0 :   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
    1484           0 :   output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
    1485           0 :   output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
    1486           0 :   output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
    1487           0 :   output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
    1488           0 : }
    1489             : 
    1490           0 : void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
                         // 4-point 1-D inverse ADST (high bit depth). Reads input[0..3] and
                         // writes output[0..3]. All inputs are copied into locals first, so
                         // nothing is read from `input` after `output` is written.
    1491             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    1492             : 
    1493           0 :   tran_low_t x0 = input[0];
    1494           0 :   tran_low_t x1 = input[1];
    1495           0 :   tran_low_t x2 = input[2];
    1496           0 :   tran_low_t x3 = input[3];
                         // NOTE(review): bd is used below; presumably kept for builds where
                         // HIGHBD_WRAPLOW ignores its second argument -- confirm.
    1497             :   (void)bd;
    1498             : 
                         // Shortcut: an all-zero input yields an all-zero output.
    1499           0 :   if (!(x0 | x1 | x2 | x3)) {
    1500           0 :     memset(output, 0, 4 * sizeof(*output));
    1501           0 :     return;
    1502             :   }
    1503             : 
                         // Products of the inputs with the sinpi_k_9 constants; s7 is the
                         // wrapped combination x0 - x2 + x3 that feeds output[2].
    1504           0 :   s0 = sinpi_1_9 * x0;
    1505           0 :   s1 = sinpi_2_9 * x0;
    1506           0 :   s2 = sinpi_3_9 * x1;
    1507           0 :   s3 = sinpi_4_9 * x2;
    1508           0 :   s4 = sinpi_1_9 * x2;
    1509           0 :   s5 = sinpi_2_9 * x3;
    1510           0 :   s6 = sinpi_4_9 * x3;
    1511           0 :   s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
    1512             : 
                         // Order matters here: s3 takes the old s2 (sinpi_3_9 * x1) before s2
                         // is overwritten with sinpi_3_9 * s7.
    1513           0 :   s0 = s0 + s3 + s5;
    1514           0 :   s1 = s1 - s4 - s6;
    1515           0 :   s3 = s2;
    1516           0 :   s2 = sinpi_3_9 * s7;
    1517             : 
    1518             :   // 1-D transform scaling factor is sqrt(2).
    1519             :   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
    1520             :   // + 1b (addition) = 29b.
    1521             :   // Hence the output bit depth is 15b.
    1522           0 :   output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
    1523           0 :   output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
    1524           0 :   output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
    1525           0 :   output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
    1526             : }
    1527             : 
    1528           0 : void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
                         // 8-point 1-D inverse ADST (high bit depth). Reads input[0..7] and
                         // writes output[0..7]. All inputs are copied into locals first, so
                         // nothing is read from `input` after `output` is written.
    1529             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    1530             : 
                         // Input permutation: x0..x7 come from input[7, 0, 5, 2, 3, 4, 1, 6].
    1531           0 :   tran_low_t x0 = input[7];
    1532           0 :   tran_low_t x1 = input[0];
    1533           0 :   tran_low_t x2 = input[5];
    1534           0 :   tran_low_t x3 = input[2];
    1535           0 :   tran_low_t x4 = input[3];
    1536           0 :   tran_low_t x5 = input[4];
    1537           0 :   tran_low_t x6 = input[1];
    1538           0 :   tran_low_t x7 = input[6];
                         // NOTE(review): bd is used below; presumably kept for builds where
                         // HIGHBD_WRAPLOW ignores its second argument -- confirm.
    1539             :   (void)bd;
    1540             : 
                         // Shortcut: an all-zero input yields an all-zero output.
    1541           0 :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    1542           0 :     memset(output, 0, 8 * sizeof(*output));
    1543           0 :     return;
    1544             :   }
    1545             : 
    1546             :   // stage 1
                         // Rotate the pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7), then form
                         // sums/differences of s[k] and s[k + 4] with a single rounding shift.
    1547           0 :   s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
    1548           0 :   s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
    1549           0 :   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    1550           0 :   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    1551           0 :   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    1552           0 :   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    1553           0 :   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
    1554           0 :   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
    1555             : 
    1556           0 :   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
    1557           0 :   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
    1558           0 :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
    1559           0 :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
    1560           0 :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
    1561           0 :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
    1562           0 :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
    1563           0 :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
    1564             : 
    1565             :   // stage 2
                         // x0..x3 pass through unscaled (hence no rounding shift on their
                         // sums/differences below); x4..x7 are rotated with
                         // cospi_8_64/cospi_24_64 and need the rounding shift.
    1566           0 :   s0 = x0;
    1567           0 :   s1 = x1;
    1568           0 :   s2 = x2;
    1569           0 :   s3 = x3;
    1570           0 :   s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
    1571           0 :   s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
    1572           0 :   s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
    1573           0 :   s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
    1574             : 
    1575           0 :   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
    1576           0 :   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
    1577           0 :   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
    1578           0 :   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
    1579           0 :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
    1580           0 :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
    1581           0 :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
    1582           0 :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
    1583             : 
    1584             :   // stage 3
                         // Only (x2, x3) and (x6, x7) change: each pair becomes its
                         // cospi_16_64-scaled sum and difference. x0, x1, x4, x5 carry over.
    1585           0 :   s2 = cospi_16_64 * (x2 + x3);
    1586           0 :   s3 = cospi_16_64 * (x2 - x3);
    1587           0 :   s6 = cospi_16_64 * (x6 + x7);
    1588           0 :   s7 = cospi_16_64 * (x6 - x7);
    1589             : 
    1590           0 :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
    1591           0 :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
    1592           0 :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
    1593           0 :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
    1594             : 
                         // Output permutation x0, x4, x6, x2, x3, x7, x5, x1 with the
                         // odd-indexed outputs negated.
    1595           0 :   output[0] = HIGHBD_WRAPLOW(x0, bd);
    1596           0 :   output[1] = HIGHBD_WRAPLOW(-x4, bd);
    1597           0 :   output[2] = HIGHBD_WRAPLOW(x6, bd);
    1598           0 :   output[3] = HIGHBD_WRAPLOW(-x2, bd);
    1599           0 :   output[4] = HIGHBD_WRAPLOW(x3, bd);
    1600           0 :   output[5] = HIGHBD_WRAPLOW(-x7, bd);
    1601           0 :   output[6] = HIGHBD_WRAPLOW(x5, bd);
    1602           0 :   output[7] = HIGHBD_WRAPLOW(-x1, bd);
    1603             : }
    1604             : 
    1605           0 : void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
    1606             :   tran_low_t step1[16], step2[16];
    1607             :   tran_high_t temp1, temp2;
    1608             :   (void)bd;
    1609             : 
    1610             :   // stage 1
    1611           0 :   step1[0] = input[0 / 2];
    1612           0 :   step1[1] = input[16 / 2];
    1613           0 :   step1[2] = input[8 / 2];
    1614           0 :   step1[3] = input[24 / 2];
    1615           0 :   step1[4] = input[4 / 2];
    1616           0 :   step1[5] = input[20 / 2];
    1617           0 :   step1[6] = input[12 / 2];
    1618           0 :   step1[7] = input[28 / 2];
    1619           0 :   step1[8] = input[2 / 2];
    1620           0 :   step1[9] = input[18 / 2];
    1621           0 :   step1[10] = input[10 / 2];
    1622           0 :   step1[11] = input[26 / 2];
    1623           0 :   step1[12] = input[6 / 2];
    1624           0 :   step1[13] = input[22 / 2];
    1625           0 :   step1[14] = input[14 / 2];
    1626           0 :   step1[15] = input[30 / 2];
    1627             : 
    1628             :   // stage 2
    1629           0 :   step2[0] = step1[0];
    1630           0 :   step2[1] = step1[1];
    1631           0 :   step2[2] = step1[2];
    1632           0 :   step2[3] = step1[3];
    1633           0 :   step2[4] = step1[4];
    1634           0 :   step2[5] = step1[5];
    1635           0 :   step2[6] = step1[6];
    1636           0 :   step2[7] = step1[7];
    1637             : 
    1638           0 :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
    1639           0 :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
    1640           0 :   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1641           0 :   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1642             : 
    1643           0 :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
    1644           0 :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
    1645           0 :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1646           0 :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1647             : 
    1648           0 :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
    1649           0 :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
    1650           0 :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1651           0 :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1652             : 
    1653           0 :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
    1654           0 :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
    1655           0 :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1656           0 :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1657             : 
    1658             :   // stage 3
    1659           0 :   step1[0] = step2[0];
    1660           0 :   step1[1] = step2[1];
    1661           0 :   step1[2] = step2[2];
    1662           0 :   step1[3] = step2[3];
    1663             : 
    1664           0 :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
    1665           0 :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
    1666           0 :   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1667           0 :   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1668           0 :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
    1669           0 :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
    1670           0 :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1671           0 :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1672             : 
    1673           0 :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
    1674           0 :   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
    1675           0 :   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
    1676           0 :   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
    1677           0 :   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
    1678           0 :   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
    1679           0 :   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
    1680           0 :   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
    1681             : 
    1682             :   // stage 4
    1683           0 :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
    1684           0 :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
    1685           0 :   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1686           0 :   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1687           0 :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
    1688           0 :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
    1689           0 :   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1690           0 :   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1691           0 :   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
    1692           0 :   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
    1693           0 :   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
    1694           0 :   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
    1695             : 
    1696           0 :   step2[8] = step1[8];
    1697           0 :   step2[15] = step1[15];
    1698           0 :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
    1699           0 :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
    1700           0 :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1701           0 :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1702           0 :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
    1703           0 :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
    1704           0 :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1705           0 :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1706           0 :   step2[11] = step1[11];
    1707           0 :   step2[12] = step1[12];
    1708             : 
    1709             :   // stage 5
    1710           0 :   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
    1711           0 :   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
    1712           0 :   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
    1713           0 :   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
    1714           0 :   step1[4] = step2[4];
    1715           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    1716           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    1717           0 :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1718           0 :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1719           0 :   step1[7] = step2[7];
    1720             : 
    1721           0 :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
    1722           0 :   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
    1723           0 :   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
    1724           0 :   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
    1725           0 :   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
    1726           0 :   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
    1727           0 :   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
    1728           0 :   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
    1729             : 
    1730             :   // stage 6
    1731           0 :   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
    1732           0 :   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
    1733           0 :   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
    1734           0 :   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
    1735           0 :   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
    1736           0 :   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
    1737           0 :   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
    1738           0 :   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
    1739           0 :   step2[8] = step1[8];
    1740           0 :   step2[9] = step1[9];
    1741           0 :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    1742           0 :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    1743           0 :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1744           0 :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1745           0 :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    1746           0 :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    1747           0 :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1748           0 :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1749           0 :   step2[14] = step1[14];
    1750           0 :   step2[15] = step1[15];
    1751             : 
    1752             :   // stage 7
    1753           0 :   output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
    1754           0 :   output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
    1755           0 :   output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
    1756           0 :   output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
    1757           0 :   output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
    1758           0 :   output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
    1759           0 :   output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
    1760           0 :   output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
    1761           0 :   output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
    1762           0 :   output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
    1763           0 :   output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
    1764           0 :   output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
    1765           0 :   output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
    1766           0 :   output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
    1767           0 :   output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
    1768           0 :   output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
    1769           0 : }
    1770             : 
    1771           0 : void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 16-point inverse transform (ADST, going by the name): reads input[0..15], writes output[0..15]; bd is forwarded to every HIGHBD_WRAPLOW call.
    1772             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;  // s0..s15: tran_high_t scratch holding each stage's products and pass-through values
    1773             :   tran_high_t s9, s10, s11, s12, s13, s14, s15;
    1774             :   // Load the inputs in a fixed permuted order: even-numbered x take the odd inputs descending from 15, odd-numbered x take the even inputs ascending from 0.
    1775           0 :   tran_low_t x0 = input[15];
    1776           0 :   tran_low_t x1 = input[0];
    1777           0 :   tran_low_t x2 = input[13];
    1778           0 :   tran_low_t x3 = input[2];
    1779           0 :   tran_low_t x4 = input[11];
    1780           0 :   tran_low_t x5 = input[4];
    1781           0 :   tran_low_t x6 = input[9];
    1782           0 :   tran_low_t x7 = input[6];
    1783           0 :   tran_low_t x8 = input[7];
    1784           0 :   tran_low_t x9 = input[8];
    1785           0 :   tran_low_t x10 = input[5];
    1786           0 :   tran_low_t x11 = input[10];
    1787           0 :   tran_low_t x12 = input[3];
    1788           0 :   tran_low_t x13 = input[12];
    1789           0 :   tran_low_t x14 = input[1];
    1790           0 :   tran_low_t x15 = input[14];
    1791             :   (void)bd;  // NOTE(review): bd is still passed to HIGHBD_WRAPLOW below; presumably this silences an unused-parameter warning in builds where that macro ignores bd - confirm against its definition.
    1792             :   // Fast path: the bitwise OR of all 16 coefficients is zero only when every coefficient is zero, so the output is zero-filled and the butterflies are skipped.
    1793           0 :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
    1794           0 :         x13 | x14 | x15)) {
    1795           0 :     memset(output, 0, 16 * sizeof(*output));
    1796           0 :     return;
    1797             :   }
    1798             : 
    1799             :   // stage 1: rotate each adjacent pair (x0,x1), (x2,x3), ... (x14,x15) using the odd-index cospi constants; products are kept unrounded in s0..s15.
    1800           0 :   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
    1801           0 :   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    1802           0 :   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
    1803           0 :   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    1804           0 :   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
    1805           0 :   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    1806           0 :   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    1807           0 :   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    1808           0 :   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    1809           0 :   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    1810           0 :   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    1811           0 :   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    1812           0 :   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    1813           0 :   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
    1814           0 :   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    1815           0 :   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
    1816             :   // Combine s[i] with s[i+8]: sums go to x0..x7, differences to x8..x15, each passed through dct_const_round_shift and then HIGHBD_WRAPLOW.
    1817           0 :   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
    1818           0 :   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
    1819           0 :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
    1820           0 :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
    1821           0 :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
    1822           0 :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
    1823           0 :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
    1824           0 :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
    1825           0 :   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
    1826           0 :   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
    1827           0 :   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
    1828           0 :   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
    1829           0 :   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
    1830           0 :   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
    1831           0 :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
    1832           0 :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
    1833             : 
    1834             :   // stage 2: x0..x7 are copied through unmultiplied; pairs in x8..x15 are rotated with the cospi_4/28 and cospi_12/20 constants.
    1835           0 :   s0 = x0;
    1836           0 :   s1 = x1;
    1837           0 :   s2 = x2;
    1838           0 :   s3 = x3;
    1839           0 :   s4 = x4;
    1840           0 :   s5 = x5;
    1841           0 :   s6 = x6;
    1842           0 :   s7 = x7;
    1843           0 :   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
    1844           0 :   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
    1845           0 :   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
    1846           0 :   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
    1847           0 :   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
    1848           0 :   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
    1849           0 :   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
    1850           0 :   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
    1851             :   // Combine s[i] with s[i+4] inside each half; only the multiplied half (s8..s15) goes through dct_const_round_shift, the copied half is just wrapped.
    1852           0 :   x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
    1853           0 :   x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
    1854           0 :   x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
    1855           0 :   x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
    1856           0 :   x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
    1857           0 :   x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
    1858           0 :   x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
    1859           0 :   x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
    1860           0 :   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
    1861           0 :   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
    1862           0 :   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
    1863           0 :   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
    1864           0 :   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
    1865           0 :   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
    1866           0 :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
    1867           0 :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
    1868             : 
    1869             :   // stage 3: x0..x3 and x8..x11 are copied through; x4..x7 and x12..x15 are rotated with the cospi_8/24 constants.
    1870           0 :   s0 = x0;
    1871           0 :   s1 = x1;
    1872           0 :   s2 = x2;
    1873           0 :   s3 = x3;
    1874           0 :   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
    1875           0 :   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    1876           0 :   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
    1877           0 :   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
    1878           0 :   s8 = x8;
    1879           0 :   s9 = x9;
    1880           0 :   s10 = x10;
    1881           0 :   s11 = x11;
    1882           0 :   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
    1883           0 :   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    1884           0 :   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
    1885           0 :   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
    1886             :   // Combine s[i] with s[i+2] inside each group of four; again only the multiplied groups (s4..s7, s12..s15) go through dct_const_round_shift.
    1887           0 :   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
    1888           0 :   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
    1889           0 :   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
    1890           0 :   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
    1891           0 :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
    1892           0 :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
    1893           0 :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
    1894           0 :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
    1895           0 :   x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
    1896           0 :   x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
    1897           0 :   x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
    1898           0 :   x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
    1899           0 :   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
    1900           0 :   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
    1901           0 :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
    1902           0 :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
    1903             : 
    1904             :   // stage 4: scale the sum and difference of pairs (x2,x3), (x6,x7), (x10,x11), (x14,x15) by cospi_16_64 (signs differ per pair); x0, x1, x4, x5, x8, x9, x12, x13 are left as they are.
    1905           0 :   s2 = (-cospi_16_64) * (x2 + x3);
    1906           0 :   s3 = cospi_16_64 * (x2 - x3);
    1907           0 :   s6 = cospi_16_64 * (x6 + x7);
    1908           0 :   s7 = cospi_16_64 * (-x6 + x7);
    1909           0 :   s10 = cospi_16_64 * (x10 + x11);
    1910           0 :   s11 = cospi_16_64 * (-x10 + x11);
    1911           0 :   s14 = (-cospi_16_64) * (x14 + x15);
    1912           0 :   s15 = cospi_16_64 * (x14 - x15);
    1913             :   // Round-shift and wrap the eight scaled values back into their x slots.
    1914           0 :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
    1915           0 :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
    1916           0 :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
    1917           0 :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
    1918           0 :   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
    1919           0 :   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
    1920           0 :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
    1921           0 :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
    1922             :   // Write the results in a fixed permuted order; outputs 1, 3, 13 and 15 are negated before wrapping.
    1923           0 :   output[0] = HIGHBD_WRAPLOW(x0, bd);
    1924           0 :   output[1] = HIGHBD_WRAPLOW(-x8, bd);
    1925           0 :   output[2] = HIGHBD_WRAPLOW(x12, bd);
    1926           0 :   output[3] = HIGHBD_WRAPLOW(-x4, bd);
    1927           0 :   output[4] = HIGHBD_WRAPLOW(x6, bd);
    1928           0 :   output[5] = HIGHBD_WRAPLOW(x14, bd);
    1929           0 :   output[6] = HIGHBD_WRAPLOW(x10, bd);
    1930           0 :   output[7] = HIGHBD_WRAPLOW(x2, bd);
    1931           0 :   output[8] = HIGHBD_WRAPLOW(x3, bd);
    1932           0 :   output[9] = HIGHBD_WRAPLOW(x11, bd);
    1933           0 :   output[10] = HIGHBD_WRAPLOW(x15, bd);
    1934           0 :   output[11] = HIGHBD_WRAPLOW(x7, bd);
    1935           0 :   output[12] = HIGHBD_WRAPLOW(x5, bd);
    1936           0 :   output[13] = HIGHBD_WRAPLOW(-x13, bd);
    1937           0 :   output[14] = HIGHBD_WRAPLOW(x9, bd);
    1938           0 :   output[15] = HIGHBD_WRAPLOW(-x1, bd);
    1939             : }
    1940             : 
    1941           0 : void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) {
    1942             :   tran_low_t step1[32], step2[32];
    1943             :   tran_high_t temp1, temp2;
    1944             :   (void)bd;
    1945             : 
    1946             :   // stage 1
    1947           0 :   step1[0] = input[0];
    1948           0 :   step1[1] = input[16];
    1949           0 :   step1[2] = input[8];
    1950           0 :   step1[3] = input[24];
    1951           0 :   step1[4] = input[4];
    1952           0 :   step1[5] = input[20];
    1953           0 :   step1[6] = input[12];
    1954           0 :   step1[7] = input[28];
    1955           0 :   step1[8] = input[2];
    1956           0 :   step1[9] = input[18];
    1957           0 :   step1[10] = input[10];
    1958           0 :   step1[11] = input[26];
    1959           0 :   step1[12] = input[6];
    1960           0 :   step1[13] = input[22];
    1961           0 :   step1[14] = input[14];
    1962           0 :   step1[15] = input[30];
    1963             : 
    1964           0 :   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
    1965           0 :   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
    1966           0 :   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1967           0 :   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1968             : 
    1969           0 :   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
    1970           0 :   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
    1971           0 :   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1972           0 :   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1973             : 
    1974           0 :   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
    1975           0 :   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
    1976           0 :   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1977           0 :   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1978             : 
    1979           0 :   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
    1980           0 :   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
    1981           0 :   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1982           0 :   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1983             : 
    1984           0 :   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
    1985           0 :   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
    1986           0 :   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1987           0 :   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1988             : 
    1989           0 :   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
    1990           0 :   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
    1991           0 :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1992           0 :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1993             : 
    1994           0 :   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
    1995           0 :   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
    1996           0 :   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1997           0 :   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1998             : 
    1999           0 :   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
    2000           0 :   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
    2001           0 :   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2002           0 :   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2003             : 
    2004             :   // stage 2
    2005           0 :   step2[0] = step1[0];
    2006           0 :   step2[1] = step1[1];
    2007           0 :   step2[2] = step1[2];
    2008           0 :   step2[3] = step1[3];
    2009           0 :   step2[4] = step1[4];
    2010           0 :   step2[5] = step1[5];
    2011           0 :   step2[6] = step1[6];
    2012           0 :   step2[7] = step1[7];
    2013             : 
    2014           0 :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
    2015           0 :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
    2016           0 :   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2017           0 :   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2018             : 
    2019           0 :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
    2020           0 :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
    2021           0 :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2022           0 :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2023             : 
    2024           0 :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
    2025           0 :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
    2026           0 :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2027           0 :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2028             : 
    2029           0 :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
    2030           0 :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
    2031           0 :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2032           0 :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2033             : 
    2034           0 :   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
    2035           0 :   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
    2036           0 :   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
    2037           0 :   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
    2038           0 :   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
    2039           0 :   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
    2040           0 :   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
    2041           0 :   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
    2042           0 :   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
    2043           0 :   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
    2044           0 :   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
    2045           0 :   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
    2046           0 :   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
    2047           0 :   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
    2048           0 :   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
    2049           0 :   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
    2050             : 
    2051             :   // stage 3
    2052           0 :   step1[0] = step2[0];
    2053           0 :   step1[1] = step2[1];
    2054           0 :   step1[2] = step2[2];
    2055           0 :   step1[3] = step2[3];
    2056             : 
    2057           0 :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
    2058           0 :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
    2059           0 :   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2060           0 :   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2061           0 :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
    2062           0 :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
    2063           0 :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2064           0 :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2065             : 
    2066           0 :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
    2067           0 :   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
    2068           0 :   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
    2069           0 :   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
    2070           0 :   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
    2071           0 :   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
    2072           0 :   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
    2073           0 :   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
    2074             : 
    2075           0 :   step1[16] = step2[16];
    2076           0 :   step1[31] = step2[31];
    2077           0 :   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
    2078           0 :   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
    2079           0 :   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2080           0 :   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2081           0 :   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
    2082           0 :   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
    2083           0 :   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2084           0 :   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2085           0 :   step1[19] = step2[19];
    2086           0 :   step1[20] = step2[20];
    2087           0 :   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
    2088           0 :   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
    2089           0 :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2090           0 :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2091           0 :   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
    2092           0 :   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
    2093           0 :   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2094           0 :   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2095           0 :   step1[23] = step2[23];
    2096           0 :   step1[24] = step2[24];
    2097           0 :   step1[27] = step2[27];
    2098           0 :   step1[28] = step2[28];
    2099             : 
    2100             :   // stage 4
    2101           0 :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
    2102           0 :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
    2103           0 :   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2104           0 :   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2105           0 :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
    2106           0 :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
    2107           0 :   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2108           0 :   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2109           0 :   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
    2110           0 :   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
    2111           0 :   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
    2112           0 :   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
    2113             : 
    2114           0 :   step2[8] = step1[8];
    2115           0 :   step2[15] = step1[15];
    2116           0 :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
    2117           0 :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
    2118           0 :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2119           0 :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2120           0 :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
    2121           0 :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
    2122           0 :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2123           0 :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2124           0 :   step2[11] = step1[11];
    2125           0 :   step2[12] = step1[12];
    2126             : 
    2127           0 :   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
    2128           0 :   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
    2129           0 :   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
    2130           0 :   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
    2131           0 :   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
    2132           0 :   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
    2133           0 :   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
    2134           0 :   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
    2135             : 
    2136           0 :   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
    2137           0 :   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
    2138           0 :   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
    2139           0 :   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
    2140           0 :   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
    2141           0 :   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
    2142           0 :   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
    2143           0 :   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
    2144             : 
    2145             :   // stage 5
    2146           0 :   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
    2147           0 :   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
    2148           0 :   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
    2149           0 :   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
    2150           0 :   step1[4] = step2[4];
    2151           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    2152           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    2153           0 :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2154           0 :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2155           0 :   step1[7] = step2[7];
    2156             : 
    2157           0 :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
    2158           0 :   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
    2159           0 :   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
    2160           0 :   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
    2161           0 :   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
    2162           0 :   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
    2163           0 :   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
    2164           0 :   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
    2165             : 
    2166           0 :   step1[16] = step2[16];
    2167           0 :   step1[17] = step2[17];
    2168           0 :   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
    2169           0 :   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
    2170           0 :   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2171           0 :   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2172           0 :   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
    2173           0 :   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
    2174           0 :   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2175           0 :   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2176           0 :   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
    2177           0 :   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
    2178           0 :   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2179           0 :   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2180           0 :   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
    2181           0 :   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
    2182           0 :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2183           0 :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2184           0 :   step1[22] = step2[22];
    2185           0 :   step1[23] = step2[23];
    2186           0 :   step1[24] = step2[24];
    2187           0 :   step1[25] = step2[25];
    2188           0 :   step1[30] = step2[30];
    2189           0 :   step1[31] = step2[31];
    2190             : 
    2191             :   // stage 6
    2192           0 :   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
    2193           0 :   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
    2194           0 :   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
    2195           0 :   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
    2196           0 :   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
    2197           0 :   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
    2198           0 :   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
    2199           0 :   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
    2200           0 :   step2[8] = step1[8];
    2201           0 :   step2[9] = step1[9];
    2202           0 :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    2203           0 :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    2204           0 :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2205           0 :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2206           0 :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    2207           0 :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    2208           0 :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2209           0 :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2210           0 :   step2[14] = step1[14];
    2211           0 :   step2[15] = step1[15];
    2212             : 
    2213           0 :   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
    2214           0 :   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
    2215           0 :   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
    2216           0 :   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
    2217           0 :   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
    2218           0 :   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
    2219           0 :   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
    2220           0 :   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
    2221             : 
    2222           0 :   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
    2223           0 :   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
    2224           0 :   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
    2225           0 :   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
    2226           0 :   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
    2227           0 :   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
    2228           0 :   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
    2229           0 :   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
    2230             : 
    2231             :   // stage 7
    2232           0 :   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
    2233           0 :   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
    2234           0 :   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
    2235           0 :   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
    2236           0 :   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
    2237           0 :   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
    2238           0 :   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
    2239           0 :   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
    2240           0 :   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
    2241           0 :   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
    2242           0 :   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
    2243           0 :   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
    2244           0 :   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
    2245           0 :   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
    2246           0 :   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
    2247           0 :   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
    2248             : 
    2249           0 :   step1[16] = step2[16];
    2250           0 :   step1[17] = step2[17];
    2251           0 :   step1[18] = step2[18];
    2252           0 :   step1[19] = step2[19];
    2253           0 :   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
    2254           0 :   temp2 = (step2[20] + step2[27]) * cospi_16_64;
    2255           0 :   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2256           0 :   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2257           0 :   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
    2258           0 :   temp2 = (step2[21] + step2[26]) * cospi_16_64;
    2259           0 :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2260           0 :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2261           0 :   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
    2262           0 :   temp2 = (step2[22] + step2[25]) * cospi_16_64;
    2263           0 :   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2264           0 :   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2265           0 :   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
    2266           0 :   temp2 = (step2[23] + step2[24]) * cospi_16_64;
    2267           0 :   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2268           0 :   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2269           0 :   step1[28] = step2[28];
    2270           0 :   step1[29] = step2[29];
    2271           0 :   step1[30] = step2[30];
    2272           0 :   step1[31] = step2[31];
    2273             : 
    2274             :   // final stage
    2275           0 :   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
    2276           0 :   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
    2277           0 :   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
    2278           0 :   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
    2279           0 :   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
    2280           0 :   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
    2281           0 :   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
    2282           0 :   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
    2283           0 :   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
    2284           0 :   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
    2285           0 :   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
    2286           0 :   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
    2287           0 :   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
    2288           0 :   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
    2289           0 :   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
    2290           0 :   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
    2291           0 :   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
    2292           0 :   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
    2293           0 :   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
    2294           0 :   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
    2295           0 :   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
    2296           0 :   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
    2297           0 :   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
    2298           0 :   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
    2299           0 :   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
    2300           0 :   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
    2301           0 :   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
    2302           0 :   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
    2303           0 :   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
    2304           0 :   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
    2305           0 :   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
    2306           0 :   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
    2307           0 : }
    2308             : 
    2309             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13