LCOV - code coverage report
Current view: top level - media/libvpx/libvpx/vpx_dsp - inv_txfm.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 967 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 21 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include <math.h>
      12             : #include <stdlib.h>
      13             : #include <string.h>
      14             : 
      15             : #include "./vpx_dsp_rtcd.h"
      16             : #include "vpx_dsp/inv_txfm.h"
      17             : 
      18           0 : void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // Inverse 4x4 Walsh-Hadamard transform of input[0..15]; the result is added into the 4x4 block at dest, whose rows are `stride` bytes apart.
      19             :   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
      20             :      0.5 shifts per pixel. */
      21             :   int i;
      22             :   tran_low_t output[16];  // pass-1 results, stored 4 values per row
      23             :   tran_high_t a1, b1, c1, d1, e1;
      24           0 :   const tran_low_t *ip = input;
      25           0 :   tran_low_t *op = output;
      26             : 
      27           0 :   for (i = 0; i < 4; i++) {  // pass 1: one row of 4 coefficients per iteration
      28           0 :     a1 = ip[0] >> UNIT_QUANT_SHIFT;  // every coefficient is shifted right by UNIT_QUANT_SHIFT (defined outside this listing)
      29           0 :     c1 = ip[1] >> UNIT_QUANT_SHIFT;  // note the load order: ip[1] -> c1, ip[2] -> d1, ip[3] -> b1
      30           0 :     d1 = ip[2] >> UNIT_QUANT_SHIFT;
      31           0 :     b1 = ip[3] >> UNIT_QUANT_SHIFT;
      32           0 :     a1 += c1;
      33           0 :     d1 -= b1;
      34           0 :     e1 = (a1 - d1) >> 1;  // the single shift of the butterfly
      35           0 :     b1 = e1 - b1;
      36           0 :     c1 = e1 - c1;
      37           0 :     a1 -= b1;
      38           0 :     d1 += c1;
      39           0 :     op[0] = WRAPLOW(a1);  // WRAPLOW is defined elsewhere; presumably it wraps the value to tran_low_t's range
      40           0 :     op[1] = WRAPLOW(b1);
      41           0 :     op[2] = WRAPLOW(c1);
      42           0 :     op[3] = WRAPLOW(d1);
      43           0 :     ip += 4;  // next input row
      44           0 :     op += 4;  // next output row
      45             :   }
      46             : 
      47           0 :   ip = output;  // pass 2 consumes the pass-1 results
      48           0 :   for (i = 0; i < 4; i++) {  // pass 2: one column per iteration (elements 4 apart in output[])
      49           0 :     a1 = ip[4 * 0];
      50           0 :     c1 = ip[4 * 1];
      51           0 :     d1 = ip[4 * 2];
      52           0 :     b1 = ip[4 * 3];
      53           0 :     a1 += c1;  // same butterfly as pass 1, without the UNIT_QUANT_SHIFT pre-shift
      54           0 :     d1 -= b1;
      55           0 :     e1 = (a1 - d1) >> 1;
      56           0 :     b1 = e1 - b1;
      57           0 :     c1 = e1 - c1;
      58           0 :     a1 -= b1;
      59           0 :     d1 += c1;
      60           0 :     dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));  // residual is combined with the existing pixel via clip_pixel_add (defined elsewhere; presumably a clamped add)
      61           0 :     dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
      62           0 :     dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
      63           0 :     dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
      64             : 
      65           0 :     ip++;  // next column of the intermediate block
      66           0 :     dest++;  // next column of the destination block
      67             :   }
      68           0 : }
      69             : 
      70           0 : void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {  // 4x4 inverse WHT shortcut that reads only in[0]; adds the result into the 4x4 block at dest (rows `stride` bytes apart).
      71             :   int i;
      72             :   tran_high_t a1, e1;
      73             :   tran_low_t tmp[4];  // the one row produced by the first pass
      74           0 :   const tran_low_t *ip = in;
      75           0 :   tran_low_t *op = tmp;
      76             : 
      77           0 :   a1 = ip[0] >> UNIT_QUANT_SHIFT;  // only the first coefficient is ever read
      78           0 :   e1 = a1 >> 1;
      79           0 :   a1 -= e1;  // a1 + e1 still equals the shifted in[0]
      80           0 :   op[0] = WRAPLOW(a1);
      81           0 :   op[1] = op[2] = op[3] = WRAPLOW(e1);  // same row the 16-coefficient butterfly yields when the other three inputs are zero
      82             : 
      83           0 :   ip = tmp;
      84           0 :   for (i = 0; i < 4; i++) {  // one destination column per iteration
      85           0 :     e1 = ip[0] >> 1;
      86           0 :     a1 = ip[0] - e1;
      87           0 :     dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);  // row 0 receives a1, rows 1..3 receive e1
      88           0 :     dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
      89           0 :     dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
      90           0 :     dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
      91           0 :     ip++;
      92           0 :     dest++;
      93             :   }
      94           0 : }
      95             : 
      96           0 : void idct4_c(const tran_low_t *input, tran_low_t *output) {  // 1-D 4-point inverse DCT: reads input[0..3] and writes output[0..3].
      97             :   tran_low_t step[4];
      98             :   tran_high_t temp1, temp2;
      99             : 
     100             :   // stage 1: rotate the even-indexed pair (0, 2) and the odd-indexed pair (1, 3)
     101           0 :   temp1 = (input[0] + input[2]) * cospi_16_64;
     102           0 :   temp2 = (input[0] - input[2]) * cospi_16_64;
     103           0 :   step[0] = WRAPLOW(dct_const_round_shift(temp1));  // dct_const_round_shift is defined elsewhere; presumably it rounds away the fixed-point scale of the cospi_* constants
     104           0 :   step[1] = WRAPLOW(dct_const_round_shift(temp2));
     105           0 :   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
     106           0 :   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
     107           0 :   step[2] = WRAPLOW(dct_const_round_shift(temp1));
     108           0 :   step[3] = WRAPLOW(dct_const_round_shift(temp2));
     109             : 
     110             :   // stage 2: final add/subtract butterflies; output[] is first written here, after all reads of input[]
     111           0 :   output[0] = WRAPLOW(step[0] + step[3]);
     112           0 :   output[1] = WRAPLOW(step[1] + step[2]);
     113           0 :   output[2] = WRAPLOW(step[1] - step[2]);
     114           0 :   output[3] = WRAPLOW(step[0] - step[3]);
     115           0 : }
     116             : 
     117           0 : void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // 2-D 4x4 inverse DCT of input[0..15]; the rounded result is added into the 4x4 block at dest (rows `stride` bytes apart).
     118             :   int i, j;
     119             :   tran_low_t out[4 * 4];  // row-pass results, row-major
     120           0 :   tran_low_t *outptr = out;
     121             :   tran_low_t temp_in[4], temp_out[4];  // one gathered column and its transform
     122             : 
     123             :   // Rows: run idct4_c on each group of 4 consecutive coefficients
     124           0 :   for (i = 0; i < 4; ++i) {
     125           0 :     idct4_c(input, outptr);
     126           0 :     input += 4;  // advances only the local copy of the pointer
     127           0 :     outptr += 4;
     128             :   }
     129             : 
     130             :   // Columns: gather column i of out[], transform it, then add it to dest
     131           0 :   for (i = 0; i < 4; ++i) {
     132           0 :     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
     133           0 :     idct4_c(temp_in, temp_out);
     134           0 :     for (j = 0; j < 4; ++j) {
     135           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     136           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 4));  // ROUND_POWER_OF_TWO is defined elsewhere; presumably a rounded shift right by 4
     137             :     }
     138             :   }
     139           0 : }
     140             : 
     141           0 : void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // 4x4 inverse DCT shortcut that reads only input[0]; one constant is added to all 16 pixels of the block at dest.
     142             :   int i;
     143             :   tran_high_t a1;
     144           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));  // first scaling by cospi_16_64
     145             : 
     146           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));  // second scaling by cospi_16_64 (presumably one per 1-D pass of the full transform)
     147           0 :   a1 = ROUND_POWER_OF_TWO(out, 4);  // same final rounding shift (4) as vpx_idct4x4_16_add_c
     148             : 
     149           0 :   for (i = 0; i < 4; i++) {  // one destination row per iteration
     150           0 :     dest[0] = clip_pixel_add(dest[0], a1);
     151           0 :     dest[1] = clip_pixel_add(dest[1], a1);
     152           0 :     dest[2] = clip_pixel_add(dest[2], a1);
     153           0 :     dest[3] = clip_pixel_add(dest[3], a1);
     154           0 :     dest += stride;  // step to the next row
     155             :   }
     156           0 : }
     157             : 
     158           0 : void idct8_c(const tran_low_t *input, tran_low_t *output) {  // 1-D 8-point inverse DCT: reads input[0..7] and writes output[0..7].
     159             :   tran_low_t step1[8], step2[8];  // ping-pong buffers between stages
     160             :   tran_high_t temp1, temp2;
     161             : 
     162             :   // stage 1: even-indexed inputs are copied to step1[0..3]; odd-indexed inputs are rotated into step1[4..7]
     163           0 :   step1[0] = input[0];
     164           0 :   step1[2] = input[4];  // note the slot order: input[4] -> step1[2], input[2] -> step1[1]
     165           0 :   step1[1] = input[2];
     166           0 :   step1[3] = input[6];
     167           0 :   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
     168           0 :   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
     169           0 :   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
     170           0 :   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
     171           0 :   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
     172           0 :   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
     173           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     174           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     175             : 
     176             :   // stage 2: rotations on step1[0..3], add/subtract butterflies on step1[4..7]
     177           0 :   temp1 = (step1[0] + step1[2]) * cospi_16_64;
     178           0 :   temp2 = (step1[0] - step1[2]) * cospi_16_64;
     179           0 :   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
     180           0 :   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
     181           0 :   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
     182           0 :   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
     183           0 :   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
     184           0 :   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
     185           0 :   step2[4] = WRAPLOW(step1[4] + step1[5]);
     186           0 :   step2[5] = WRAPLOW(step1[4] - step1[5]);
     187           0 :   step2[6] = WRAPLOW(-step1[6] + step1[7]);
     188           0 :   step2[7] = WRAPLOW(step1[6] + step1[7]);
     189             : 
     190             :   // stage 3: butterflies on step2[0..3]; step2[5]/step2[6] are rotated by cospi_16_64, [4] and [7] pass through
     191           0 :   step1[0] = WRAPLOW(step2[0] + step2[3]);
     192           0 :   step1[1] = WRAPLOW(step2[1] + step2[2]);
     193           0 :   step1[2] = WRAPLOW(step2[1] - step2[2]);
     194           0 :   step1[3] = WRAPLOW(step2[0] - step2[3]);
     195           0 :   step1[4] = step2[4];
     196           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
     197           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
     198           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     199           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     200           0 :   step1[7] = step2[7];
     201             : 
     202             :   // stage 4: output[k] = step1[k] + step1[7 - k] and output[7 - k] = step1[k] - step1[7 - k], for k = 0..3
     203           0 :   output[0] = WRAPLOW(step1[0] + step1[7]);
     204           0 :   output[1] = WRAPLOW(step1[1] + step1[6]);
     205           0 :   output[2] = WRAPLOW(step1[2] + step1[5]);
     206           0 :   output[3] = WRAPLOW(step1[3] + step1[4]);
     207           0 :   output[4] = WRAPLOW(step1[3] - step1[4]);
     208           0 :   output[5] = WRAPLOW(step1[2] - step1[5]);
     209           0 :   output[6] = WRAPLOW(step1[1] - step1[6]);
     210           0 :   output[7] = WRAPLOW(step1[0] - step1[7]);
     211           0 : }
     212             : 
     213           0 : void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // 2-D 8x8 inverse DCT of input[0..63]; the rounded result is added into the 8x8 block at dest (rows `stride` bytes apart).
     214             :   int i, j;
     215             :   tran_low_t out[8 * 8];  // row-pass results, row-major
     216           0 :   tran_low_t *outptr = out;
     217             :   tran_low_t temp_in[8], temp_out[8];  // one gathered column and its transform
     218             : 
     219             :   // First transform rows: idct8_c on each group of 8 consecutive coefficients
     220           0 :   for (i = 0; i < 8; ++i) {
     221           0 :     idct8_c(input, outptr);
     222           0 :     input += 8;
     223           0 :     outptr += 8;
     224             :   }
     225             : 
     226             :   // Then transform columns: gather column i of out[], transform it, add it to dest
     227           0 :   for (i = 0; i < 8; ++i) {
     228           0 :     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
     229           0 :     idct8_c(temp_in, temp_out);
     230           0 :     for (j = 0; j < 8; ++j) {
     231           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     232           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 5));  // final rounding shift is 5 for the 8x8 size (4 is used by the 4x4 functions)
     233             :     }
     234             :   }
     235           0 : }
     236             : 
     237           0 : void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // 8x8 inverse DCT shortcut that reads only input[0]; one constant is added to all 64 pixels of the block at dest.
     238             :   int i, j;
     239             :   tran_high_t a1;
     240           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));  // first scaling by cospi_16_64
     241             : 
     242           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));  // second scaling by cospi_16_64
     243           0 :   a1 = ROUND_POWER_OF_TWO(out, 5);  // same final rounding shift (5) as vpx_idct8x8_64_add_c
     244           0 :   for (j = 0; j < 8; ++j) {  // j walks rows, i walks pixels within a row
     245           0 :     for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
     246           0 :     dest += stride;
     247             :   }
     248           0 : }
     249             : 
     250           0 : void iadst4_c(const tran_low_t *input, tran_low_t *output) {  // 1-D 4-point transform built on the sinpi_*_9 constants (an inverse ADST, going by the name): reads input[0..3], writes output[0..3].
     251             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
     252           0 :   tran_low_t x0 = input[0];
     253           0 :   tran_low_t x1 = input[1];
     254           0 :   tran_low_t x2 = input[2];
     255           0 :   tran_low_t x3 = input[3];
     256             : 
     257           0 :   if (!(x0 | x1 | x2 | x3)) {  // all four inputs are zero: write zeros and return early
     258           0 :     memset(output, 0, 4 * sizeof(*output));
     259           0 :     return;
     260             :   }
     261             : 
     262           0 :   s0 = sinpi_1_9 * x0;
     263           0 :   s1 = sinpi_2_9 * x0;
     264           0 :   s2 = sinpi_3_9 * x1;
     265           0 :   s3 = sinpi_4_9 * x2;
     266           0 :   s4 = sinpi_1_9 * x2;
     267           0 :   s5 = sinpi_2_9 * x3;
     268           0 :   s6 = sinpi_4_9 * x3;
     269           0 :   s7 = WRAPLOW(x0 - x2 + x3);
     270             : 
     271           0 :   s0 = s0 + s3 + s5;
     272           0 :   s1 = s1 - s4 - s6;
     273           0 :   s3 = s2;  // s3 now carries sinpi_3_9 * x1
     274           0 :   s2 = sinpi_3_9 * s7;  // s2 is reused for sinpi_3_9 * (x0 - x2 + x3)
     275             : 
     276             :   // 1-D transform scaling factor is sqrt(2).
     277             :   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
     278             :   // + 1b (addition) = 29b.
     279             :   // Hence the output bit depth is 15b.
     280           0 :   output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
     281           0 :   output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
     282           0 :   output[2] = WRAPLOW(dct_const_round_shift(s2));
     283           0 :   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
     284             : }
     285             : 
     286           0 : void iadst8_c(const tran_low_t *input, tran_low_t *output) {  // 1-D 8-point transform in three stages (an inverse ADST, going by the name): reads input[0..7], writes output[0..7].
     287             :   int s0, s1, s2, s3, s4, s5, s6, s7;  // NOTE(review): plain int here, whereas iadst16_c in this listing declares its s* temporaries as tran_high_t
     288           0 :   tran_high_t x0 = input[7];  // inputs are loaded in a permuted order: 7, 0, 5, 2, 3, 4, 1, 6
     289           0 :   tran_high_t x1 = input[0];
     290           0 :   tran_high_t x2 = input[5];
     291           0 :   tran_high_t x3 = input[2];
     292           0 :   tran_high_t x4 = input[3];
     293           0 :   tran_high_t x5 = input[4];
     294           0 :   tran_high_t x6 = input[1];
     295           0 :   tran_high_t x7 = input[6];
     296             : 
     297           0 :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // all eight inputs are zero: write zeros and return early
     298           0 :     memset(output, 0, 8 * sizeof(*output));
     299           0 :     return;
     300             :   }
     301             : 
     302             :   // stage 1: four rotations on the pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7), then butterflies across the halves
     303           0 :   s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);  // NOTE(review): the (int) casts narrow a tran_high_t expression; whether that can truncate depends on tran_high_t's width, which is defined outside this listing
     304           0 :   s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
     305           0 :   s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
     306           0 :   s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
     307           0 :   s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
     308           0 :   s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
     309           0 :   s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
     310           0 :   s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
     311             : 
     312           0 :   x0 = WRAPLOW(dct_const_round_shift(s0 + s4));  // the sums and differences below are formed in int
     313           0 :   x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
     314           0 :   x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
     315           0 :   x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
     316           0 :   x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
     317           0 :   x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
     318           0 :   x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
     319           0 :   x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
     320             : 
     321             :   // stage 2: x0..x3 pass through unscaled; x4..x7 are rotated by cospi_8_64 / cospi_24_64
     322           0 :   s0 = (int)x0;
     323           0 :   s1 = (int)x1;
     324           0 :   s2 = (int)x2;
     325           0 :   s3 = (int)x3;
     326           0 :   s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
     327           0 :   s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
     328           0 :   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
     329           0 :   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
     330             : 
     331           0 :   x0 = WRAPLOW(s0 + s2);  // no round shift here: s0..s3 were not multiplied by a cospi constant in this stage
     332           0 :   x1 = WRAPLOW(s1 + s3);
     333           0 :   x2 = WRAPLOW(s0 - s2);
     334           0 :   x3 = WRAPLOW(s1 - s3);
     335           0 :   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
     336           0 :   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
     337           0 :   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
     338           0 :   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
     339             : 
     340             :   // stage 3: only x2, x3, x6, x7 are modified (sum/difference scaled by cospi_16_64)
     341           0 :   s2 = (int)(cospi_16_64 * (x2 + x3));
     342           0 :   s3 = (int)(cospi_16_64 * (x2 - x3));
     343           0 :   s6 = (int)(cospi_16_64 * (x6 + x7));
     344           0 :   s7 = (int)(cospi_16_64 * (x6 - x7));
     345             : 
     346           0 :   x2 = WRAPLOW(dct_const_round_shift(s2));
     347           0 :   x3 = WRAPLOW(dct_const_round_shift(s3));
     348           0 :   x6 = WRAPLOW(dct_const_round_shift(s6));
     349           0 :   x7 = WRAPLOW(dct_const_round_shift(s7));
     350             : 
     351           0 :   output[0] = WRAPLOW(x0);  // outputs are written in permuted order; odd-indexed outputs are negated
     352           0 :   output[1] = WRAPLOW(-x4);
     353           0 :   output[2] = WRAPLOW(x6);
     354           0 :   output[3] = WRAPLOW(-x2);
     355           0 :   output[4] = WRAPLOW(x3);
     356           0 :   output[5] = WRAPLOW(-x7);
     357           0 :   output[6] = WRAPLOW(x5);
     358           0 :   output[7] = WRAPLOW(-x1);
     359             : }
     360             : 
     361           0 : void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // Reduced 8x8 inverse DCT: only input rows 0..3 (input[0..31]) are read; the rounded result is added into the 8x8 block at dest.
     362             :   int i, j;
     363           0 :   tran_low_t out[8 * 8] = { 0 };  // zero-initialised because the row pass below fills only the first 4 rows
     364           0 :   tran_low_t *outptr = out;
     365             :   tran_low_t temp_in[8], temp_out[8];  // one gathered column and its transform
     366             : 
     367             :   // First transform rows
     368             :   // Only the first 4 rows are transformed (the caller is expected to pass coefficients that are zero elsewhere); rows 4..7 of out[] keep their zero initialisation
     369           0 :   for (i = 0; i < 4; ++i) {
     370           0 :     idct8_c(input, outptr);
     371           0 :     input += 8;
     372           0 :     outptr += 8;
     373             :   }
     374             : 
     375             :   // Then transform columns: all 8 columns are processed, exactly as in vpx_idct8x8_64_add_c
     376           0 :   for (i = 0; i < 8; ++i) {
     377           0 :     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
     378           0 :     idct8_c(temp_in, temp_out);
     379           0 :     for (j = 0; j < 8; ++j) {
     380           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     381           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
     382             :     }
     383             :   }
     384           0 : }
     385             : 
     386           0 : void idct16_c(const tran_low_t *input, tran_low_t *output) {  // 1-D 16-point inverse DCT in seven stages: reads input[0..15] and writes output[0..15].
     387             :   tran_low_t step1[16], step2[16];  // ping-pong buffers: each stage reads one and writes the other
     388             :   tran_high_t temp1, temp2;
     389             : 
     390             :   // stage 1: load the inputs in permuted order (each index is written as n / 2, e.g. 16 / 2 == 8)
     391           0 :   step1[0] = input[0 / 2];
     392           0 :   step1[1] = input[16 / 2];
     393           0 :   step1[2] = input[8 / 2];
     394           0 :   step1[3] = input[24 / 2];
     395           0 :   step1[4] = input[4 / 2];
     396           0 :   step1[5] = input[20 / 2];
     397           0 :   step1[6] = input[12 / 2];
     398           0 :   step1[7] = input[28 / 2];
     399           0 :   step1[8] = input[2 / 2];  // step1[8..15] receive the odd-indexed inputs 1, 9, 5, 13, 3, 11, 7, 15
     400           0 :   step1[9] = input[18 / 2];
     401           0 :   step1[10] = input[10 / 2];
     402           0 :   step1[11] = input[26 / 2];
     403           0 :   step1[12] = input[6 / 2];
     404           0 :   step1[13] = input[22 / 2];
     405           0 :   step1[14] = input[14 / 2];
     406           0 :   step1[15] = input[30 / 2];
     407             : 
     408             :   // stage 2: elements 0..7 pass through; elements 8..15 are rotated in the pairs (8,15), (9,14), (10,13), (11,12)
     409           0 :   step2[0] = step1[0];
     410           0 :   step2[1] = step1[1];
     411           0 :   step2[2] = step1[2];
     412           0 :   step2[3] = step1[3];
     413           0 :   step2[4] = step1[4];
     414           0 :   step2[5] = step1[5];
     415           0 :   step2[6] = step1[6];
     416           0 :   step2[7] = step1[7];
     417             : 
     418           0 :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
     419           0 :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
     420           0 :   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
     421           0 :   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
     422             : 
     423           0 :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
     424           0 :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
     425           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     426           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     427             : 
     428           0 :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
     429           0 :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
     430           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     431           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     432             : 
     433           0 :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
     434           0 :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
     435           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
     436           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
     437             : 
     438             :   // stage 3: elements 0..3 pass through; 4..7 are rotated in the pairs (4,7), (5,6); 8..15 get add/subtract butterflies
     439           0 :   step1[0] = step2[0];
     440           0 :   step1[1] = step2[1];
     441           0 :   step1[2] = step2[2];
     442           0 :   step1[3] = step2[3];
     443             : 
     444           0 :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
     445           0 :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
     446           0 :   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
     447           0 :   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
     448           0 :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
     449           0 :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
     450           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     451           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     452             : 
     453           0 :   step1[8] = WRAPLOW(step2[8] + step2[9]);
     454           0 :   step1[9] = WRAPLOW(step2[8] - step2[9]);
     455           0 :   step1[10] = WRAPLOW(-step2[10] + step2[11]);
     456           0 :   step1[11] = WRAPLOW(step2[10] + step2[11]);
     457           0 :   step1[12] = WRAPLOW(step2[12] + step2[13]);
     458           0 :   step1[13] = WRAPLOW(step2[12] - step2[13]);
     459           0 :   step1[14] = WRAPLOW(-step2[14] + step2[15]);
     460           0 :   step1[15] = WRAPLOW(step2[14] + step2[15]);
     461             : 
     462             :   // stage 4: rotations on (0,1) and (2,3); butterflies on 4..7; rotations on (9,14) and (10,13); 8, 11, 12, 15 pass through
     463           0 :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
     464           0 :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
     465           0 :   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
     466           0 :   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
     467           0 :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
     468           0 :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
     469           0 :   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
     470           0 :   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
     471           0 :   step2[4] = WRAPLOW(step1[4] + step1[5]);
     472           0 :   step2[5] = WRAPLOW(step1[4] - step1[5]);
     473           0 :   step2[6] = WRAPLOW(-step1[6] + step1[7]);
     474           0 :   step2[7] = WRAPLOW(step1[6] + step1[7]);
     475             : 
     476           0 :   step2[8] = step1[8];
     477           0 :   step2[15] = step1[15];
     478           0 :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
     479           0 :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
     480           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     481           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     482           0 :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
     483           0 :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
     484           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     485           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     486           0 :   step2[11] = step1[11];
     487           0 :   step2[12] = step1[12];
     488             : 
     489             :   // stage 5: butterflies on 0..3; (5,6) rotated by cospi_16_64 with 4 and 7 passing through; butterflies on 8..15
     490           0 :   step1[0] = WRAPLOW(step2[0] + step2[3]);
     491           0 :   step1[1] = WRAPLOW(step2[1] + step2[2]);
     492           0 :   step1[2] = WRAPLOW(step2[1] - step2[2]);
     493           0 :   step1[3] = WRAPLOW(step2[0] - step2[3]);
     494           0 :   step1[4] = step2[4];
     495           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
     496           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
     497           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     498           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     499           0 :   step1[7] = step2[7];
     500             : 
     501           0 :   step1[8] = WRAPLOW(step2[8] + step2[11]);
     502           0 :   step1[9] = WRAPLOW(step2[9] + step2[10]);
     503           0 :   step1[10] = WRAPLOW(step2[9] - step2[10]);
     504           0 :   step1[11] = WRAPLOW(step2[8] - step2[11]);
     505           0 :   step1[12] = WRAPLOW(-step2[12] + step2[15]);
     506           0 :   step1[13] = WRAPLOW(-step2[13] + step2[14]);
     507           0 :   step1[14] = WRAPLOW(step2[13] + step2[14]);
     508           0 :   step1[15] = WRAPLOW(step2[12] + step2[15]);
     509             : 
     510             :   // stage 6: butterflies on 0..7; (10,13) and (11,12) rotated by cospi_16_64; 8, 9, 14, 15 pass through
     511           0 :   step2[0] = WRAPLOW(step1[0] + step1[7]);
     512           0 :   step2[1] = WRAPLOW(step1[1] + step1[6]);
     513           0 :   step2[2] = WRAPLOW(step1[2] + step1[5]);
     514           0 :   step2[3] = WRAPLOW(step1[3] + step1[4]);
     515           0 :   step2[4] = WRAPLOW(step1[3] - step1[4]);
     516           0 :   step2[5] = WRAPLOW(step1[2] - step1[5]);
     517           0 :   step2[6] = WRAPLOW(step1[1] - step1[6]);
     518           0 :   step2[7] = WRAPLOW(step1[0] - step1[7]);
     519           0 :   step2[8] = step1[8];
     520           0 :   step2[9] = step1[9];
     521           0 :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
     522           0 :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
     523           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     524           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     525           0 :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
     526           0 :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
     527           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
     528           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
     529           0 :   step2[14] = step1[14];
     530           0 :   step2[15] = step1[15];
     531             : 
     532             :   // stage 7: output[k] = step2[k] + step2[15 - k] and output[15 - k] = step2[k] - step2[15 - k], for k = 0..7
     533           0 :   output[0] = WRAPLOW(step2[0] + step2[15]);
     534           0 :   output[1] = WRAPLOW(step2[1] + step2[14]);
     535           0 :   output[2] = WRAPLOW(step2[2] + step2[13]);
     536           0 :   output[3] = WRAPLOW(step2[3] + step2[12]);
     537           0 :   output[4] = WRAPLOW(step2[4] + step2[11]);
     538           0 :   output[5] = WRAPLOW(step2[5] + step2[10]);
     539           0 :   output[6] = WRAPLOW(step2[6] + step2[9]);
     540           0 :   output[7] = WRAPLOW(step2[7] + step2[8]);
     541           0 :   output[8] = WRAPLOW(step2[7] - step2[8]);
     542           0 :   output[9] = WRAPLOW(step2[6] - step2[9]);
     543           0 :   output[10] = WRAPLOW(step2[5] - step2[10]);
     544           0 :   output[11] = WRAPLOW(step2[4] - step2[11]);
     545           0 :   output[12] = WRAPLOW(step2[3] - step2[12]);
     546           0 :   output[13] = WRAPLOW(step2[2] - step2[13]);
     547           0 :   output[14] = WRAPLOW(step2[1] - step2[14]);
     548           0 :   output[15] = WRAPLOW(step2[0] - step2[15]);
     549           0 : }
     550             : 
     551           0 : void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,  // 2-D 16x16 inverse DCT of input[0..255]; the rounded result is added into the 16x16 block at dest (rows `stride` bytes apart).
     552             :                              int stride) {
     553             :   int i, j;
     554             :   tran_low_t out[16 * 16];  // row-pass results, row-major
     555           0 :   tran_low_t *outptr = out;
     556             :   tran_low_t temp_in[16], temp_out[16];  // one gathered column and its transform
     557             : 
     558             :   // First transform rows: idct16_c on each group of 16 consecutive coefficients
     559           0 :   for (i = 0; i < 16; ++i) {
     560           0 :     idct16_c(input, outptr);
     561           0 :     input += 16;
     562           0 :     outptr += 16;
     563             :   }
     564             : 
     565             :   // Then transform columns: gather column i of out[], transform it, add it to dest
     566           0 :   for (i = 0; i < 16; ++i) {
     567           0 :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
     568           0 :     idct16_c(temp_in, temp_out);
     569           0 :     for (j = 0; j < 16; ++j) {
     570           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     571           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));  // final rounding shift is 6 for the 16x16 size (5 for 8x8, 4 for 4x4)
     572             :     }
     573             :   }
     574           0 : }
     575             : 
     //              : 16-point 1-D inverse transform (an inverse ADST, going by the name).
     //              : Reads input[0..15] and writes output[0..15]. All 16 inputs are copied into
     //              : the locals x0..x15 (in a permuted order) before output is written.
     //              : x* hold the 16 lanes between stages; s* hold the intermediates of a stage.
     //              : An all-zero input short-circuits to an all-zero output.
     //              : NOTE(review): cospi_N_64, dct_const_round_shift and WRAPLOW are defined
     //              : outside this listing; presumably fixed-point cosine constants, the matching
     //              : rounding shift, and a wrap to the tran_low_t range -- confirm in the headers.
     576           0 : void iadst16_c(const tran_low_t *input, tran_low_t *output) {
     577             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
     578             :   tran_high_t s9, s10, s11, s12, s13, s14, s15;
     579           0 :   tran_high_t x0 = input[15];
     580           0 :   tran_high_t x1 = input[0];
     581           0 :   tran_high_t x2 = input[13];
     582           0 :   tran_high_t x3 = input[2];
     583           0 :   tran_high_t x4 = input[11];
     584           0 :   tran_high_t x5 = input[4];
     585           0 :   tran_high_t x6 = input[9];
     586           0 :   tran_high_t x7 = input[6];
     587           0 :   tran_high_t x8 = input[7];
     588           0 :   tran_high_t x9 = input[8];
     589           0 :   tran_high_t x10 = input[5];
     590           0 :   tran_high_t x11 = input[10];
     591           0 :   tran_high_t x12 = input[3];
     592           0 :   tran_high_t x13 = input[12];
     593           0 :   tran_high_t x14 = input[1];
     594           0 :   tran_high_t x15 = input[14];
     595             : 
     //              :   The bitwise OR of all lanes is zero only when every input is zero.
     596           0 :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
     597           0 :         x13 | x14 | x15)) {
     598           0 :     memset(output, 0, 16 * sizeof(*output));  // zero in, zero out: skip the arithmetic
     599           0 :     return;
     600             :   }
     601             : 
     602             :   // stage 1: rotate the pairs (x0,x1) ... (x14,x15), then add/subtract lane k and lane k+8
     603           0 :   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
     604           0 :   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
     605           0 :   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
     606           0 :   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
     607           0 :   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
     608           0 :   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
     609           0 :   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
     610           0 :   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
     611           0 :   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
     612           0 :   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
     613           0 :   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
     614           0 :   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
     615           0 :   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
     616           0 :   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
     617           0 :   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
     618           0 :   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
     619             : 
     620           0 :   x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
     621           0 :   x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
     622           0 :   x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
     623           0 :   x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
     624           0 :   x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
     625           0 :   x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
     626           0 :   x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
     627           0 :   x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
     628           0 :   x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
     629           0 :   x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
     630           0 :   x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
     631           0 :   x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
     632           0 :   x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
     633           0 :   x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
     634           0 :   x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
     635           0 :   x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
     636             : 
     637             :   // stage 2: lanes 0-7 pass through, pairs in lanes 8-15 are rotated, then add/subtract at distance 4
     638           0 :   s0 = x0;
     639           0 :   s1 = x1;
     640           0 :   s2 = x2;
     641           0 :   s3 = x3;
     642           0 :   s4 = x4;
     643           0 :   s5 = x5;
     644           0 :   s6 = x6;
     645           0 :   s7 = x7;
     646           0 :   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
     647           0 :   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
     648           0 :   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
     649           0 :   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
     650           0 :   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
     651           0 :   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
     652           0 :   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
     653           0 :   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
     654             : 
     //              :   Lanes 0-7 were not multiplied in this stage, so they skip dct_const_round_shift.
     655           0 :   x0 = WRAPLOW(s0 + s4);
     656           0 :   x1 = WRAPLOW(s1 + s5);
     657           0 :   x2 = WRAPLOW(s2 + s6);
     658           0 :   x3 = WRAPLOW(s3 + s7);
     659           0 :   x4 = WRAPLOW(s0 - s4);
     660           0 :   x5 = WRAPLOW(s1 - s5);
     661           0 :   x6 = WRAPLOW(s2 - s6);
     662           0 :   x7 = WRAPLOW(s3 - s7);
     663           0 :   x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
     664           0 :   x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
     665           0 :   x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
     666           0 :   x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
     667           0 :   x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
     668           0 :   x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
     669           0 :   x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
     670           0 :   x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
     671             : 
     672             :   // stage 3: rotate the pairs in lanes 4-7 and 12-15, then add/subtract at distance 2
     673           0 :   s0 = x0;
     674           0 :   s1 = x1;
     675           0 :   s2 = x2;
     676           0 :   s3 = x3;
     677           0 :   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
     678           0 :   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
     679           0 :   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
     680           0 :   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
     681           0 :   s8 = x8;
     682           0 :   s9 = x9;
     683           0 :   s10 = x10;
     684           0 :   s11 = x11;
     685           0 :   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
     686           0 :   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
     687           0 :   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
     688           0 :   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
     689             : 
     690           0 :   x0 = WRAPLOW(s0 + s2);
     691           0 :   x1 = WRAPLOW(s1 + s3);
     692           0 :   x2 = WRAPLOW(s0 - s2);
     693           0 :   x3 = WRAPLOW(s1 - s3);
     694           0 :   x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
     695           0 :   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
     696           0 :   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
     697           0 :   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
     698           0 :   x8 = WRAPLOW(s8 + s10);
     699           0 :   x9 = WRAPLOW(s9 + s11);
     700           0 :   x10 = WRAPLOW(s8 - s10);
     701           0 :   x11 = WRAPLOW(s9 - s11);
     702           0 :   x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
     703           0 :   x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
     704           0 :   x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
     705           0 :   x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
     706             : 
     707             :   // stage 4: scale the sum and difference of pairs (2,3), (6,7), (10,11), (14,15) by cospi_16_64; other lanes are unchanged
     708           0 :   s2 = (-cospi_16_64) * (x2 + x3);
     709           0 :   s3 = cospi_16_64 * (x2 - x3);
     710           0 :   s6 = cospi_16_64 * (x6 + x7);
     711           0 :   s7 = cospi_16_64 * (-x6 + x7);
     712           0 :   s10 = cospi_16_64 * (x10 + x11);
     713           0 :   s11 = cospi_16_64 * (-x10 + x11);
     714           0 :   s14 = (-cospi_16_64) * (x14 + x15);
     715           0 :   s15 = cospi_16_64 * (x14 - x15);
     716             : 
     717           0 :   x2 = WRAPLOW(dct_const_round_shift(s2));
     718           0 :   x3 = WRAPLOW(dct_const_round_shift(s3));
     719           0 :   x6 = WRAPLOW(dct_const_round_shift(s6));
     720           0 :   x7 = WRAPLOW(dct_const_round_shift(s7));
     721           0 :   x10 = WRAPLOW(dct_const_round_shift(s10));
     722           0 :   x11 = WRAPLOW(dct_const_round_shift(s11));
     723           0 :   x14 = WRAPLOW(dct_const_round_shift(s14));
     724           0 :   x15 = WRAPLOW(dct_const_round_shift(s15));
     725             : 
     //              :   Write the lanes out in permuted order; x8, x4, x13 and x1 are negated.
     726           0 :   output[0] = WRAPLOW(x0);
     727           0 :   output[1] = WRAPLOW(-x8);
     728           0 :   output[2] = WRAPLOW(x12);
     729           0 :   output[3] = WRAPLOW(-x4);
     730           0 :   output[4] = WRAPLOW(x6);
     731           0 :   output[5] = WRAPLOW(x14);
     732           0 :   output[6] = WRAPLOW(x10);
     733           0 :   output[7] = WRAPLOW(x2);
     734           0 :   output[8] = WRAPLOW(x3);
     735           0 :   output[9] = WRAPLOW(x11);
     736           0 :   output[10] = WRAPLOW(x15);
     737           0 :   output[11] = WRAPLOW(x7);
     738           0 :   output[12] = WRAPLOW(x5);
     739           0 :   output[13] = WRAPLOW(-x13);
     740           0 :   output[14] = WRAPLOW(x9);
     741           0 :   output[15] = WRAPLOW(-x1);
     742             : }
     743             : 
     //              : 16x16 inverse transform for blocks whose non-zero coefficients all lie in the
     //              : upper-left 4x4 corner. Only input rows 0..3 go through the row pass; out is
     //              : zero-initialised, so rows 4..15 of the intermediate block stay zero.
     //              : The column pass still processes all 16 columns and combines each result,
     //              : reduced with ROUND_POWER_OF_TWO(..., 6), with dest through clip_pixel_add.
     //              : The 4x4 precondition is not checked here; coefficients outside the first
     //              : four rows are never read.
     744           0 : void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
     745             :                             int stride) {
     746             :   int i, j;
     747           0 :   tran_low_t out[16 * 16] = { 0 };  // rows the loop below skips must read as zero
     748           0 :   tran_low_t *outptr = out;
     749             :   tran_low_t temp_in[16], temp_out[16];  // one column in, one column out
     750             : 
     751             :   // First transform rows. Since all non-zero dct coefficients are in
     752             :   // upper-left 4x4 area, we only need to calculate first 4 rows here.
     753           0 :   for (i = 0; i < 4; ++i) {
     754           0 :     idct16_c(input, outptr);
     755           0 :     input += 16;
     756           0 :     outptr += 16;
     757             :   }
     758             : 
     759             :   // Then transform columns and accumulate the result into dest
     760           0 :   for (i = 0; i < 16; ++i) {
     761           0 :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // gather column i
     762           0 :     idct16_c(temp_in, temp_out);
     763           0 :     for (j = 0; j < 16; ++j) {
     764           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
     765           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
     766             :     }
     767             :   }
     768           0 : }
     769             : 
     //              : 16x16 inverse transform shortcut that reads only input[0].
     //              : input[0] is multiplied by cospi_16_64 twice, with dct_const_round_shift and
     //              : WRAPLOW applied after each multiply, then reduced with
     //              : ROUND_POWER_OF_TWO(..., 6). The single resulting value a1 is combined with
     //              : every pixel of the 16x16 dest block through clip_pixel_add.
     //              : NOTE(review): presumably intended for blocks where only the first (DC)
     //              : coefficient is non-zero, the two multiplies standing in for the row and
     //              : column passes -- confirm against the callers.
     770           0 : void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     771             :   int i, j;
     772             :   tran_high_t a1;  // the one value applied to every pixel
     773           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
     774             : 
     775           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
     776           0 :   a1 = ROUND_POWER_OF_TWO(out, 6);
     777           0 :   for (j = 0; j < 16; ++j) {
     778           0 :     for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
     779           0 :     dest += stride;  // step to the next row of the destination block
     780             :   }
     781           0 : }
     782             : 
     783           0 : void idct32_c(const tran_low_t *input, tran_low_t *output) {
     784             :   tran_low_t step1[32], step2[32];
     785             :   tran_high_t temp1, temp2;
     786             : 
     787             :   // stage 1
     788           0 :   step1[0] = input[0];
     789           0 :   step1[1] = input[16];
     790           0 :   step1[2] = input[8];
     791           0 :   step1[3] = input[24];
     792           0 :   step1[4] = input[4];
     793           0 :   step1[5] = input[20];
     794           0 :   step1[6] = input[12];
     795           0 :   step1[7] = input[28];
     796           0 :   step1[8] = input[2];
     797           0 :   step1[9] = input[18];
     798           0 :   step1[10] = input[10];
     799           0 :   step1[11] = input[26];
     800           0 :   step1[12] = input[6];
     801           0 :   step1[13] = input[22];
     802           0 :   step1[14] = input[14];
     803           0 :   step1[15] = input[30];
     804             : 
     805           0 :   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
     806           0 :   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
     807           0 :   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
     808           0 :   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
     809             : 
     810           0 :   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
     811           0 :   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
     812           0 :   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
     813           0 :   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
     814             : 
     815           0 :   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
     816           0 :   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
     817           0 :   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
     818           0 :   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
     819             : 
     820           0 :   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
     821           0 :   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
     822           0 :   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
     823           0 :   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
     824             : 
     825           0 :   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
     826           0 :   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
     827           0 :   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
     828           0 :   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
     829             : 
     830           0 :   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
     831           0 :   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
     832           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
     833           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
     834             : 
     835           0 :   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
     836           0 :   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
     837           0 :   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
     838           0 :   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
     839             : 
     840           0 :   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
     841           0 :   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
     842           0 :   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
     843           0 :   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
     844             : 
     845             :   // stage 2
     846           0 :   step2[0] = step1[0];
     847           0 :   step2[1] = step1[1];
     848           0 :   step2[2] = step1[2];
     849           0 :   step2[3] = step1[3];
     850           0 :   step2[4] = step1[4];
     851           0 :   step2[5] = step1[5];
     852           0 :   step2[6] = step1[6];
     853           0 :   step2[7] = step1[7];
     854             : 
     855           0 :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
     856           0 :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
     857           0 :   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
     858           0 :   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
     859             : 
     860           0 :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
     861           0 :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
     862           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     863           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     864             : 
     865           0 :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
     866           0 :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
     867           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     868           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     869             : 
     870           0 :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
     871           0 :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
     872           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
     873           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
     874             : 
     875           0 :   step2[16] = WRAPLOW(step1[16] + step1[17]);
     876           0 :   step2[17] = WRAPLOW(step1[16] - step1[17]);
     877           0 :   step2[18] = WRAPLOW(-step1[18] + step1[19]);
     878           0 :   step2[19] = WRAPLOW(step1[18] + step1[19]);
     879           0 :   step2[20] = WRAPLOW(step1[20] + step1[21]);
     880           0 :   step2[21] = WRAPLOW(step1[20] - step1[21]);
     881           0 :   step2[22] = WRAPLOW(-step1[22] + step1[23]);
     882           0 :   step2[23] = WRAPLOW(step1[22] + step1[23]);
     883           0 :   step2[24] = WRAPLOW(step1[24] + step1[25]);
     884           0 :   step2[25] = WRAPLOW(step1[24] - step1[25]);
     885           0 :   step2[26] = WRAPLOW(-step1[26] + step1[27]);
     886           0 :   step2[27] = WRAPLOW(step1[26] + step1[27]);
     887           0 :   step2[28] = WRAPLOW(step1[28] + step1[29]);
     888           0 :   step2[29] = WRAPLOW(step1[28] - step1[29]);
     889           0 :   step2[30] = WRAPLOW(-step1[30] + step1[31]);
     890           0 :   step2[31] = WRAPLOW(step1[30] + step1[31]);
     891             : 
     892             :   // stage 3
     893           0 :   step1[0] = step2[0];
     894           0 :   step1[1] = step2[1];
     895           0 :   step1[2] = step2[2];
     896           0 :   step1[3] = step2[3];
     897             : 
     898           0 :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
     899           0 :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
     900           0 :   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
     901           0 :   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
     902           0 :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
     903           0 :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
     904           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     905           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     906             : 
     907           0 :   step1[8] = WRAPLOW(step2[8] + step2[9]);
     908           0 :   step1[9] = WRAPLOW(step2[8] - step2[9]);
     909           0 :   step1[10] = WRAPLOW(-step2[10] + step2[11]);
     910           0 :   step1[11] = WRAPLOW(step2[10] + step2[11]);
     911           0 :   step1[12] = WRAPLOW(step2[12] + step2[13]);
     912           0 :   step1[13] = WRAPLOW(step2[12] - step2[13]);
     913           0 :   step1[14] = WRAPLOW(-step2[14] + step2[15]);
     914           0 :   step1[15] = WRAPLOW(step2[14] + step2[15]);
     915             : 
     916           0 :   step1[16] = step2[16];
     917           0 :   step1[31] = step2[31];
     918           0 :   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
     919           0 :   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
     920           0 :   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
     921           0 :   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
     922           0 :   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
     923           0 :   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
     924           0 :   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
     925           0 :   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
     926           0 :   step1[19] = step2[19];
     927           0 :   step1[20] = step2[20];
     928           0 :   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
     929           0 :   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
     930           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
     931           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
     932           0 :   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
     933           0 :   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
     934           0 :   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
     935           0 :   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
     936           0 :   step1[23] = step2[23];
     937           0 :   step1[24] = step2[24];
     938           0 :   step1[27] = step2[27];
     939           0 :   step1[28] = step2[28];
     940             : 
     941             :   // stage 4
     942           0 :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
     943           0 :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
     944           0 :   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
     945           0 :   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
     946           0 :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
     947           0 :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
     948           0 :   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
     949           0 :   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
     950           0 :   step2[4] = WRAPLOW(step1[4] + step1[5]);
     951           0 :   step2[5] = WRAPLOW(step1[4] - step1[5]);
     952           0 :   step2[6] = WRAPLOW(-step1[6] + step1[7]);
     953           0 :   step2[7] = WRAPLOW(step1[6] + step1[7]);
     954             : 
     955           0 :   step2[8] = step1[8];
     956           0 :   step2[15] = step1[15];
     957           0 :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
     958           0 :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
     959           0 :   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
     960           0 :   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
     961           0 :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
     962           0 :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
     963           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
     964           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
     965           0 :   step2[11] = step1[11];
     966           0 :   step2[12] = step1[12];
     967             : 
     968           0 :   step2[16] = WRAPLOW(step1[16] + step1[19]);
     969           0 :   step2[17] = WRAPLOW(step1[17] + step1[18]);
     970           0 :   step2[18] = WRAPLOW(step1[17] - step1[18]);
     971           0 :   step2[19] = WRAPLOW(step1[16] - step1[19]);
     972           0 :   step2[20] = WRAPLOW(-step1[20] + step1[23]);
     973           0 :   step2[21] = WRAPLOW(-step1[21] + step1[22]);
     974           0 :   step2[22] = WRAPLOW(step1[21] + step1[22]);
     975           0 :   step2[23] = WRAPLOW(step1[20] + step1[23]);
     976             : 
     977           0 :   step2[24] = WRAPLOW(step1[24] + step1[27]);
     978           0 :   step2[25] = WRAPLOW(step1[25] + step1[26]);
     979           0 :   step2[26] = WRAPLOW(step1[25] - step1[26]);
     980           0 :   step2[27] = WRAPLOW(step1[24] - step1[27]);
     981           0 :   step2[28] = WRAPLOW(-step1[28] + step1[31]);
     982           0 :   step2[29] = WRAPLOW(-step1[29] + step1[30]);
     983           0 :   step2[30] = WRAPLOW(step1[29] + step1[30]);
     984           0 :   step2[31] = WRAPLOW(step1[28] + step1[31]);
     985             : 
     986             :   // stage 5
     987           0 :   step1[0] = WRAPLOW(step2[0] + step2[3]);
     988           0 :   step1[1] = WRAPLOW(step2[1] + step2[2]);
     989           0 :   step1[2] = WRAPLOW(step2[1] - step2[2]);
     990           0 :   step1[3] = WRAPLOW(step2[0] - step2[3]);
     991           0 :   step1[4] = step2[4];
     992           0 :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
     993           0 :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
     994           0 :   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
     995           0 :   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
     996           0 :   step1[7] = step2[7];
     997             : 
     998           0 :   step1[8] = WRAPLOW(step2[8] + step2[11]);
     999           0 :   step1[9] = WRAPLOW(step2[9] + step2[10]);
    1000           0 :   step1[10] = WRAPLOW(step2[9] - step2[10]);
    1001           0 :   step1[11] = WRAPLOW(step2[8] - step2[11]);
    1002           0 :   step1[12] = WRAPLOW(-step2[12] + step2[15]);
    1003           0 :   step1[13] = WRAPLOW(-step2[13] + step2[14]);
    1004           0 :   step1[14] = WRAPLOW(step2[13] + step2[14]);
    1005           0 :   step1[15] = WRAPLOW(step2[12] + step2[15]);
    1006             : 
    1007           0 :   step1[16] = step2[16];
    1008           0 :   step1[17] = step2[17];
    1009           0 :   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
    1010           0 :   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
    1011           0 :   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
    1012           0 :   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
    1013           0 :   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
    1014           0 :   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
    1015           0 :   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
    1016           0 :   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
    1017           0 :   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
    1018           0 :   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
    1019           0 :   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
    1020           0 :   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
    1021           0 :   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
    1022           0 :   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
    1023           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
    1024           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
    1025           0 :   step1[22] = step2[22];
    1026           0 :   step1[23] = step2[23];
    1027           0 :   step1[24] = step2[24];
    1028           0 :   step1[25] = step2[25];
    1029           0 :   step1[30] = step2[30];
    1030           0 :   step1[31] = step2[31];
    1031             : 
    1032             :   // stage 6
    1033           0 :   step2[0] = WRAPLOW(step1[0] + step1[7]);
    1034           0 :   step2[1] = WRAPLOW(step1[1] + step1[6]);
    1035           0 :   step2[2] = WRAPLOW(step1[2] + step1[5]);
    1036           0 :   step2[3] = WRAPLOW(step1[3] + step1[4]);
    1037           0 :   step2[4] = WRAPLOW(step1[3] - step1[4]);
    1038           0 :   step2[5] = WRAPLOW(step1[2] - step1[5]);
    1039           0 :   step2[6] = WRAPLOW(step1[1] - step1[6]);
    1040           0 :   step2[7] = WRAPLOW(step1[0] - step1[7]);
    1041           0 :   step2[8] = step1[8];
    1042           0 :   step2[9] = step1[9];
    1043           0 :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    1044           0 :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    1045           0 :   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
    1046           0 :   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
    1047           0 :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    1048           0 :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    1049           0 :   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
    1050           0 :   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
    1051           0 :   step2[14] = step1[14];
    1052           0 :   step2[15] = step1[15];
    1053             : 
    1054           0 :   step2[16] = WRAPLOW(step1[16] + step1[23]);
    1055           0 :   step2[17] = WRAPLOW(step1[17] + step1[22]);
    1056           0 :   step2[18] = WRAPLOW(step1[18] + step1[21]);
    1057           0 :   step2[19] = WRAPLOW(step1[19] + step1[20]);
    1058           0 :   step2[20] = WRAPLOW(step1[19] - step1[20]);
    1059           0 :   step2[21] = WRAPLOW(step1[18] - step1[21]);
    1060           0 :   step2[22] = WRAPLOW(step1[17] - step1[22]);
    1061           0 :   step2[23] = WRAPLOW(step1[16] - step1[23]);
    1062             : 
    1063           0 :   step2[24] = WRAPLOW(-step1[24] + step1[31]);
    1064           0 :   step2[25] = WRAPLOW(-step1[25] + step1[30]);
    1065           0 :   step2[26] = WRAPLOW(-step1[26] + step1[29]);
    1066           0 :   step2[27] = WRAPLOW(-step1[27] + step1[28]);
    1067           0 :   step2[28] = WRAPLOW(step1[27] + step1[28]);
    1068           0 :   step2[29] = WRAPLOW(step1[26] + step1[29]);
    1069           0 :   step2[30] = WRAPLOW(step1[25] + step1[30]);
    1070           0 :   step2[31] = WRAPLOW(step1[24] + step1[31]);
    1071             : 
    1072             :   // stage 7
    1073           0 :   step1[0] = WRAPLOW(step2[0] + step2[15]);
    1074           0 :   step1[1] = WRAPLOW(step2[1] + step2[14]);
    1075           0 :   step1[2] = WRAPLOW(step2[2] + step2[13]);
    1076           0 :   step1[3] = WRAPLOW(step2[3] + step2[12]);
    1077           0 :   step1[4] = WRAPLOW(step2[4] + step2[11]);
    1078           0 :   step1[5] = WRAPLOW(step2[5] + step2[10]);
    1079           0 :   step1[6] = WRAPLOW(step2[6] + step2[9]);
    1080           0 :   step1[7] = WRAPLOW(step2[7] + step2[8]);
    1081           0 :   step1[8] = WRAPLOW(step2[7] - step2[8]);
    1082           0 :   step1[9] = WRAPLOW(step2[6] - step2[9]);
    1083           0 :   step1[10] = WRAPLOW(step2[5] - step2[10]);
    1084           0 :   step1[11] = WRAPLOW(step2[4] - step2[11]);
    1085           0 :   step1[12] = WRAPLOW(step2[3] - step2[12]);
    1086           0 :   step1[13] = WRAPLOW(step2[2] - step2[13]);
    1087           0 :   step1[14] = WRAPLOW(step2[1] - step2[14]);
    1088           0 :   step1[15] = WRAPLOW(step2[0] - step2[15]);
    1089             : 
    1090           0 :   step1[16] = step2[16];
    1091           0 :   step1[17] = step2[17];
    1092           0 :   step1[18] = step2[18];
    1093           0 :   step1[19] = step2[19];
    1094           0 :   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
    1095           0 :   temp2 = (step2[20] + step2[27]) * cospi_16_64;
    1096           0 :   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
    1097           0 :   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
    1098           0 :   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
    1099           0 :   temp2 = (step2[21] + step2[26]) * cospi_16_64;
    1100           0 :   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
    1101           0 :   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
    1102           0 :   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
    1103           0 :   temp2 = (step2[22] + step2[25]) * cospi_16_64;
    1104           0 :   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
    1105           0 :   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
    1106           0 :   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
    1107           0 :   temp2 = (step2[23] + step2[24]) * cospi_16_64;
    1108           0 :   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
    1109           0 :   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
    1110           0 :   step1[28] = step2[28];
    1111           0 :   step1[29] = step2[29];
    1112           0 :   step1[30] = step2[30];
    1113           0 :   step1[31] = step2[31];
    1114             : 
    1115             :   // final stage
    1116           0 :   output[0] = WRAPLOW(step1[0] + step1[31]);
    1117           0 :   output[1] = WRAPLOW(step1[1] + step1[30]);
    1118           0 :   output[2] = WRAPLOW(step1[2] + step1[29]);
    1119           0 :   output[3] = WRAPLOW(step1[3] + step1[28]);
    1120           0 :   output[4] = WRAPLOW(step1[4] + step1[27]);
    1121           0 :   output[5] = WRAPLOW(step1[5] + step1[26]);
    1122           0 :   output[6] = WRAPLOW(step1[6] + step1[25]);
    1123           0 :   output[7] = WRAPLOW(step1[7] + step1[24]);
    1124           0 :   output[8] = WRAPLOW(step1[8] + step1[23]);
    1125           0 :   output[9] = WRAPLOW(step1[9] + step1[22]);
    1126           0 :   output[10] = WRAPLOW(step1[10] + step1[21]);
    1127           0 :   output[11] = WRAPLOW(step1[11] + step1[20]);
    1128           0 :   output[12] = WRAPLOW(step1[12] + step1[19]);
    1129           0 :   output[13] = WRAPLOW(step1[13] + step1[18]);
    1130           0 :   output[14] = WRAPLOW(step1[14] + step1[17]);
    1131           0 :   output[15] = WRAPLOW(step1[15] + step1[16]);
    1132           0 :   output[16] = WRAPLOW(step1[15] - step1[16]);
    1133           0 :   output[17] = WRAPLOW(step1[14] - step1[17]);
    1134           0 :   output[18] = WRAPLOW(step1[13] - step1[18]);
    1135           0 :   output[19] = WRAPLOW(step1[12] - step1[19]);
    1136           0 :   output[20] = WRAPLOW(step1[11] - step1[20]);
    1137           0 :   output[21] = WRAPLOW(step1[10] - step1[21]);
    1138           0 :   output[22] = WRAPLOW(step1[9] - step1[22]);
    1139           0 :   output[23] = WRAPLOW(step1[8] - step1[23]);
    1140           0 :   output[24] = WRAPLOW(step1[7] - step1[24]);
    1141           0 :   output[25] = WRAPLOW(step1[6] - step1[25]);
    1142           0 :   output[26] = WRAPLOW(step1[5] - step1[26]);
    1143           0 :   output[27] = WRAPLOW(step1[4] - step1[27]);
    1144           0 :   output[28] = WRAPLOW(step1[3] - step1[28]);
    1145           0 :   output[29] = WRAPLOW(step1[2] - step1[29]);
    1146           0 :   output[30] = WRAPLOW(step1[1] - step1[30]);
    1147           0 :   output[31] = WRAPLOW(step1[0] - step1[31]);
    1148           0 : }
    1149             : // 32x32 inverse transform using all 1024 coefficients: idct32_c over the 32 rows of input, then over the 32 columns of the result; each column output is rounded by 6 bits and combined into dest (row pitch = stride) through clip_pixel_add.
    1150           0 : void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
    1151             :                               int stride) {
    1152             :   int i, j;
    1153             :   tran_low_t out[32 * 32];  // row-pass results, 32 values per row
    1154           0 :   tran_low_t *outptr = out;
    1155             :   tran_low_t temp_in[32], temp_out[32];
    1156             :   // The 32 coefficients of a row are OR-ed together pairwise (32 -> 16 -> 8 -> 4 -> 2); a row with no bit set is cleared instead of transformed.
    1157             :   // Rows
    1158           0 :   for (i = 0; i < 32; ++i) {
    1159             :     tran_low_t zero_coeff[16];  // same type as input, so no set bit is dropped by narrowing when the coefficients are OR-ed
    1160           0 :     for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    1161           0 :     for (j = 0; j < 8; ++j)
    1162           0 :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    1163           0 :     for (j = 0; j < 4; ++j)
    1164           0 :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    1165           0 :     for (j = 0; j < 2; ++j)
    1166           0 :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    1167             : 
    1168           0 :     if (zero_coeff[0] | zero_coeff[1])  // at least one coefficient in this row is non-zero
    1169           0 :       idct32_c(input, outptr);
    1170             :     else
    1171           0 :       memset(outptr, 0, sizeof(tran_low_t) * 32);
    1172           0 :     input += 32;
    1173           0 :     outptr += 32;
    1174             :   }
    1175             : 
    1176             :   // Columns: gather column i of out, transform it, and accumulate the rounded result into column i of dest
    1177           0 :   for (i = 0; i < 32; ++i) {
    1178           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    1179           0 :     idct32_c(temp_in, temp_out);
    1180           0 :     for (j = 0; j < 32; ++j) {
    1181           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    1182           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    1183             :     }
    1184             :   }
    1185           0 : }
    1186             : // 32x32 inverse transform that runs the row pass on the first 16 rows only; all 32 columns are then transformed, rounded by 6 bits and combined into dest (row pitch = stride) through clip_pixel_add.
    1187           0 : void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
    1188             :                              int stride) {
    1189             :   int i, j;
    1190           0 :   tran_low_t out[32 * 32] = { 0 };  // zero-filled: rows 16..31 are never written by the row pass
    1191           0 :   tran_low_t *outptr = out;
    1192             :   tran_low_t temp_in[32], temp_out[32];
    1193             : 
    1194             :   // Rows
    1195             :   // Only upper-left 16x16 has non-zero coeff
    1196           0 :   for (i = 0; i < 16; ++i) {
    1197           0 :     idct32_c(input, outptr);
    1198           0 :     input += 32;  // next input row
    1199           0 :     outptr += 32;  // next row of the intermediate
    1200             :   }
    1201             : 
    1202             :   // Columns: gather column i of out, transform it, and accumulate the rounded result into column i of dest
    1203           0 :   for (i = 0; i < 32; ++i) {
    1204           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    1205           0 :     idct32_c(temp_in, temp_out);
    1206           0 :     for (j = 0; j < 32; ++j) {
    1207           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    1208           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    1209             :     }
    1210             :   }
    1211           0 : }
    1212             : // 32x32 inverse transform that runs the row pass on the first 8 rows only; all 32 columns are then transformed, rounded by 6 bits and combined into dest (row pitch = stride) through clip_pixel_add.
    1213           0 : void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
    1214             :                             int stride) {
    1215             :   int i, j;
    1216           0 :   tran_low_t out[32 * 32] = { 0 };  // zero-filled: rows 8..31 are never written by the row pass
    1217           0 :   tran_low_t *outptr = out;
    1218             :   tran_low_t temp_in[32], temp_out[32];
    1219             : 
    1220             :   // Rows
    1221             :   // Only upper-left 8x8 has non-zero coeff
    1222           0 :   for (i = 0; i < 8; ++i) {
    1223           0 :     idct32_c(input, outptr);
    1224           0 :     input += 32;  // next input row
    1225           0 :     outptr += 32;  // next row of the intermediate
    1226             :   }
    1227             : 
    1228             :   // Columns: gather column i of out, transform it, and accumulate the rounded result into column i of dest
    1229           0 :   for (i = 0; i < 32; ++i) {
    1230           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    1231           0 :     idct32_c(temp_in, temp_out);
    1232           0 :     for (j = 0; j < 32; ++j) {
    1233           0 :       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
    1234           0 :                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
    1235             :     }
    1236             :   }
    1237           0 : }
    1238             : // 32x32 variant that reads only input[0]: a single value a1 is derived from it and combined into every sample of the 32x32 area of dest (row pitch = stride) through clip_pixel_add.
    1239           0 : void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    1240             :   int i, j;
    1241             :   tran_high_t a1;
    1242           0 :   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));  // first cospi_16_64 scaling
    1243             : 
    1244           0 :   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));  // second cospi_16_64 scaling
    1245           0 :   a1 = ROUND_POWER_OF_TWO(out, 6);  // same 6-bit rounding the other 32x32 variants apply per sample
    1246             : 
    1247           0 :   for (j = 0; j < 32; ++j) {
    1248           0 :     for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    1249           0 :     dest += stride;  // next destination row
    1250             :   }
    1251           0 : }
    1252             : 
    1253             : #if CONFIG_VP9_HIGHBITDEPTH
    1254             : 
    1255             : // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
    1256             : // transform amplify bits + 1 bit for contingency in rounding and quantizing
    1257             : #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
    1258             : // Returns 1 if any of the first `size` entries of input has magnitude >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE, otherwise 0. Both bounds are compared directly instead of through abs(), whose result is undefined for the most negative int and would let that value pass as valid.
    1259             : static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
    1260             :                                               int size) {
    1261             :   int i;
    1262             :   for (i = 0; i < size; ++i)
    1263             :     if (input[i] >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE || input[i] <= -HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
    1264             :   return 0;
    1265             : }
    1266             : // High-bitdepth 4x4 inverse Walsh-Hadamard transform: a row pass over input into a temporary, then a column pass whose results are combined into the uint16_t destination through highbd_clip_pixel_add.
    1267             : void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    1268             :                                  int stride, int bd) {
    1269             :   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    1270             :      0.5 shifts per pixel. */
    1271             :   int i;
    1272             :   tran_low_t output[16];
    1273             :   tran_high_t a1, b1, c1, d1, e1;
    1274             :   const tran_low_t *ip = input;
    1275             :   tran_low_t *op = output;
    1276             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1277             :   // Row pass: each group of 4 consecutive inputs is shifted right by UNIT_QUANT_SHIFT and transformed into output.
    1278             :   for (i = 0; i < 4; i++) {
    1279             :     a1 = ip[0] >> UNIT_QUANT_SHIFT;
    1280             :     c1 = ip[1] >> UNIT_QUANT_SHIFT;
    1281             :     d1 = ip[2] >> UNIT_QUANT_SHIFT;
    1282             :     b1 = ip[3] >> UNIT_QUANT_SHIFT;
    1283             :     a1 += c1;
    1284             :     d1 -= b1;
    1285             :     e1 = (a1 - d1) >> 1;
    1286             :     b1 = e1 - b1;
    1287             :     c1 = e1 - c1;
    1288             :     a1 -= b1;
    1289             :     d1 += c1;
    1290             :     op[0] = HIGHBD_WRAPLOW(a1, bd);
    1291             :     op[1] = HIGHBD_WRAPLOW(b1, bd);
    1292             :     op[2] = HIGHBD_WRAPLOW(c1, bd);
    1293             :     op[3] = HIGHBD_WRAPLOW(d1, bd);
    1294             :     ip += 4;
    1295             :     op += 4;
    1296             :   }
    1297             :   // Column pass over the intermediate (elements 4 apart); stride is applied to the uint16_t pointer, i.e. counted in uint16_t elements.
    1298             :   ip = output;
    1299             :   for (i = 0; i < 4; i++) {
    1300             :     a1 = ip[4 * 0];
    1301             :     c1 = ip[4 * 1];
    1302             :     d1 = ip[4 * 2];
    1303             :     b1 = ip[4 * 3];
    1304             :     a1 += c1;
    1305             :     d1 -= b1;
    1306             :     e1 = (a1 - d1) >> 1;
    1307             :     b1 = e1 - b1;
    1308             :     c1 = e1 - c1;
    1309             :     a1 -= b1;
    1310             :     d1 += c1;
    1311             :     dest[stride * 0] =
    1312             :         highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    1313             :     dest[stride * 1] =
    1314             :         highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    1315             :     dest[stride * 2] =
    1316             :         highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    1317             :     dest[stride * 3] =
    1318             :         highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
    1319             : 
    1320             :     ip++;  // next column of the intermediate
    1321             :     dest++;  // next destination column
    1322             :   }
    1323             : }
    1324             : // High-bitdepth 4x4 inverse Walsh-Hadamard variant that reads only in[0]: the value is split into a1/e1 once, and each of the four resulting values is split again and combined into one column of dest through highbd_clip_pixel_add.
    1325             : void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
    1326             :                                 int stride, int bd) {
    1327             :   int i;
    1328             :   tran_high_t a1, e1;
    1329             :   tran_low_t tmp[4];
    1330             :   const tran_low_t *ip = in;
    1331             :   tran_low_t *op = tmp;
    1332             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1333             :   (void)bd;
    1334             :   // First split: tmp[0] receives a1 - e1, tmp[1..3] all receive e1.
    1335             :   a1 = ip[0] >> UNIT_QUANT_SHIFT;
    1336             :   e1 = a1 >> 1;
    1337             :   a1 -= e1;
    1338             :   op[0] = HIGHBD_WRAPLOW(a1, bd);
    1339             :   op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
    1340             :   // Second split, once per destination column: row 0 gets tmp[i] - (tmp[i] >> 1), rows 1..3 get tmp[i] >> 1.
    1341             :   ip = tmp;
    1342             :   for (i = 0; i < 4; i++) {
    1343             :     e1 = ip[0] >> 1;
    1344             :     a1 = ip[0] - e1;
    1345             :     dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    1346             :     dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
    1347             :     dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
    1348             :     dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
    1349             :     ip++;
    1350             :     dest++;
    1351             :   }
    1352             : }
    1353             : // High-bitdepth 4-point 1-D inverse transform of input[0..3] into output[0..3]. If detect_invalid_highbd_input flags the input, output is zeroed (after an assert when CONFIG_COEFFICIENT_RANGE_CHECKING is set).
    1354             : void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
    1355             :   tran_low_t step[4];
    1356             :   tran_high_t temp1, temp2;
    1357             :   (void)bd;
    1358             : 
    1359             :   if (detect_invalid_highbd_input(input, 4)) {
    1360             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1361             :     assert(0 && "invalid highbd txfm input");
    1362             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1363             :     memset(output, 0, sizeof(*output) * 4);
    1364             :     return;
    1365             :   }
    1366             :   // All four inputs are consumed into the local step[] in stage 1 before output is written in stage 2, so input and output may be the same array.
    1367             :   // stage 1
    1368             :   temp1 = (input[0] + input[2]) * cospi_16_64;
    1369             :   temp2 = (input[0] - input[2]) * cospi_16_64;
    1370             :   step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1371             :   step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1372             :   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
    1373             :   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
    1374             :   step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1375             :   step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1376             : 
    1377             :   // stage 2: sums and differences of the stage-1 pairs (0,3) and (1,2)
    1378             :   output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
    1379             :   output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
    1380             :   output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
    1381             :   output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
    1382             : }
    1383             : // High-bitdepth 4x4 inverse transform: vpx_highbd_idct4_c over the 4 rows of input, then over the 4 columns of the result; each column output is rounded by 4 bits and combined into the uint16_t destination through highbd_clip_pixel_add.
    1384             : void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    1385             :                                  int stride, int bd) {
    1386             :   int i, j;
    1387             :   tran_low_t out[4 * 4];  // row-pass results, 4 values per row
    1388             :   tran_low_t *outptr = out;
    1389             :   tran_low_t temp_in[4], temp_out[4];
    1390             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1391             : 
    1392             :   // Rows
    1393             :   for (i = 0; i < 4; ++i) {
    1394             :     vpx_highbd_idct4_c(input, outptr, bd);
    1395             :     input += 4;
    1396             :     outptr += 4;
    1397             :   }
    1398             : 
    1399             :   // Columns: gather column i of out, transform it, and accumulate the rounded result into column i of dest
    1400             :   for (i = 0; i < 4; ++i) {
    1401             :     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    1402             :     vpx_highbd_idct4_c(temp_in, temp_out, bd);
    1403             :     for (j = 0; j < 4; ++j) {
    1404             :       dest[j * stride + i] = highbd_clip_pixel_add(
    1405             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    1406             :     }
    1407             :   }
    1408             : }
    1409             : // High-bitdepth 4x4 variant that reads only input[0]: a single value a1 is derived from it and combined into all 16 destination samples through highbd_clip_pixel_add.
    1410             : void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
    1411             :                                 int stride, int bd) {
    1412             :   int i;
    1413             :   tran_high_t a1;
    1414             :   tran_low_t out =
    1415             :       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
    1416             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1417             : 
    1418             :   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // second cospi_16_64 scaling
    1419             :   a1 = ROUND_POWER_OF_TWO(out, 4);  // same 4-bit rounding as vpx_highbd_idct4x4_16_add_c applies per sample
    1420             : 
    1421             :   for (i = 0; i < 4; i++) {
    1422             :     dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
    1423             :     dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
    1424             :     dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
    1425             :     dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
    1426             :     dest += stride;  // next destination row
    1427             :   }
    1428             : }
    1429             : // High-bitdepth 8-point 1-D inverse transform of input[0..7] into output[0..7]. If detect_invalid_highbd_input flags the input, output is zeroed (after an assert when CONFIG_COEFFICIENT_RANGE_CHECKING is set).
    1430             : void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
    1431             :   tran_low_t step1[8], step2[8];
    1432             :   tran_high_t temp1, temp2;
    1433             : 
    1434             :   if (detect_invalid_highbd_input(input, 8)) {
    1435             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1436             :     assert(0 && "invalid highbd txfm input");
    1437             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1438             :     memset(output, 0, sizeof(*output) * 8);
    1439             :     return;
    1440             :   }
    1441             : 
    1442             :   // stage 1: even-indexed inputs are copied into step1[0..3]; odd-indexed inputs are rotated into step1[4..7]
    1443             :   step1[0] = input[0];
    1444             :   step1[2] = input[4];
    1445             :   step1[1] = input[2];
    1446             :   step1[3] = input[6];
    1447             :   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
    1448             :   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
    1449             :   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1450             :   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1451             :   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
    1452             :   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
    1453             :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1454             :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1455             : 
    1456             :   // stage 2 & stage 3 - even half
    1457             :   vpx_highbd_idct4_c(step1, step1, bd);  // in place on step1[0..3]
    1458             : 
    1459             :   // stage 2 - odd half
    1460             :   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
    1461             :   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
    1462             :   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
    1463             :   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
    1464             : 
    1465             :   // stage 3 - odd half
    1466             :   step1[4] = step2[4];
    1467             :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    1468             :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    1469             :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1470             :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1471             :   step1[7] = step2[7];
    1472             : 
    1473             :   // stage 4: combine even half (step1[0..3]) with odd half (step1[4..7]) as sums, then differences in mirrored order
    1474             :   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
    1475             :   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
    1476             :   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
    1477             :   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
    1478             :   output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
    1479             :   output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
    1480             :   output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
    1481             :   output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
    1482             : }
    1483             : // High-bitdepth 8x8 inverse transform using all 64 coefficients: vpx_highbd_idct8_c over the 8 rows of input, then over the 8 columns of the result; each column output is rounded by 5 bits and combined into the uint16_t destination through highbd_clip_pixel_add.
    1484             : void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
    1485             :                                  int stride, int bd) {
    1486             :   int i, j;
    1487             :   tran_low_t out[8 * 8];  // row-pass results, 8 values per row
    1488             :   tran_low_t *outptr = out;
    1489             :   tran_low_t temp_in[8], temp_out[8];
    1490             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1491             : 
    1492             :   // First transform rows
    1493             :   for (i = 0; i < 8; ++i) {
    1494             :     vpx_highbd_idct8_c(input, outptr, bd);
    1495             :     input += 8;
    1496             :     outptr += 8;
    1497             :   }
    1498             : 
    1499             :   // Then transform columns: gather column i of out, transform it, and accumulate the rounded result into column i of dest
    1500             :   for (i = 0; i < 8; ++i) {
    1501             :     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    1502             :     vpx_highbd_idct8_c(temp_in, temp_out, bd);
    1503             :     for (j = 0; j < 8; ++j) {
    1504             :       dest[j * stride + i] = highbd_clip_pixel_add(
    1505             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    1506             :     }
    1507             :   }
    1508             : }
    1509             : // High-bitdepth 8x8 variant that reads only input[0]: a single value a1 is derived from it and combined into all 64 destination samples through highbd_clip_pixel_add.
    1510             : void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
    1511             :                                 int stride, int bd) {
    1512             :   int i, j;
    1513             :   tran_high_t a1;
    1514             :   tran_low_t out =
    1515             :       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
    1516             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1517             : 
    1518             :   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // second cospi_16_64 scaling
    1519             :   a1 = ROUND_POWER_OF_TWO(out, 5);  // same 5-bit rounding as vpx_highbd_idct8x8_64_add_c applies per sample
    1520             :   for (j = 0; j < 8; ++j) {
    1521             :     for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    1522             :     dest += stride;  // next destination row
    1523             :   }
    1524             : }
    1525             : // High-bitdepth 4-point 1-D inverse transform built on the sinpi_*_9 constants, from input[0..3] into output[0..3]. Output is zeroed when detect_invalid_highbd_input flags the input or when all four inputs are zero.
    1526             : void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
    1527             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    1528             :   tran_low_t x0 = input[0];
    1529             :   tran_low_t x1 = input[1];
    1530             :   tran_low_t x2 = input[2];
    1531             :   tran_low_t x3 = input[3];
    1532             :   (void)bd;
    1533             : 
    1534             :   if (detect_invalid_highbd_input(input, 4)) {
    1535             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1536             :     assert(0 && "invalid highbd txfm input");
    1537             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1538             :     memset(output, 0, sizeof(*output) * 4);
    1539             :     return;
    1540             :   }
    1541             : 
    1542             :   if (!(x0 | x1 | x2 | x3)) {  // all-zero input: output is all zero, skip the arithmetic
    1543             :     memset(output, 0, 4 * sizeof(*output));
    1544             :     return;
    1545             :   }
    1546             :   // Products of the inputs with the sinpi constants; s7 is the wrapped combination x0 - x2 + x3.
    1547             :   s0 = sinpi_1_9 * x0;
    1548             :   s1 = sinpi_2_9 * x0;
    1549             :   s2 = sinpi_3_9 * x1;
    1550             :   s3 = sinpi_4_9 * x2;
    1551             :   s4 = sinpi_1_9 * x2;
    1552             :   s5 = sinpi_2_9 * x3;
    1553             :   s6 = sinpi_4_9 * x3;
    1554             :   s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
    1555             : 
    1556             :   s0 = s0 + s3 + s5;
    1557             :   s1 = s1 - s4 - s6;
    1558             :   s3 = s2;  // s3 now carries sinpi_3_9 * x1; s2 is reused on the next line
    1559             :   s2 = sinpi_3_9 * s7;
    1560             : 
    1561             :   // 1-D transform scaling factor is sqrt(2).
    1562             :   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
    1563             :   // + 1b (addition) = 29b.
    1564             :   // Hence the output bit depth is 15b.
    1565             :   output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
    1566             :   output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
    1567             :   output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
    1568             :   output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
    1569             : }
    1570             : // High-bitdepth 8-point 1-D inverse transform computed in three rotation/butterfly stages, from input[0..7] into output[0..7]. Output is zeroed when detect_invalid_highbd_input flags the input or when all eight inputs are zero.
    1571             : void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
    1572             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    1573             :   tran_low_t x0 = input[7];  // inputs are loaded in permuted order
    1574             :   tran_low_t x1 = input[0];
    1575             :   tran_low_t x2 = input[5];
    1576             :   tran_low_t x3 = input[2];
    1577             :   tran_low_t x4 = input[3];
    1578             :   tran_low_t x5 = input[4];
    1579             :   tran_low_t x6 = input[1];
    1580             :   tran_low_t x7 = input[6];
    1581             :   (void)bd;
    1582             : 
    1583             :   if (detect_invalid_highbd_input(input, 8)) {
    1584             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1585             :     assert(0 && "invalid highbd txfm input");
    1586             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1587             :     memset(output, 0, sizeof(*output) * 8);
    1588             :     return;
    1589             :   }
    1590             : 
    1591             :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // all-zero input: output is all zero, skip the arithmetic
    1592             :     memset(output, 0, 8 * sizeof(*output));
    1593             :     return;
    1594             :   }
    1595             : 
    1596             :   // stage 1: rotate the pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7), then take sums/differences of s[k] and s[k+4]
    1597             :   s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
    1598             :   s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
    1599             :   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    1600             :   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    1601             :   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    1602             :   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    1603             :   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
    1604             :   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
    1605             : 
    1606             :   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
    1607             :   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
    1608             :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
    1609             :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
    1610             :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
    1611             :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
    1612             :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
    1613             :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
    1614             : 
    1615             :   // stage 2: x0..x3 pass through unscaled (no round shift below); x4..x7 are rotated by cospi_8_64 / cospi_24_64
    1616             :   s0 = x0;
    1617             :   s1 = x1;
    1618             :   s2 = x2;
    1619             :   s3 = x3;
    1620             :   s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
    1621             :   s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
    1622             :   s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
    1623             :   s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
    1624             : 
    1625             :   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
    1626             :   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
    1627             :   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
    1628             :   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
    1629             :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
    1630             :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
    1631             :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
    1632             :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
    1633             : 
    1634             :   // stage 3: only x2, x3, x6, x7 are updated; x0, x1, x4, x5 keep their stage-2 values
    1635             :   s2 = cospi_16_64 * (x2 + x3);
    1636             :   s3 = cospi_16_64 * (x2 - x3);
    1637             :   s6 = cospi_16_64 * (x6 + x7);
    1638             :   s7 = cospi_16_64 * (x6 - x7);
    1639             : 
    1640             :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
    1641             :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
    1642             :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
    1643             :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
    1644             :   // Outputs are written in permuted order, with the odd-indexed outputs negated.
    1645             :   output[0] = HIGHBD_WRAPLOW(x0, bd);
    1646             :   output[1] = HIGHBD_WRAPLOW(-x4, bd);
    1647             :   output[2] = HIGHBD_WRAPLOW(x6, bd);
    1648             :   output[3] = HIGHBD_WRAPLOW(-x2, bd);
    1649             :   output[4] = HIGHBD_WRAPLOW(x3, bd);
    1650             :   output[5] = HIGHBD_WRAPLOW(-x7, bd);
    1651             :   output[6] = HIGHBD_WRAPLOW(x5, bd);
    1652             :   output[7] = HIGHBD_WRAPLOW(-x1, bd);
    1653             : }
    1654             : // High-bitdepth 8x8 inverse transform that runs the row pass on the first 4 rows only; all 8 columns are then transformed, rounded by 5 bits and combined into the uint16_t destination through highbd_clip_pixel_add.
    1655             : void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
    1656             :                                  int stride, int bd) {
    1657             :   int i, j;
    1658             :   tran_low_t out[8 * 8] = { 0 };  // zero-filled: rows 4..7 are never written by the row pass
    1659             :   tran_low_t *outptr = out;
    1660             :   tran_low_t temp_in[8], temp_out[8];
    1661             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // uint16_t pointer derived from dest8 by CONVERT_TO_SHORTPTR
    1662             : 
    1663             :   // First transform rows
    1664             :   // Only the first 4 rows have non-zero coefficients
    1665             :   for (i = 0; i < 4; ++i) {
    1666             :     vpx_highbd_idct8_c(input, outptr, bd);
    1667             :     input += 8;
    1668             :     outptr += 8;
    1669             :   }
    1670             : 
    1671             :   // Then transform columns: gather column i of out, transform it, and accumulate the rounded result into column i of dest
    1672             :   for (i = 0; i < 8; ++i) {
    1673             :     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    1674             :     vpx_highbd_idct8_c(temp_in, temp_out, bd);
    1675             :     for (j = 0; j < 8; ++j) {
    1676             :       dest[j * stride + i] = highbd_clip_pixel_add(
    1677             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    1678             :     }
    1679             :   }
    1680             : }
    1681             : 
    1682             : void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 1-D 16-point inverse DCT: reads input[0..15], writes output[0..15]
    1683             :   tran_low_t step1[16], step2[16];  // stage buffers: each stage reads one array and writes the other
    1684             :   tran_high_t temp1, temp2;         // cospi-weighted sums, held here before dct_const_round_shift()
    1685             :   (void)bd;  // bd is otherwise only passed to HIGHBD_WRAPLOW; NOTE(review): presumably that macro may ignore it in some builds -- confirm
    1686             :   // Inputs flagged by detect_invalid_highbd_input() yield an all-zero output (after an assert when range checking is compiled in).
    1687             :   if (detect_invalid_highbd_input(input, 16)) {
    1688             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1689             :     assert(0 && "invalid highbd txfm input");
    1690             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1691             :     memset(output, 0, sizeof(*output) * 16);
    1692             :     return;
    1693             :   }
    1694             : 
    1695             :   // stage 1: reorder the inputs; step1[] receives input[] at 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 (4-bit bit-reversed order)
    1696             :   step1[0] = input[0 / 2];  // each index is written as N / 2, where N is the index highbd_idct32_c uses for the same step1 slot
    1697             :   step1[1] = input[16 / 2];
    1698             :   step1[2] = input[8 / 2];
    1699             :   step1[3] = input[24 / 2];
    1700             :   step1[4] = input[4 / 2];
    1701             :   step1[5] = input[20 / 2];
    1702             :   step1[6] = input[12 / 2];
    1703             :   step1[7] = input[28 / 2];
    1704             :   step1[8] = input[2 / 2];
    1705             :   step1[9] = input[18 / 2];
    1706             :   step1[10] = input[10 / 2];
    1707             :   step1[11] = input[26 / 2];
    1708             :   step1[12] = input[6 / 2];
    1709             :   step1[13] = input[22 / 2];
    1710             :   step1[14] = input[14 / 2];
    1711             :   step1[15] = input[30 / 2];
    1712             : 
    1713             :   // stage 2: slots 0-7 pass through; pairs (8,15), (9,14), (10,13), (11,12) are combined with cospi weights and rounded
    1714             :   step2[0] = step1[0];
    1715             :   step2[1] = step1[1];
    1716             :   step2[2] = step1[2];
    1717             :   step2[3] = step1[3];
    1718             :   step2[4] = step1[4];
    1719             :   step2[5] = step1[5];
    1720             :   step2[6] = step1[6];
    1721             :   step2[7] = step1[7];
    1722             : 
    1723             :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
    1724             :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
    1725             :   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1726             :   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1727             : 
    1728             :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
    1729             :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
    1730             :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1731             :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1732             : 
    1733             :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
    1734             :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
    1735             :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1736             :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1737             : 
    1738             :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
    1739             :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
    1740             :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1741             :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1742             : 
    1743             :   // stage 3: slots 0-3 pass through; pairs (4,7), (5,6) get cospi weights; slots 8-15 are sums/differences of adjacent pairs
    1744             :   step1[0] = step2[0];
    1745             :   step1[1] = step2[1];
    1746             :   step1[2] = step2[2];
    1747             :   step1[3] = step2[3];
    1748             : 
    1749             :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
    1750             :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
    1751             :   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1752             :   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1753             :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
    1754             :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
    1755             :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1756             :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1757             : 
    1758             :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
    1759             :   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
    1760             :   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
    1761             :   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
    1762             :   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
    1763             :   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
    1764             :   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
    1765             :   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
    1766             : 
    1767             :   // stage 4: (0,1) sum/difference scaled by cospi_16_64; (2,3), (9,14), (10,13) cospi-weighted; 4-7 sums/differences; 8, 11, 12, 15 pass through
    1768             :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
    1769             :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
    1770             :   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1771             :   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1772             :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
    1773             :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
    1774             :   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1775             :   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1776             :   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
    1777             :   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
    1778             :   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
    1779             :   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
    1780             : 
    1781             :   step2[8] = step1[8];
    1782             :   step2[15] = step1[15];
    1783             :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
    1784             :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
    1785             :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1786             :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1787             :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
    1788             :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
    1789             :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1790             :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1791             :   step2[11] = step1[11];
    1792             :   step2[12] = step1[12];
    1793             : 
    1794             :   // stage 5: sums/differences within 0-3, 8-11 and 12-15; (5,6) difference/sum scaled by cospi_16_64; 4 and 7 pass through
    1795             :   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
    1796             :   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
    1797             :   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
    1798             :   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
    1799             :   step1[4] = step2[4];
    1800             :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    1801             :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    1802             :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1803             :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1804             :   step1[7] = step2[7];
    1805             : 
    1806             :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
    1807             :   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
    1808             :   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
    1809             :   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
    1810             :   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
    1811             :   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
    1812             :   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
    1813             :   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
    1814             : 
    1815             :   // stage 6: step2[k] / step2[7-k] = step1[k] +/- step1[7-k] for k = 0..3; (10,13), (11,12) scaled by cospi_16_64; 8, 9, 14, 15 pass through
    1816             :   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
    1817             :   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
    1818             :   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
    1819             :   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
    1820             :   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
    1821             :   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
    1822             :   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
    1823             :   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
    1824             :   step2[8] = step1[8];
    1825             :   step2[9] = step1[9];
    1826             :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    1827             :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    1828             :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1829             :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1830             :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    1831             :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    1832             :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    1833             :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    1834             :   step2[14] = step1[14];
    1835             :   step2[15] = step1[15];
    1836             : 
    1837             :   // stage 7: output[k] = step2[k] + step2[15-k] and output[15-k] = step2[k] - step2[15-k], for k = 0..7
    1838             :   output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
    1839             :   output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
    1840             :   output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
    1841             :   output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
    1842             :   output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
    1843             :   output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
    1844             :   output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
    1845             :   output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
    1846             :   output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
    1847             :   output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
    1848             :   output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
    1849             :   output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
    1850             :   output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
    1851             :   output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
    1852             :   output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
    1853             :   output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
    1854             : }
    1855             : 
    1856             : void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,  // 2-D 16x16 inverse DCT over all 16 input rows; the result is added into the destination block
    1857             :                                     int stride, int bd) {  // stride: element step between rows of dest; bd is forwarded to the 1-D transform and highbd_clip_pixel_add
    1858             :   int i, j;
    1859             :   tran_low_t out[16 * 16];  // row-pass results, 16 values per row; every entry is written by the row loop below
    1860             :   tran_low_t *outptr = out;
    1861             :   tran_low_t temp_in[16], temp_out[16];  // one column in / one column out for the second pass
    1862             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is accessed as uint16_t pixels through this macro (defined elsewhere)
    1863             : 
    1864             :   // First transform rows
    1865             :   for (i = 0; i < 16; ++i) {
    1866             :     vpx_highbd_idct16_c(input, outptr, bd);
    1867             :     input += 16;
    1868             :     outptr += 16;
    1869             :   }
    1870             : 
    1871             :   // Then transform columns
    1872             :   for (i = 0; i < 16; ++i) {
    1873             :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // gather column i of the row-pass output
    1874             :     vpx_highbd_idct16_c(temp_in, temp_out, bd);
    1875             :     for (j = 0; j < 16; ++j) {
    1876             :       dest[j * stride + i] = highbd_clip_pixel_add(  // combine the existing pixel with the residual; presumably clips to the bd-bit range -- helper defined elsewhere
    1877             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);  // NOTE(review): presumably a rounded right shift by 6 -- macro defined elsewhere
    1878             :     }
    1879             :   }
    1880             : }
    1881             : 
    1882             : void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 1-D 16-point iadst (inverse ADST, going by the name): reads input[0..15], writes output[0..15]
    1883             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;  // s*: wide intermediates for the cospi-weighted terms of each stage
    1884             :   tran_high_t s9, s10, s11, s12, s13, s14, s15;
    1885             :   tran_low_t x0 = input[15];  // x0..x15 take input[] in the order 15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14
    1886             :   tran_low_t x1 = input[0];
    1887             :   tran_low_t x2 = input[13];
    1888             :   tran_low_t x3 = input[2];
    1889             :   tran_low_t x4 = input[11];
    1890             :   tran_low_t x5 = input[4];
    1891             :   tran_low_t x6 = input[9];
    1892             :   tran_low_t x7 = input[6];
    1893             :   tran_low_t x8 = input[7];
    1894             :   tran_low_t x9 = input[8];
    1895             :   tran_low_t x10 = input[5];
    1896             :   tran_low_t x11 = input[10];
    1897             :   tran_low_t x12 = input[3];
    1898             :   tran_low_t x13 = input[12];
    1899             :   tran_low_t x14 = input[1];
    1900             :   tran_low_t x15 = input[14];
    1901             :   (void)bd;  // bd is otherwise only passed to HIGHBD_WRAPLOW; NOTE(review): presumably that macro may ignore it in some builds -- confirm
    1902             :   // Inputs flagged by detect_invalid_highbd_input() yield an all-zero output (after an assert when range checking is compiled in).
    1903             :   if (detect_invalid_highbd_input(input, 16)) {
    1904             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1905             :     assert(0 && "invalid highbd txfm input");
    1906             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1907             :     memset(output, 0, sizeof(*output) * 16);
    1908             :     return;
    1909             :   }
    1910             :   // Shortcut: when all sixteen inputs are zero, write zeros and skip the arithmetic.
    1911             :   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
    1912             :         x13 | x14 | x15)) {
    1913             :     memset(output, 0, 16 * sizeof(*output));
    1914             :     return;
    1915             :   }
    1916             : 
    1917             :   // stage 1: cospi-weighted terms from pairs (x0,x1) ... (x14,x15); then x[k] / x[k+8] = rounded s[k] +/- s[k+8], k = 0..7
    1918             :   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
    1919             :   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    1920             :   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
    1921             :   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    1922             :   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
    1923             :   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    1924             :   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    1925             :   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    1926             :   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    1927             :   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    1928             :   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    1929             :   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    1930             :   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    1931             :   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
    1932             :   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    1933             :   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
    1934             : 
    1935             :   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
    1936             :   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
    1937             :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
    1938             :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
    1939             :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
    1940             :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
    1941             :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
    1942             :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
    1943             :   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
    1944             :   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
    1945             :   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
    1946             :   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
    1947             :   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
    1948             :   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
    1949             :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
    1950             :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
    1951             : 
    1952             :   // stage 2: s0-s7 copy x0-x7; s8-s15 are cospi-weighted; then x[k] / x[k+4] = s[k] +/- s[k+4] for k = 0..3 (plain) and k = 8..11 (rounded)
    1953             :   s0 = x0;
    1954             :   s1 = x1;
    1955             :   s2 = x2;
    1956             :   s3 = x3;
    1957             :   s4 = x4;
    1958             :   s5 = x5;
    1959             :   s6 = x6;
    1960             :   s7 = x7;
    1961             :   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
    1962             :   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
    1963             :   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
    1964             :   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
    1965             :   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
    1966             :   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
    1967             :   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
    1968             :   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
    1969             : 
    1970             :   x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
    1971             :   x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
    1972             :   x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
    1973             :   x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
    1974             :   x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
    1975             :   x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
    1976             :   x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
    1977             :   x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
    1978             :   x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
    1979             :   x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
    1980             :   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
    1981             :   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
    1982             :   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
    1983             :   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
    1984             :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
    1985             :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
    1986             : 
    1987             :   // stage 3: s0-s3 and s8-s11 are copies; s4-s7 and s12-s15 are cospi-weighted; then x[k] / x[k+2] = s[k] +/- s[k+2] (rounded for k = 4, 5, 12, 13)
    1988             :   s0 = x0;
    1989             :   s1 = x1;
    1990             :   s2 = x2;
    1991             :   s3 = x3;
    1992             :   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
    1993             :   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    1994             :   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
    1995             :   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
    1996             :   s8 = x8;
    1997             :   s9 = x9;
    1998             :   s10 = x10;
    1999             :   s11 = x11;
    2000             :   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
    2001             :   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    2002             :   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
    2003             :   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
    2004             : 
    2005             :   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
    2006             :   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
    2007             :   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
    2008             :   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
    2009             :   x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
    2010             :   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
    2011             :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
    2012             :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
    2013             :   x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
    2014             :   x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
    2015             :   x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
    2016             :   x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
    2017             :   x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
    2018             :   x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
    2019             :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
    2020             :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
    2021             : 
    2022             :   // stage 4: pairs (2,3), (6,7), (10,11), (14,15) become sums/differences scaled by +/-cospi_16_64, then rounded; the other x values are unchanged
    2023             :   s2 = (-cospi_16_64) * (x2 + x3);
    2024             :   s3 = cospi_16_64 * (x2 - x3);
    2025             :   s6 = cospi_16_64 * (x6 + x7);
    2026             :   s7 = cospi_16_64 * (-x6 + x7);
    2027             :   s10 = cospi_16_64 * (x10 + x11);
    2028             :   s11 = cospi_16_64 * (-x10 + x11);
    2029             :   s14 = (-cospi_16_64) * (x14 + x15);
    2030             :   s15 = cospi_16_64 * (x14 - x15);
    2031             : 
    2032             :   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
    2033             :   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
    2034             :   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
    2035             :   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
    2036             :   x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
    2037             :   x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
    2038             :   x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
    2039             :   x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
    2040             :   // Write results in permuted order; x8, x4, x13 and x1 are negated on the way out.
    2041             :   output[0] = HIGHBD_WRAPLOW(x0, bd);
    2042             :   output[1] = HIGHBD_WRAPLOW(-x8, bd);
    2043             :   output[2] = HIGHBD_WRAPLOW(x12, bd);
    2044             :   output[3] = HIGHBD_WRAPLOW(-x4, bd);
    2045             :   output[4] = HIGHBD_WRAPLOW(x6, bd);
    2046             :   output[5] = HIGHBD_WRAPLOW(x14, bd);
    2047             :   output[6] = HIGHBD_WRAPLOW(x10, bd);
    2048             :   output[7] = HIGHBD_WRAPLOW(x2, bd);
    2049             :   output[8] = HIGHBD_WRAPLOW(x3, bd);
    2050             :   output[9] = HIGHBD_WRAPLOW(x11, bd);
    2051             :   output[10] = HIGHBD_WRAPLOW(x15, bd);
    2052             :   output[11] = HIGHBD_WRAPLOW(x7, bd);
    2053             :   output[12] = HIGHBD_WRAPLOW(x5, bd);
    2054             :   output[13] = HIGHBD_WRAPLOW(-x13, bd);
    2055             :   output[14] = HIGHBD_WRAPLOW(x9, bd);
    2056             :   output[15] = HIGHBD_WRAPLOW(-x1, bd);
    2057             : }
    2058             : 
    2059             : void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,  // 2-D 16x16 inverse DCT that row-transforms only the first 4 input rows; the result is added into the destination block
    2060             :                                    int stride, int bd) {  // stride: element step between rows of dest; bd is forwarded to the 1-D transform and highbd_clip_pixel_add
    2061             :   int i, j;
    2062             :   tran_low_t out[16 * 16] = { 0 };  // zero-initialized because rows 4..15 are never written by the row pass below
    2063             :   tran_low_t *outptr = out;
    2064             :   tran_low_t temp_in[16], temp_out[16];  // one column in / one column out for the second pass
    2065             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is accessed as uint16_t pixels through this macro (defined elsewhere)
    2066             : 
    2067             :   // First transform rows. Since all non-zero dct coefficients are in
    2068             :   // upper-left 4x4 area, we only need to calculate first 4 rows here.
    2069             :   for (i = 0; i < 4; ++i) {
    2070             :     vpx_highbd_idct16_c(input, outptr, bd);
    2071             :     input += 16;
    2072             :     outptr += 16;
    2073             :   }
    2074             : 
    2075             :   // Then transform columns
    2076             :   for (i = 0; i < 16; ++i) {
    2077             :     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // gather column i; entries from rows 4..15 are the initial zeros
    2078             :     vpx_highbd_idct16_c(temp_in, temp_out, bd);
    2079             :     for (j = 0; j < 16; ++j) {
    2080             :       dest[j * stride + i] = highbd_clip_pixel_add(  // combine the existing pixel with the residual; presumably clips to the bd-bit range -- helper defined elsewhere
    2081             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);  // NOTE(review): presumably a rounded right shift by 6 -- macro defined elsewhere
    2082             :     }
    2083             :   }
    2084             : }
    2085             : 
    2086             : void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,  // 16x16 variant that reads only input[0] and adds one constant to every pixel of the block
    2087             :                                   int stride, int bd) {  // stride: element step between rows of dest; bd is forwarded to HIGHBD_WRAPLOW and highbd_clip_pixel_add
    2088             :   int i, j;
    2089             :   tran_high_t a1;  // the per-pixel value added to the whole block
    2090             :   tran_low_t out =
    2091             :       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);  // first cospi_16_64 scale-and-round of input[0]
    2092             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is accessed as uint16_t pixels through this macro (defined elsewhere)
    2093             : 
    2094             :   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // second cospi_16_64 scale-and-round
    2095             :   a1 = ROUND_POWER_OF_TWO(out, 6);  // NOTE(review): presumably a rounded right shift by 6 -- macro defined elsewhere
    2096             :   for (j = 0; j < 16; ++j) {
    2097             :     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);  // same a1 for each of the 16 pixels in row j
    2098             :     dest += stride;  // advance to the next destination row
    2099             :   }
    2100             : }
    2101             : 
    2102             : static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
    2103             :                             int bd) {
    2104             :   tran_low_t step1[32], step2[32];
    2105             :   tran_high_t temp1, temp2;
    2106             :   (void)bd;
    2107             : 
    2108             :   if (detect_invalid_highbd_input(input, 32)) {
    2109             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    2110             :     assert(0 && "invalid highbd txfm input");
    2111             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    2112             :     memset(output, 0, sizeof(*output) * 32);
    2113             :     return;
    2114             :   }
    2115             : 
    2116             :   // stage 1
    2117             :   step1[0] = input[0];
    2118             :   step1[1] = input[16];
    2119             :   step1[2] = input[8];
    2120             :   step1[3] = input[24];
    2121             :   step1[4] = input[4];
    2122             :   step1[5] = input[20];
    2123             :   step1[6] = input[12];
    2124             :   step1[7] = input[28];
    2125             :   step1[8] = input[2];
    2126             :   step1[9] = input[18];
    2127             :   step1[10] = input[10];
    2128             :   step1[11] = input[26];
    2129             :   step1[12] = input[6];
    2130             :   step1[13] = input[22];
    2131             :   step1[14] = input[14];
    2132             :   step1[15] = input[30];
    2133             : 
    2134             :   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
    2135             :   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
    2136             :   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2137             :   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2138             : 
    2139             :   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
    2140             :   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
    2141             :   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2142             :   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2143             : 
    2144             :   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
    2145             :   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
    2146             :   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2147             :   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2148             : 
    2149             :   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
    2150             :   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
    2151             :   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2152             :   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2153             : 
    2154             :   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
    2155             :   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
    2156             :   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2157             :   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2158             : 
    2159             :   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
    2160             :   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
    2161             :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2162             :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2163             : 
    2164             :   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
    2165             :   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
    2166             :   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2167             :   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2168             : 
    2169             :   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
    2170             :   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
    2171             :   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2172             :   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2173             : 
    2174             :   // stage 2
    2175             :   step2[0] = step1[0];
    2176             :   step2[1] = step1[1];
    2177             :   step2[2] = step1[2];
    2178             :   step2[3] = step1[3];
    2179             :   step2[4] = step1[4];
    2180             :   step2[5] = step1[5];
    2181             :   step2[6] = step1[6];
    2182             :   step2[7] = step1[7];
    2183             : 
    2184             :   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
    2185             :   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
    2186             :   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2187             :   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2188             : 
    2189             :   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
    2190             :   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
    2191             :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2192             :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2193             : 
    2194             :   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
    2195             :   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
    2196             :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2197             :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2198             : 
    2199             :   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
    2200             :   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
    2201             :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2202             :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2203             : 
    2204             :   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
    2205             :   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
    2206             :   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
    2207             :   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
    2208             :   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
    2209             :   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
    2210             :   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
    2211             :   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
    2212             :   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
    2213             :   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
    2214             :   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
    2215             :   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
    2216             :   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
    2217             :   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
    2218             :   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
    2219             :   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
    2220             : 
    2221             :   // stage 3
    2222             :   step1[0] = step2[0];
    2223             :   step1[1] = step2[1];
    2224             :   step1[2] = step2[2];
    2225             :   step1[3] = step2[3];
    2226             : 
    2227             :   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
    2228             :   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
    2229             :   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2230             :   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2231             :   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
    2232             :   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
    2233             :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2234             :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2235             : 
    2236             :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
    2237             :   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
    2238             :   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
    2239             :   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
    2240             :   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
    2241             :   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
    2242             :   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
    2243             :   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
    2244             : 
    2245             :   step1[16] = step2[16];
    2246             :   step1[31] = step2[31];
    2247             :   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
    2248             :   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
    2249             :   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2250             :   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2251             :   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
    2252             :   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
    2253             :   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2254             :   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2255             :   step1[19] = step2[19];
    2256             :   step1[20] = step2[20];
    2257             :   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
    2258             :   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
    2259             :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2260             :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2261             :   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
    2262             :   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
    2263             :   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2264             :   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2265             :   step1[23] = step2[23];
    2266             :   step1[24] = step2[24];
    2267             :   step1[27] = step2[27];
    2268             :   step1[28] = step2[28];
    2269             : 
    2270             :   // stage 4
    2271             :   temp1 = (step1[0] + step1[1]) * cospi_16_64;
    2272             :   temp2 = (step1[0] - step1[1]) * cospi_16_64;
    2273             :   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2274             :   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2275             :   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
    2276             :   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
    2277             :   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2278             :   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2279             :   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
    2280             :   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
    2281             :   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
    2282             :   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
    2283             : 
    2284             :   step2[8] = step1[8];
    2285             :   step2[15] = step1[15];
    2286             :   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
    2287             :   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
    2288             :   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2289             :   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2290             :   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
    2291             :   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
    2292             :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2293             :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2294             :   step2[11] = step1[11];
    2295             :   step2[12] = step1[12];
    2296             : 
    2297             :   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
    2298             :   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
    2299             :   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
    2300             :   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
    2301             :   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
    2302             :   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
    2303             :   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
    2304             :   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
    2305             : 
    2306             :   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
    2307             :   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
    2308             :   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
    2309             :   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
    2310             :   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
    2311             :   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
    2312             :   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
    2313             :   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
    2314             : 
    2315             :   // stage 5
    2316             :   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
    2317             :   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
    2318             :   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
    2319             :   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
    2320             :   step1[4] = step2[4];
    2321             :   temp1 = (step2[6] - step2[5]) * cospi_16_64;
    2322             :   temp2 = (step2[5] + step2[6]) * cospi_16_64;
    2323             :   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2324             :   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2325             :   step1[7] = step2[7];
    2326             : 
    2327             :   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
    2328             :   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
    2329             :   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
    2330             :   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
    2331             :   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
    2332             :   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
    2333             :   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
    2334             :   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
    2335             : 
    2336             :   step1[16] = step2[16];
    2337             :   step1[17] = step2[17];
    2338             :   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
    2339             :   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
    2340             :   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2341             :   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2342             :   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
    2343             :   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
    2344             :   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2345             :   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2346             :   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
    2347             :   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
    2348             :   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2349             :   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2350             :   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
    2351             :   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
    2352             :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2353             :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2354             :   step1[22] = step2[22];
    2355             :   step1[23] = step2[23];
    2356             :   step1[24] = step2[24];
    2357             :   step1[25] = step2[25];
    2358             :   step1[30] = step2[30];
    2359             :   step1[31] = step2[31];
    2360             : 
    2361             :   // stage 6
    2362             :   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
    2363             :   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
    2364             :   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
    2365             :   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
    2366             :   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
    2367             :   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
    2368             :   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
    2369             :   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
    2370             :   step2[8] = step1[8];
    2371             :   step2[9] = step1[9];
    2372             :   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
    2373             :   temp2 = (step1[10] + step1[13]) * cospi_16_64;
    2374             :   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2375             :   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2376             :   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
    2377             :   temp2 = (step1[11] + step1[12]) * cospi_16_64;
    2378             :   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2379             :   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2380             :   step2[14] = step1[14];
    2381             :   step2[15] = step1[15];
    2382             : 
    2383             :   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
    2384             :   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
    2385             :   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
    2386             :   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
    2387             :   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
    2388             :   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
    2389             :   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
    2390             :   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
    2391             : 
    2392             :   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
    2393             :   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
    2394             :   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
    2395             :   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
    2396             :   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
    2397             :   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
    2398             :   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
    2399             :   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
    2400             : 
    2401             :   // stage 7
    2402             :   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
    2403             :   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
    2404             :   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
    2405             :   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
    2406             :   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
    2407             :   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
    2408             :   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
    2409             :   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
    2410             :   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
    2411             :   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
    2412             :   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
    2413             :   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
    2414             :   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
    2415             :   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
    2416             :   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
    2417             :   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
    2418             : 
    2419             :   step1[16] = step2[16];
    2420             :   step1[17] = step2[17];
    2421             :   step1[18] = step2[18];
    2422             :   step1[19] = step2[19];
    2423             :   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
    2424             :   temp2 = (step2[20] + step2[27]) * cospi_16_64;
    2425             :   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2426             :   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2427             :   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
    2428             :   temp2 = (step2[21] + step2[26]) * cospi_16_64;
    2429             :   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2430             :   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2431             :   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
    2432             :   temp2 = (step2[22] + step2[25]) * cospi_16_64;
    2433             :   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2434             :   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2435             :   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
    2436             :   temp2 = (step2[23] + step2[24]) * cospi_16_64;
    2437             :   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
    2438             :   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
    2439             :   step1[28] = step2[28];
    2440             :   step1[29] = step2[29];
    2441             :   step1[30] = step2[30];
    2442             :   step1[31] = step2[31];
    2443             : 
    2444             :   // final stage
    2445             :   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
    2446             :   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
    2447             :   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
    2448             :   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
    2449             :   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
    2450             :   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
    2451             :   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
    2452             :   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
    2453             :   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
    2454             :   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
    2455             :   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
    2456             :   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
    2457             :   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
    2458             :   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
    2459             :   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
    2460             :   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
    2461             :   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
    2462             :   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
    2463             :   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
    2464             :   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
    2465             :   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
    2466             :   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
    2467             :   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
    2468             :   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
    2469             :   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
    2470             :   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
    2471             :   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
    2472             :   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
    2473             :   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
    2474             :   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
    2475             :   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
    2476             :   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
    2477             : }
    2478             : // Full 32x32 inverse DCT (high-bitdepth path): row pass, then column pass, result combined into dest8.
    2479             : void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
    2480             :                                      int stride, int bd) {
    2481             :   int i, j;
    2482             :   tran_low_t out[32 * 32];  // row-pass results, row-major; every row is written by the row loop
    2483             :   tran_low_t *outptr = out;
    2484             :   tran_low_t temp_in[32], temp_out[32];  // one column gathered from out, and its transform
    2485             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is converted and accessed through a uint16_t pointer
    2486             :   // input: 32 rows of 32 coefficients. stride: dest row pitch in uint16_t elements. bd: forwarded to the helpers (presumably bit depth -- confirm).
    2487             :   // Rows: transform each row of input into out, skipping the transform for rows that are entirely zero.
    2488             :   for (i = 0; i < 32; ++i) {
    2489             :     tran_low_t zero_coeff[16];  // scratch for OR-reducing the 32 coefficients of this row
    2490             :     for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];  // 32 -> 16 values
    2491             :     for (j = 0; j < 8; ++j)  // pairwise OR in place: 16 -> 8
    2492             :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    2493             :     for (j = 0; j < 4; ++j)  // 8 -> 4
    2494             :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    2495             :     for (j = 0; j < 2; ++j)  // 4 -> 2
    2496             :       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    2497             : 
    2498             :     if (zero_coeff[0] | zero_coeff[1])  // non-zero iff at least one coefficient in the row is non-zero
    2499             :       highbd_idct32_c(input, outptr, bd);
    2500             :     else
    2501             :       memset(outptr, 0, sizeof(tran_low_t) * 32);  // all-zero row: store zeros instead of calling the transform
    2502             :     input += 32;  // next row of coefficients
    2503             :     outptr += 32;  // next row of the intermediate buffer
    2504             :   }
    2505             : 
    2506             :   // Columns: transform each column of out and combine the rounded result with dest.
    2507             :   for (i = 0; i < 32; ++i) {
    2508             :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];  // gather column i
    2509             :     highbd_idct32_c(temp_in, temp_out, bd);
    2510             :     for (j = 0; j < 32; ++j) {  // each output is rounded with ROUND_POWER_OF_TWO(.., 6), then merged by highbd_clip_pixel_add
    2511             :       dest[j * stride + i] = highbd_clip_pixel_add(
    2512             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    2513             :     }
    2514             :   }
    2515             : }
    2516             : // 32x32 inverse DCT (high-bitdepth path) that row-transforms only the first 8 rows of input before the column pass.
    2517             : void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
    2518             :                                    int stride, int bd) {
    2519             :   int i, j;
    2520             :   tran_low_t out[32 * 32] = { 0 };  // zero-filled: rows 8..31 are never written by the row pass and must read as zero
    2521             :   tran_low_t *outptr = out;
    2522             :   tran_low_t temp_in[32], temp_out[32];  // one column gathered from out, and its transform
    2523             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is converted and accessed through a uint16_t pointer
    2524             :   // stride: dest row pitch in uint16_t elements. bd: forwarded to the helpers (presumably bit depth -- confirm).
    2525             :   // Rows
    2526             :   // Only upper-left 8x8 has non-zero coeff (a precondition on input; it is not checked here), so rows 8..31 are skipped.
    2527             :   for (i = 0; i < 8; ++i) {
    2528             :     highbd_idct32_c(input, outptr, bd);
    2529             :     input += 32;  // next row of coefficients (rows are 32 entries apart)
    2530             :     outptr += 32;  // next row of the intermediate buffer
    2531             :   }
    2532             : 
    2533             :   // Columns: transform each column of out and combine the rounded result with dest.
    2534             :   for (i = 0; i < 32; ++i) {
    2535             :     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];  // gather column i
    2536             :     highbd_idct32_c(temp_in, temp_out, bd);
    2537             :     for (j = 0; j < 32; ++j) {  // each output is rounded with ROUND_POWER_OF_TWO(.., 6), then merged by highbd_clip_pixel_add
    2538             :       dest[j * stride + i] = highbd_clip_pixel_add(
    2539             :           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    2540             :     }
    2541             :   }
    2542             : }
    2543             : // DC-only 32x32 variant (high-bitdepth path): reads only input[0] and applies one value to all 32x32 destination samples.
    2544             : void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
    2545             :                                   int stride, int bd) {
    2546             :   int i, j;
    2547             :   int a1;  // the single value combined with every destination sample
    2548             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is converted and accessed through a uint16_t pointer
    2549             :   tran_low_t out =  // first of two cospi_16_64 scalings of input[0]
    2550             :       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
    2551             : 
    2552             :   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // second scaling (presumably one per 1-D pass -- confirm)
    2553             :   a1 = ROUND_POWER_OF_TWO(out, 6);  // final rounding by 6 bits
    2554             : 
    2555             :   for (j = 0; j < 32; ++j) {  // j: destination row, i: sample within the row
    2556             :     for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    2557             :     dest += stride;  // advance to the next destination row (stride is in uint16_t elements)
    2558             :   }
    2559             : }
    2560             : 
    2561             : #endif  // CONFIG_VP9_HIGHBITDEPTH

Generated by: LCOV version 1.13