LCOV - output.info - third_party/aom/av1/encoder/dct.c

LCOV - code coverage report

Current view:	top level - third_party/aom/av1/encoder - dct.c (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	1277	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	53	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <math.h>
      14             : 
      15             : #include "./aom_config.h"
      16             : #include "./aom_dsp_rtcd.h"
      17             : #include "./av1_rtcd.h"
      18             : #include "aom_dsp/fwd_txfm.h"
      19             : #include "aom_ports/mem.h"
      20             : #include "av1/common/blockd.h"
      21             : #include "av1/common/av1_fwd_txfm1d.h"
      22             : #include "av1/common/av1_fwd_txfm1d_cfg.h"
      23             : #include "av1/common/idct.h"
      24             : 
      25           0 : static INLINE void range_check(const tran_low_t *input, const int size,  // Debug helper: would assert |input[i]| < 2^bit for every i < size,
      26             :                                const int bit) {                          // but the check is compiled out, so the function currently does nothing.
      27             : #if 0  // CONFIG_COEFFICIENT_RANGE_CHECKING (hard-disabled with a literal 0; see TODO below)
      28             : // TODO(angiebird): the range_check is not used because the bit range
      29             : // in fdct# is not correct. Since we are going to merge in a new version
      30             : // of fdct# from nextgenv2, we won't fix the incorrect bit range now.
      31             :   int i;
      32             :   for (i = 0; i < size; ++i) {
      33             :     assert(abs(input[i]) < (1 << bit));  // magnitude must stay strictly below 2^bit
      34             :   }
      35             : #else  // active branch: no checking is performed
      36             :   (void)input;  // reference each parameter so it is not reported as unused
      37             :   (void)size;
      38             :   (void)bit;
      39             : #endif
      40           0 : }
      41             : 
      42           0 : static void fdct4(const tran_low_t *input, tran_low_t *output) {  // 4-point 1-D transform (forward DCT, going by the name): reads input[0..3], writes output[0..3]
      43             :   tran_high_t temp;    // holds each sum of products before it goes through fdct_round_shift
      44             :   tran_low_t step[4];  // second work buffer; stages alternate between output[] and step[]
      45             :   // NOTE: stage 1 writes output[0] before input[0] is read for the last time, so input and output must not alias.
      46             :   // stage 0: range check on the input (range_check is compiled to a no-op in this file)
      47           0 :   range_check(input, 4, 14);
      48             : 
      49             :   // stage 1: sums of mirrored input pairs go to output[0..1], differences to output[2..3]
      50           0 :   output[0] = input[0] + input[3];
      51           0 :   output[1] = input[1] + input[2];
      52           0 :   output[2] = input[1] - input[2];
      53           0 :   output[3] = input[0] - input[3];
      54             : 
      55           0 :   range_check(output, 4, 15);
      56             : 
      57             :   // stage 2: weighted sums of pairs (0,1) and (2,3) using cospi_* constants, each passed
      58           0 :   temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;  // through fdct_round_shift. Both are defined outside this view;
      59           0 :   step[0] = (tran_low_t)fdct_round_shift(temp);              // presumably fixed-point cosines and a rounding right shift (TODO confirm).
      60           0 :   temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
      61           0 :   step[1] = (tran_low_t)fdct_round_shift(temp);
      62           0 :   temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
      63           0 :   step[2] = (tran_low_t)fdct_round_shift(temp);
      64           0 :   temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
      65           0 :   step[3] = (tran_low_t)fdct_round_shift(temp);
      66             : 
      67           0 :   range_check(step, 4, 16);
      68             : 
      69             :   // stage 3: copy step[] to output[] in bit-reversed index order (0, 2, 1, 3)
      70           0 :   output[0] = step[0];
      71           0 :   output[1] = step[2];
      72           0 :   output[2] = step[1];
      73           0 :   output[3] = step[3];
      74             : 
      75           0 :   range_check(output, 4, 16);
      76           0 : }
      77             : 
      78           0 : static void fdct8(const tran_low_t *input, tran_low_t *output) {  // 8-point 1-D transform (forward DCT, going by the name): reads input[0..7], writes output[0..7]
      79             :   tran_high_t temp;    // holds each sum of products before it goes through fdct_round_shift
      80             :   tran_low_t step[8];  // second work buffer; stages alternate between output[] and step[]
      81             :   // NOTE: stage 1 writes output[0] before input[0] is read for the last time, so input and output must not alias.
      82             :   // stage 0: range check on the input (range_check is compiled to a no-op in this file)
      83           0 :   range_check(input, 8, 13);
      84             : 
      85             :   // stage 1: sums of mirrored input pairs go to output[0..3], differences to output[4..7]
      86           0 :   output[0] = input[0] + input[7];
      87           0 :   output[1] = input[1] + input[6];
      88           0 :   output[2] = input[2] + input[5];
      89           0 :   output[3] = input[3] + input[4];
      90           0 :   output[4] = input[3] - input[4];
      91           0 :   output[5] = input[2] - input[5];
      92           0 :   output[6] = input[1] - input[6];
      93           0 :   output[7] = input[0] - input[7];
      94             : 
      95           0 :   range_check(output, 8, 14);
      96             : 
      97             :   // stage 2: sum/difference butterfly on output[0..3]; entries 4 and 7 pass through; 5 and 6 are cospi_16_64-weighted
      98           0 :   step[0] = output[0] + output[3];
      99           0 :   step[1] = output[1] + output[2];
     100           0 :   step[2] = output[1] - output[2];
     101           0 :   step[3] = output[0] - output[3];
     102           0 :   step[4] = output[4];
     103           0 :   temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
     104           0 :   step[5] = (tran_low_t)fdct_round_shift(temp);
     105           0 :   temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
     106           0 :   step[6] = (tran_low_t)fdct_round_shift(temp);
     107           0 :   step[7] = output[7];
     108             : 
     109           0 :   range_check(step, 8, 15);
     110             : 
     111             :   // stage 3: entries 0..3 use the same weighted sums as fdct4's stage 2; entries 4..7 are sums/differences of step[4..7]
     112           0 :   temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
     113           0 :   output[0] = (tran_low_t)fdct_round_shift(temp);
     114           0 :   temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
     115           0 :   output[1] = (tran_low_t)fdct_round_shift(temp);
     116           0 :   temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     117           0 :   output[2] = (tran_low_t)fdct_round_shift(temp);
     118           0 :   temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
     119           0 :   output[3] = (tran_low_t)fdct_round_shift(temp);
     120           0 :   output[4] = step[4] + step[5];
     121           0 :   output[5] = step[4] - step[5];
     122           0 :   output[6] = step[7] - step[6];
     123           0 :   output[7] = step[7] + step[6];
     124             : 
     125           0 :   range_check(output, 8, 16);
     126             : 
     127             :   // stage 4: entries 0..3 are copied; entries 4..7 are weighted sums of pairs (4,7) and (5,6)
     128           0 :   step[0] = output[0];
     129           0 :   step[1] = output[1];
     130           0 :   step[2] = output[2];
     131           0 :   step[3] = output[3];
     132           0 :   temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
     133           0 :   step[4] = (tran_low_t)fdct_round_shift(temp);
     134           0 :   temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
     135           0 :   step[5] = (tran_low_t)fdct_round_shift(temp);
     136           0 :   temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
     137           0 :   step[6] = (tran_low_t)fdct_round_shift(temp);
     138           0 :   temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
     139           0 :   step[7] = (tran_low_t)fdct_round_shift(temp);
     140             : 
     141           0 :   range_check(step, 8, 16);
     142             : 
     143             :   // stage 5: copy step[] to output[] in bit-reversed index order (0, 4, 2, 6, 1, 5, 3, 7)
     144           0 :   output[0] = step[0];
     145           0 :   output[1] = step[4];
     146           0 :   output[2] = step[2];
     147           0 :   output[3] = step[6];
     148           0 :   output[4] = step[1];
     149           0 :   output[5] = step[5];
     150           0 :   output[6] = step[3];
     151           0 :   output[7] = step[7];
     152             : 
     153           0 :   range_check(output, 8, 16);
     154           0 : }
     155             : 
     156           0 : static void fdct16(const tran_low_t *input, tran_low_t *output) {  // 16-point 1-D transform (forward DCT, going by the name): reads input[0..15], writes output[0..15]
     157             :   tran_high_t temp;     // holds each sum of products before it goes through fdct_round_shift
     158             :   tran_low_t step[16];  // second work buffer; stages alternate between output[] and step[]
     159             :   // NOTE: stage 1 writes output[0] before input[0] is read for the last time, so input and output must not alias.
     160             :   // stage 0: range check on the input (range_check is compiled to a no-op in this file)
     161           0 :   range_check(input, 16, 13);
     162             : 
     163             :   // stage 1: sums of mirrored input pairs go to output[0..7], differences to output[8..15]
     164           0 :   output[0] = input[0] + input[15];
     165           0 :   output[1] = input[1] + input[14];
     166           0 :   output[2] = input[2] + input[13];
     167           0 :   output[3] = input[3] + input[12];
     168           0 :   output[4] = input[4] + input[11];
     169           0 :   output[5] = input[5] + input[10];
     170           0 :   output[6] = input[6] + input[9];
     171           0 :   output[7] = input[7] + input[8];
     172           0 :   output[8] = input[7] - input[8];
     173           0 :   output[9] = input[6] - input[9];
     174           0 :   output[10] = input[5] - input[10];
     175           0 :   output[11] = input[4] - input[11];
     176           0 :   output[12] = input[3] - input[12];
     177           0 :   output[13] = input[2] - input[13];
     178           0 :   output[14] = input[1] - input[14];
     179           0 :   output[15] = input[0] - input[15];
     180             : 
     181           0 :   range_check(output, 16, 14);
     182             : 
     183             :   // stage 2: sum/difference butterfly on output[0..7]; entries 8, 9, 14, 15 pass through; 10..13 are cospi_16_64-weighted
     184           0 :   step[0] = output[0] + output[7];
     185           0 :   step[1] = output[1] + output[6];
     186           0 :   step[2] = output[2] + output[5];
     187           0 :   step[3] = output[3] + output[4];
     188           0 :   step[4] = output[3] - output[4];
     189           0 :   step[5] = output[2] - output[5];
     190           0 :   step[6] = output[1] - output[6];
     191           0 :   step[7] = output[0] - output[7];
     192           0 :   step[8] = output[8];
     193           0 :   step[9] = output[9];
     194           0 :   temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
     195           0 :   step[10] = (tran_low_t)fdct_round_shift(temp);
     196           0 :   temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
     197           0 :   step[11] = (tran_low_t)fdct_round_shift(temp);
     198           0 :   temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
     199           0 :   step[12] = (tran_low_t)fdct_round_shift(temp);
     200           0 :   temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
     201           0 :   step[13] = (tran_low_t)fdct_round_shift(temp);
     202           0 :   step[14] = output[14];
     203           0 :   step[15] = output[15];
     204             : 
     205           0 :   range_check(step, 16, 15);
     206             : 
     207             :   // stage 3: butterfly on step[0..3]; 4 and 7 pass through; 5 and 6 are cospi_16_64-weighted; 8..15 are sums/differences
     208           0 :   output[0] = step[0] + step[3];
     209           0 :   output[1] = step[1] + step[2];
     210           0 :   output[2] = step[1] - step[2];
     211           0 :   output[3] = step[0] - step[3];
     212           0 :   output[4] = step[4];
     213           0 :   temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
     214           0 :   output[5] = (tran_low_t)fdct_round_shift(temp);
     215           0 :   temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
     216           0 :   output[6] = (tran_low_t)fdct_round_shift(temp);
     217           0 :   output[7] = step[7];
     218           0 :   output[8] = step[8] + step[11];
     219           0 :   output[9] = step[9] + step[10];
     220           0 :   output[10] = step[9] - step[10];
     221           0 :   output[11] = step[8] - step[11];
     222           0 :   output[12] = step[15] - step[12];
     223           0 :   output[13] = step[14] - step[13];
     224           0 :   output[14] = step[14] + step[13];
     225           0 :   output[15] = step[15] + step[12];
     226             : 
     227           0 :   range_check(output, 16, 16);
     228             : 
     229             :   // stage 4: entries 0..3 are weighted sums; 4..7 are sums/differences; 8, 11, 12, 15 pass through; 9, 10, 13, 14 are weighted
     230           0 :   temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
     231           0 :   step[0] = (tran_low_t)fdct_round_shift(temp);
     232           0 :   temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
     233           0 :   step[1] = (tran_low_t)fdct_round_shift(temp);
     234           0 :   temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
     235           0 :   step[2] = (tran_low_t)fdct_round_shift(temp);
     236           0 :   temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
     237           0 :   step[3] = (tran_low_t)fdct_round_shift(temp);
     238           0 :   step[4] = output[4] + output[5];
     239           0 :   step[5] = output[4] - output[5];
     240           0 :   step[6] = output[7] - output[6];
     241           0 :   step[7] = output[7] + output[6];
     242           0 :   step[8] = output[8];
     243           0 :   temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
     244           0 :   step[9] = (tran_low_t)fdct_round_shift(temp);
     245           0 :   temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
     246           0 :   step[10] = (tran_low_t)fdct_round_shift(temp);
     247           0 :   step[11] = output[11];
     248           0 :   step[12] = output[12];
     249           0 :   temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
     250           0 :   step[13] = (tran_low_t)fdct_round_shift(temp);
     251           0 :   temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
     252           0 :   step[14] = (tran_low_t)fdct_round_shift(temp);
     253           0 :   step[15] = output[15];
     254             : 
     255           0 :   range_check(step, 16, 16);
     256             : 
     257             :   // stage 5: entries 0..3 are copied; 4..7 are weighted sums of pairs (4,7) and (5,6); 8..15 are sums/differences
     258           0 :   output[0] = step[0];
     259           0 :   output[1] = step[1];
     260           0 :   output[2] = step[2];
     261           0 :   output[3] = step[3];
     262           0 :   temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
     263           0 :   output[4] = (tran_low_t)fdct_round_shift(temp);
     264           0 :   temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
     265           0 :   output[5] = (tran_low_t)fdct_round_shift(temp);
     266           0 :   temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
     267           0 :   output[6] = (tran_low_t)fdct_round_shift(temp);
     268           0 :   temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
     269           0 :   output[7] = (tran_low_t)fdct_round_shift(temp);
     270           0 :   output[8] = step[8] + step[9];
     271           0 :   output[9] = step[8] - step[9];
     272           0 :   output[10] = step[11] - step[10];
     273           0 :   output[11] = step[11] + step[10];
     274           0 :   output[12] = step[12] + step[13];
     275           0 :   output[13] = step[12] - step[13];
     276           0 :   output[14] = step[15] - step[14];
     277           0 :   output[15] = step[15] + step[14];
     278             : 
     279           0 :   range_check(output, 16, 16);
     280             : 
     281             :   // stage 6: entries 0..7 are copied; 8..15 are weighted sums of mirrored pairs (8,15), (9,14), (10,13), (11,12)
     282           0 :   step[0] = output[0];
     283           0 :   step[1] = output[1];
     284           0 :   step[2] = output[2];
     285           0 :   step[3] = output[3];
     286           0 :   step[4] = output[4];
     287           0 :   step[5] = output[5];
     288           0 :   step[6] = output[6];
     289           0 :   step[7] = output[7];
     290           0 :   temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
     291           0 :   step[8] = (tran_low_t)fdct_round_shift(temp);
     292           0 :   temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
     293           0 :   step[9] = (tran_low_t)fdct_round_shift(temp);
     294           0 :   temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
     295           0 :   step[10] = (tran_low_t)fdct_round_shift(temp);
     296           0 :   temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
     297           0 :   step[11] = (tran_low_t)fdct_round_shift(temp);
     298           0 :   temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
     299           0 :   step[12] = (tran_low_t)fdct_round_shift(temp);
     300           0 :   temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
     301           0 :   step[13] = (tran_low_t)fdct_round_shift(temp);
     302           0 :   temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
     303           0 :   step[14] = (tran_low_t)fdct_round_shift(temp);
     304           0 :   temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
     305           0 :   step[15] = (tran_low_t)fdct_round_shift(temp);
     306             : 
     307           0 :   range_check(step, 16, 16);
     308             : 
     309             :   // stage 7: copy step[] to output[] in bit-reversed index order (0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15)
     310           0 :   output[0] = step[0];
     311           0 :   output[1] = step[8];
     312           0 :   output[2] = step[4];
     313           0 :   output[3] = step[12];
     314           0 :   output[4] = step[2];
     315           0 :   output[5] = step[10];
     316           0 :   output[6] = step[6];
     317           0 :   output[7] = step[14];
     318           0 :   output[8] = step[1];
     319           0 :   output[9] = step[9];
     320           0 :   output[10] = step[5];
     321           0 :   output[11] = step[13];
     322           0 :   output[12] = step[3];
     323           0 :   output[13] = step[11];
     324           0 :   output[14] = step[7];
     325           0 :   output[15] = step[15];
     326             : 
     327           0 :   range_check(output, 16, 16);
     328           0 : }
     329             : 
     330           0 : static void fdct32(const tran_low_t *input, tran_low_t *output) {
     331             :   tran_high_t temp;
     332             :   tran_low_t step[32];
     333             : 
     334             :   // stage 0
     335           0 :   range_check(input, 32, 14);
     336             : 
     337             :   // stage 1
     338           0 :   output[0] = input[0] + input[31];
     339           0 :   output[1] = input[1] + input[30];
     340           0 :   output[2] = input[2] + input[29];
     341           0 :   output[3] = input[3] + input[28];
     342           0 :   output[4] = input[4] + input[27];
     343           0 :   output[5] = input[5] + input[26];
     344           0 :   output[6] = input[6] + input[25];
     345           0 :   output[7] = input[7] + input[24];
     346           0 :   output[8] = input[8] + input[23];
     347           0 :   output[9] = input[9] + input[22];
     348           0 :   output[10] = input[10] + input[21];
     349           0 :   output[11] = input[11] + input[20];
     350           0 :   output[12] = input[12] + input[19];
     351           0 :   output[13] = input[13] + input[18];
     352           0 :   output[14] = input[14] + input[17];
     353           0 :   output[15] = input[15] + input[16];
     354           0 :   output[16] = input[15] - input[16];
     355           0 :   output[17] = input[14] - input[17];
     356           0 :   output[18] = input[13] - input[18];
     357           0 :   output[19] = input[12] - input[19];
     358           0 :   output[20] = input[11] - input[20];
     359           0 :   output[21] = input[10] - input[21];
     360           0 :   output[22] = input[9] - input[22];
     361           0 :   output[23] = input[8] - input[23];
     362           0 :   output[24] = input[7] - input[24];
     363           0 :   output[25] = input[6] - input[25];
     364           0 :   output[26] = input[5] - input[26];
     365           0 :   output[27] = input[4] - input[27];
     366           0 :   output[28] = input[3] - input[28];
     367           0 :   output[29] = input[2] - input[29];
     368           0 :   output[30] = input[1] - input[30];
     369           0 :   output[31] = input[0] - input[31];
     370             : 
     371           0 :   range_check(output, 32, 15);
     372             : 
     373             :   // stage 2
     374           0 :   step[0] = output[0] + output[15];
     375           0 :   step[1] = output[1] + output[14];
     376           0 :   step[2] = output[2] + output[13];
     377           0 :   step[3] = output[3] + output[12];
     378           0 :   step[4] = output[4] + output[11];
     379           0 :   step[5] = output[5] + output[10];
     380           0 :   step[6] = output[6] + output[9];
     381           0 :   step[7] = output[7] + output[8];
     382           0 :   step[8] = output[7] - output[8];
     383           0 :   step[9] = output[6] - output[9];
     384           0 :   step[10] = output[5] - output[10];
     385           0 :   step[11] = output[4] - output[11];
     386           0 :   step[12] = output[3] - output[12];
     387           0 :   step[13] = output[2] - output[13];
     388           0 :   step[14] = output[1] - output[14];
     389           0 :   step[15] = output[0] - output[15];
     390           0 :   step[16] = output[16];
     391           0 :   step[17] = output[17];
     392           0 :   step[18] = output[18];
     393           0 :   step[19] = output[19];
     394           0 :   temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
     395           0 :   step[20] = (tran_low_t)fdct_round_shift(temp);
     396           0 :   temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
     397           0 :   step[21] = (tran_low_t)fdct_round_shift(temp);
     398           0 :   temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
     399           0 :   step[22] = (tran_low_t)fdct_round_shift(temp);
     400           0 :   temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
     401           0 :   step[23] = (tran_low_t)fdct_round_shift(temp);
     402           0 :   temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
     403           0 :   step[24] = (tran_low_t)fdct_round_shift(temp);
     404           0 :   temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
     405           0 :   step[25] = (tran_low_t)fdct_round_shift(temp);
     406           0 :   temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
     407           0 :   step[26] = (tran_low_t)fdct_round_shift(temp);
     408           0 :   temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
     409           0 :   step[27] = (tran_low_t)fdct_round_shift(temp);
     410           0 :   step[28] = output[28];
     411           0 :   step[29] = output[29];
     412           0 :   step[30] = output[30];
     413           0 :   step[31] = output[31];
     414             : 
     415           0 :   range_check(step, 32, 16);
     416             : 
     417             :   // stage 3
     418           0 :   output[0] = step[0] + step[7];
     419           0 :   output[1] = step[1] + step[6];
     420           0 :   output[2] = step[2] + step[5];
     421           0 :   output[3] = step[3] + step[4];
     422           0 :   output[4] = step[3] - step[4];
     423           0 :   output[5] = step[2] - step[5];
     424           0 :   output[6] = step[1] - step[6];
     425           0 :   output[7] = step[0] - step[7];
     426           0 :   output[8] = step[8];
     427           0 :   output[9] = step[9];
     428           0 :   temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
     429           0 :   output[10] = (tran_low_t)fdct_round_shift(temp);
     430           0 :   temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
     431           0 :   output[11] = (tran_low_t)fdct_round_shift(temp);
     432           0 :   temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
     433           0 :   output[12] = (tran_low_t)fdct_round_shift(temp);
     434           0 :   temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
     435           0 :   output[13] = (tran_low_t)fdct_round_shift(temp);
     436           0 :   output[14] = step[14];
     437           0 :   output[15] = step[15];
     438           0 :   output[16] = step[16] + step[23];
     439           0 :   output[17] = step[17] + step[22];
     440           0 :   output[18] = step[18] + step[21];
     441           0 :   output[19] = step[19] + step[20];
     442           0 :   output[20] = step[19] - step[20];
     443           0 :   output[21] = step[18] - step[21];
     444           0 :   output[22] = step[17] - step[22];
     445           0 :   output[23] = step[16] - step[23];
     446           0 :   output[24] = step[31] - step[24];
     447           0 :   output[25] = step[30] - step[25];
     448           0 :   output[26] = step[29] - step[26];
     449           0 :   output[27] = step[28] - step[27];
     450           0 :   output[28] = step[28] + step[27];
     451           0 :   output[29] = step[29] + step[26];
     452           0 :   output[30] = step[30] + step[25];
     453           0 :   output[31] = step[31] + step[24];
     454             : 
     455           0 :   range_check(output, 32, 17);
     456             : 
     457             :   // stage 4
     458           0 :   step[0] = output[0] + output[3];
     459           0 :   step[1] = output[1] + output[2];
     460           0 :   step[2] = output[1] - output[2];
     461           0 :   step[3] = output[0] - output[3];
     462           0 :   step[4] = output[4];
     463           0 :   temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
     464           0 :   step[5] = (tran_low_t)fdct_round_shift(temp);
     465           0 :   temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
     466           0 :   step[6] = (tran_low_t)fdct_round_shift(temp);
     467           0 :   step[7] = output[7];
     468           0 :   step[8] = output[8] + output[11];
     469           0 :   step[9] = output[9] + output[10];
     470           0 :   step[10] = output[9] - output[10];
     471           0 :   step[11] = output[8] - output[11];
     472           0 :   step[12] = output[15] - output[12];
     473           0 :   step[13] = output[14] - output[13];
     474           0 :   step[14] = output[14] + output[13];
     475           0 :   step[15] = output[15] + output[12];
     476           0 :   step[16] = output[16];
     477           0 :   step[17] = output[17];
     478           0 :   temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
     479           0 :   step[18] = (tran_low_t)fdct_round_shift(temp);
     480           0 :   temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
     481           0 :   step[19] = (tran_low_t)fdct_round_shift(temp);
     482           0 :   temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
     483           0 :   step[20] = (tran_low_t)fdct_round_shift(temp);
     484           0 :   temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
     485           0 :   step[21] = (tran_low_t)fdct_round_shift(temp);
     486           0 :   step[22] = output[22];
     487           0 :   step[23] = output[23];
     488           0 :   step[24] = output[24];
     489           0 :   step[25] = output[25];
     490           0 :   temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
     491           0 :   step[26] = (tran_low_t)fdct_round_shift(temp);
     492           0 :   temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
     493           0 :   step[27] = (tran_low_t)fdct_round_shift(temp);
     494           0 :   temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
     495           0 :   step[28] = (tran_low_t)fdct_round_shift(temp);
     496           0 :   temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
     497           0 :   step[29] = (tran_low_t)fdct_round_shift(temp);
     498           0 :   step[30] = output[30];
     499           0 :   step[31] = output[31];
     500             : 
     501           0 :   range_check(step, 32, 18);
     502             : 
     503             :   // stage 5
     504           0 :   temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
     505           0 :   output[0] = (tran_low_t)fdct_round_shift(temp);
     506           0 :   temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
     507           0 :   output[1] = (tran_low_t)fdct_round_shift(temp);
     508           0 :   temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     509           0 :   output[2] = (tran_low_t)fdct_round_shift(temp);
     510           0 :   temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
     511           0 :   output[3] = (tran_low_t)fdct_round_shift(temp);
     512           0 :   output[4] = step[4] + step[5];
     513           0 :   output[5] = step[4] - step[5];
     514           0 :   output[6] = step[7] - step[6];
     515           0 :   output[7] = step[7] + step[6];
     516           0 :   output[8] = step[8];
     517           0 :   temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
     518           0 :   output[9] = (tran_low_t)fdct_round_shift(temp);
     519           0 :   temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
     520           0 :   output[10] = (tran_low_t)fdct_round_shift(temp);
     521           0 :   output[11] = step[11];
     522           0 :   output[12] = step[12];
     523           0 :   temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
     524           0 :   output[13] = (tran_low_t)fdct_round_shift(temp);
     525           0 :   temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
     526           0 :   output[14] = (tran_low_t)fdct_round_shift(temp);
     527           0 :   output[15] = step[15];
     528           0 :   output[16] = step[16] + step[19];
     529           0 :   output[17] = step[17] + step[18];
     530           0 :   output[18] = step[17] - step[18];
     531           0 :   output[19] = step[16] - step[19];
     532           0 :   output[20] = step[23] - step[20];
     533           0 :   output[21] = step[22] - step[21];
     534           0 :   output[22] = step[22] + step[21];
     535           0 :   output[23] = step[23] + step[20];
     536           0 :   output[24] = step[24] + step[27];
     537           0 :   output[25] = step[25] + step[26];
     538           0 :   output[26] = step[25] - step[26];
     539           0 :   output[27] = step[24] - step[27];
     540           0 :   output[28] = step[31] - step[28];
     541           0 :   output[29] = step[30] - step[29];
     542           0 :   output[30] = step[30] + step[29];
     543           0 :   output[31] = step[31] + step[28];
     544             : 
     545           0 :   range_check(output, 32, 18);
     546             : 
     547             :   // stage 6
     548           0 :   step[0] = output[0];
     549           0 :   step[1] = output[1];
     550           0 :   step[2] = output[2];
     551           0 :   step[3] = output[3];
     552           0 :   temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
     553           0 :   step[4] = (tran_low_t)fdct_round_shift(temp);
     554           0 :   temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
     555           0 :   step[5] = (tran_low_t)fdct_round_shift(temp);
     556           0 :   temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
     557           0 :   step[6] = (tran_low_t)fdct_round_shift(temp);
     558           0 :   temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
     559           0 :   step[7] = (tran_low_t)fdct_round_shift(temp);
     560           0 :   step[8] = output[8] + output[9];
     561           0 :   step[9] = output[8] - output[9];
     562           0 :   step[10] = output[11] - output[10];
     563           0 :   step[11] = output[11] + output[10];
     564           0 :   step[12] = output[12] + output[13];
     565           0 :   step[13] = output[12] - output[13];
     566           0 :   step[14] = output[15] - output[14];
     567           0 :   step[15] = output[15] + output[14];
     568           0 :   step[16] = output[16];
     569           0 :   temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
     570           0 :   step[17] = (tran_low_t)fdct_round_shift(temp);
     571           0 :   temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
     572           0 :   step[18] = (tran_low_t)fdct_round_shift(temp);
     573           0 :   step[19] = output[19];
     574           0 :   step[20] = output[20];
     575           0 :   temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
     576           0 :   step[21] = (tran_low_t)fdct_round_shift(temp);
     577           0 :   temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
     578           0 :   step[22] = (tran_low_t)fdct_round_shift(temp);
     579           0 :   step[23] = output[23];
     580           0 :   step[24] = output[24];
     581           0 :   temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
     582           0 :   step[25] = (tran_low_t)fdct_round_shift(temp);
     583           0 :   temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
     584           0 :   step[26] = (tran_low_t)fdct_round_shift(temp);
     585           0 :   step[27] = output[27];
     586           0 :   step[28] = output[28];
     587           0 :   temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
     588           0 :   step[29] = (tran_low_t)fdct_round_shift(temp);
     589           0 :   temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
     590           0 :   step[30] = (tran_low_t)fdct_round_shift(temp);
     591           0 :   step[31] = output[31];
     592             : 
     593           0 :   range_check(step, 32, 18);
     594             : 
     595             :   // stage 7
     596           0 :   output[0] = step[0];
     597           0 :   output[1] = step[1];
     598           0 :   output[2] = step[2];
     599           0 :   output[3] = step[3];
     600           0 :   output[4] = step[4];
     601           0 :   output[5] = step[5];
     602           0 :   output[6] = step[6];
     603           0 :   output[7] = step[7];
     604           0 :   temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
     605           0 :   output[8] = (tran_low_t)fdct_round_shift(temp);
     606           0 :   temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
     607           0 :   output[9] = (tran_low_t)fdct_round_shift(temp);
     608           0 :   temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
     609           0 :   output[10] = (tran_low_t)fdct_round_shift(temp);
     610           0 :   temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
     611           0 :   output[11] = (tran_low_t)fdct_round_shift(temp);
     612           0 :   temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
     613           0 :   output[12] = (tran_low_t)fdct_round_shift(temp);
     614           0 :   temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
     615           0 :   output[13] = (tran_low_t)fdct_round_shift(temp);
     616           0 :   temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
     617           0 :   output[14] = (tran_low_t)fdct_round_shift(temp);
     618           0 :   temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
     619           0 :   output[15] = (tran_low_t)fdct_round_shift(temp);
     620           0 :   output[16] = step[16] + step[17];
     621           0 :   output[17] = step[16] - step[17];
     622           0 :   output[18] = step[19] - step[18];
     623           0 :   output[19] = step[19] + step[18];
     624           0 :   output[20] = step[20] + step[21];
     625           0 :   output[21] = step[20] - step[21];
     626           0 :   output[22] = step[23] - step[22];
     627           0 :   output[23] = step[23] + step[22];
     628           0 :   output[24] = step[24] + step[25];
     629           0 :   output[25] = step[24] - step[25];
     630           0 :   output[26] = step[27] - step[26];
     631           0 :   output[27] = step[27] + step[26];
     632           0 :   output[28] = step[28] + step[29];
     633           0 :   output[29] = step[28] - step[29];
     634           0 :   output[30] = step[31] - step[30];
     635           0 :   output[31] = step[31] + step[30];
     636             : 
     637           0 :   range_check(output, 32, 18);
     638             : 
     639             :   // stage 8
     640           0 :   step[0] = output[0];
     641           0 :   step[1] = output[1];
     642           0 :   step[2] = output[2];
     643           0 :   step[3] = output[3];
     644           0 :   step[4] = output[4];
     645           0 :   step[5] = output[5];
     646           0 :   step[6] = output[6];
     647           0 :   step[7] = output[7];
     648           0 :   step[8] = output[8];
     649           0 :   step[9] = output[9];
     650           0 :   step[10] = output[10];
     651           0 :   step[11] = output[11];
     652           0 :   step[12] = output[12];
     653           0 :   step[13] = output[13];
     654           0 :   step[14] = output[14];
     655           0 :   step[15] = output[15];
     656           0 :   temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
     657           0 :   step[16] = (tran_low_t)fdct_round_shift(temp);
     658           0 :   temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
     659           0 :   step[17] = (tran_low_t)fdct_round_shift(temp);
     660           0 :   temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
     661           0 :   step[18] = (tran_low_t)fdct_round_shift(temp);
     662           0 :   temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
     663           0 :   step[19] = (tran_low_t)fdct_round_shift(temp);
     664           0 :   temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
     665           0 :   step[20] = (tran_low_t)fdct_round_shift(temp);
     666           0 :   temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
     667           0 :   step[21] = (tran_low_t)fdct_round_shift(temp);
     668           0 :   temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
     669           0 :   step[22] = (tran_low_t)fdct_round_shift(temp);
     670           0 :   temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
     671           0 :   step[23] = (tran_low_t)fdct_round_shift(temp);
     672           0 :   temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
     673           0 :   step[24] = (tran_low_t)fdct_round_shift(temp);
     674           0 :   temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
     675           0 :   step[25] = (tran_low_t)fdct_round_shift(temp);
     676           0 :   temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
     677           0 :   step[26] = (tran_low_t)fdct_round_shift(temp);
     678           0 :   temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
     679           0 :   step[27] = (tran_low_t)fdct_round_shift(temp);
     680           0 :   temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
     681           0 :   step[28] = (tran_low_t)fdct_round_shift(temp);
     682           0 :   temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
     683           0 :   step[29] = (tran_low_t)fdct_round_shift(temp);
     684           0 :   temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
     685           0 :   step[30] = (tran_low_t)fdct_round_shift(temp);
     686           0 :   temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
     687           0 :   step[31] = (tran_low_t)fdct_round_shift(temp);
     688             : 
     689           0 :   range_check(step, 32, 18);
     690             : 
     691             :   // stage 9
     692           0 :   output[0] = step[0];
     693           0 :   output[1] = step[16];
     694           0 :   output[2] = step[8];
     695           0 :   output[3] = step[24];
     696           0 :   output[4] = step[4];
     697           0 :   output[5] = step[20];
     698           0 :   output[6] = step[12];
     699           0 :   output[7] = step[28];
     700           0 :   output[8] = step[2];
     701           0 :   output[9] = step[18];
     702           0 :   output[10] = step[10];
     703           0 :   output[11] = step[26];
     704           0 :   output[12] = step[6];
     705           0 :   output[13] = step[22];
     706           0 :   output[14] = step[14];
     707           0 :   output[15] = step[30];
     708           0 :   output[16] = step[1];
     709           0 :   output[17] = step[17];
     710           0 :   output[18] = step[9];
     711           0 :   output[19] = step[25];
     712           0 :   output[20] = step[5];
     713           0 :   output[21] = step[21];
     714           0 :   output[22] = step[13];
     715           0 :   output[23] = step[29];
     716           0 :   output[24] = step[3];
     717           0 :   output[25] = step[19];
     718           0 :   output[26] = step[11];
     719           0 :   output[27] = step[27];
     720           0 :   output[28] = step[7];
     721           0 :   output[29] = step[23];
     722           0 :   output[30] = step[15];
     723           0 :   output[31] = step[31];
     724             : 
     725           0 :   range_check(output, 32, 18);
     726           0 : }
     727             : 
     728             : #ifndef AV1_DCT_GTEST
     729             : 
     730           0 : static void fadst4(const tran_low_t *input, tran_low_t *output) {
                         // 1-D 4-point forward transform: reads input[0..3] and writes
                         // output[0..3]. The FHT table in av1_fht4x4_c selects it for the
                         // ADST and FLIPADST transform types.
     731             :   tran_high_t x0, x1, x2, x3;
     732             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
     733             : 
     734           0 :   x0 = input[0];
     735           0 :   x1 = input[1];
     736           0 :   x2 = input[2];
     737           0 :   x3 = input[3];
     738             : 
                         // Fast path: the bitwise OR is zero only when all four inputs are
                         // zero, in which case every output is zero as well.
     739           0 :   if (!(x0 | x1 | x2 | x3)) {
     740           0 :     output[0] = output[1] = output[2] = output[3] = 0;
     741           0 :     return;
     742             :   }
     743             : 
                         // Products with the sinpi_k_9 constants are kept in tran_high_t and
                         // are only rounded once, when the outputs are written.
     744           0 :   s0 = sinpi_1_9 * x0;
     745           0 :   s1 = sinpi_4_9 * x0;
     746           0 :   s2 = sinpi_2_9 * x1;
     747           0 :   s3 = sinpi_1_9 * x1;
     748           0 :   s4 = sinpi_3_9 * x2;
     749           0 :   s5 = sinpi_4_9 * x3;
     750           0 :   s6 = sinpi_2_9 * x3;
     751           0 :   s7 = x0 + x1 - x3;  // unscaled here; multiplied by sinpi_3_9 below
     752             : 
     753           0 :   x0 = s0 + s2 + s5;
     754           0 :   x1 = sinpi_3_9 * s7;
     755           0 :   x2 = s1 - s3 + s6;
     756           0 :   x3 = s4;
     757             : 
     758           0 :   s0 = x0 + x3;
     759           0 :   s1 = x1;
     760           0 :   s2 = x2 - x3;
     761           0 :   s3 = x2 - x0 + x3;
     762             : 
     763             :   // 1-D transform scaling factor is sqrt(2).
     764           0 :   output[0] = (tran_low_t)fdct_round_shift(s0);
     765           0 :   output[1] = (tran_low_t)fdct_round_shift(s1);
     766           0 :   output[2] = (tran_low_t)fdct_round_shift(s2);
     767           0 :   output[3] = (tran_low_t)fdct_round_shift(s3);
     768             : }
     769             : 
     770           0 : static void fadst8(const tran_low_t *input, tran_low_t *output) {
                         // 1-D 8-point forward transform: reads input[0..7] and writes
                         // output[0..7]. It is computed as three butterfly stages on
                         // tran_high_t intermediates.
     771             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
     772             : 
                         // The inputs are loaded in a permuted order (7, 0, 5, 2, 3, 4, 1, 6).
     773           0 :   tran_high_t x0 = input[7];
     774           0 :   tran_high_t x1 = input[0];
     775           0 :   tran_high_t x2 = input[5];
     776           0 :   tran_high_t x3 = input[2];
     777           0 :   tran_high_t x4 = input[3];
     778           0 :   tran_high_t x5 = input[4];
     779           0 :   tran_high_t x6 = input[1];
     780           0 :   tran_high_t x7 = input[6];
     781             : 
     782             :   // stage 1
     783           0 :   s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
     784           0 :   s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
     785           0 :   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
     786           0 :   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
     787           0 :   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
     788           0 :   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
     789           0 :   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
     790           0 :   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
     791             : 
                         // The sums x0..x3 are carried unrounded and are rounded at the end of
                         // stage 2; only the differences x4..x7 are rounded here because they
                         // are multiplied again in stage 2.
     792           0 :   x0 = s0 + s4;
     793           0 :   x1 = s1 + s5;
     794           0 :   x2 = s2 + s6;
     795           0 :   x3 = s3 + s7;
     796           0 :   x4 = fdct_round_shift(s0 - s4);
     797           0 :   x5 = fdct_round_shift(s1 - s5);
     798           0 :   x6 = fdct_round_shift(s2 - s6);
     799           0 :   x7 = fdct_round_shift(s3 - s7);
     800             : 
     801             :   // stage 2
     802           0 :   s0 = x0;
     803           0 :   s1 = x1;
     804           0 :   s2 = x2;
     805           0 :   s3 = x3;
     806           0 :   s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
     807           0 :   s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
     808           0 :   s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
     809           0 :   s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
     810             : 
     811           0 :   x0 = fdct_round_shift(s0 + s2);
     812           0 :   x1 = fdct_round_shift(s1 + s3);
     813           0 :   x2 = fdct_round_shift(s0 - s2);
     814           0 :   x3 = fdct_round_shift(s1 - s3);
     815           0 :   x4 = fdct_round_shift(s4 + s6);
     816           0 :   x5 = fdct_round_shift(s5 + s7);
     817           0 :   x6 = fdct_round_shift(s4 - s6);
     818           0 :   x7 = fdct_round_shift(s5 - s7);
     819             : 
     820             :   // stage 3
                         // Only x2, x3, x6 and x7 are modified in this stage; x0, x1, x4 and x5
                         // go to the output as computed in stage 2.
     821           0 :   s2 = cospi_16_64 * (x2 + x3);
     822           0 :   s3 = cospi_16_64 * (x2 - x3);
     823           0 :   s6 = cospi_16_64 * (x6 + x7);
     824           0 :   s7 = cospi_16_64 * (x6 - x7);
     825             : 
     826           0 :   x2 = fdct_round_shift(s2);
     827           0 :   x3 = fdct_round_shift(s3);
     828           0 :   x6 = fdct_round_shift(s6);
     829           0 :   x7 = fdct_round_shift(s7);
     830             : 
                         // Results are stored in permuted order, with every odd-indexed output
                         // negated.
     831           0 :   output[0] = (tran_low_t)x0;
     832           0 :   output[1] = (tran_low_t)-x4;
     833           0 :   output[2] = (tran_low_t)x6;
     834           0 :   output[3] = (tran_low_t)-x2;
     835           0 :   output[4] = (tran_low_t)x3;
     836           0 :   output[5] = (tran_low_t)-x7;
     837           0 :   output[6] = (tran_low_t)x5;
     838           0 :   output[7] = (tran_low_t)-x1;
     839           0 : }
     840             : 
     841           0 : static void fadst16(const tran_low_t *input, tran_low_t *output) {
                         // 1-D 16-point forward transform: reads input[0..15] and writes
                         // output[0..15]. It is computed as four butterfly stages on
                         // tran_high_t intermediates.
     842             :   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
     843             :   tran_high_t s9, s10, s11, s12, s13, s14, s15;
     844             : 
                         // The inputs are loaded in a permuted order: even x slots take
                         // input[15], input[13], ..., input[1]; odd x slots take input[0],
                         // input[2], ..., input[14].
     845           0 :   tran_high_t x0 = input[15];
     846           0 :   tran_high_t x1 = input[0];
     847           0 :   tran_high_t x2 = input[13];
     848           0 :   tran_high_t x3 = input[2];
     849           0 :   tran_high_t x4 = input[11];
     850           0 :   tran_high_t x5 = input[4];
     851           0 :   tran_high_t x6 = input[9];
     852           0 :   tran_high_t x7 = input[6];
     853           0 :   tran_high_t x8 = input[7];
     854           0 :   tran_high_t x9 = input[8];
     855           0 :   tran_high_t x10 = input[5];
     856           0 :   tran_high_t x11 = input[10];
     857           0 :   tran_high_t x12 = input[3];
     858           0 :   tran_high_t x13 = input[12];
     859           0 :   tran_high_t x14 = input[1];
     860           0 :   tran_high_t x15 = input[14];
     861             : 
     862             :   // stage 1
     863           0 :   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
     864           0 :   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
     865           0 :   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
     866           0 :   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
     867           0 :   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
     868           0 :   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
     869           0 :   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
     870           0 :   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
     871           0 :   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
     872           0 :   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
     873           0 :   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
     874           0 :   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
     875           0 :   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
     876           0 :   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
     877           0 :   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
     878           0 :   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
     879             : 
                         // The sums x0..x7 stay unrounded (they are only added or subtracted in
                         // stage 2); the differences x8..x15 are rounded because stage 2
                         // multiplies them again.
     880           0 :   x0 = s0 + s8;
     881           0 :   x1 = s1 + s9;
     882           0 :   x2 = s2 + s10;
     883           0 :   x3 = s3 + s11;
     884           0 :   x4 = s4 + s12;
     885           0 :   x5 = s5 + s13;
     886           0 :   x6 = s6 + s14;
     887           0 :   x7 = s7 + s15;
     888             : 
     889           0 :   x8 = fdct_round_shift(s0 - s8);
     890           0 :   x9 = fdct_round_shift(s1 - s9);
     891           0 :   x10 = fdct_round_shift(s2 - s10);
     892           0 :   x11 = fdct_round_shift(s3 - s11);
     893           0 :   x12 = fdct_round_shift(s4 - s12);
     894           0 :   x13 = fdct_round_shift(s5 - s13);
     895           0 :   x14 = fdct_round_shift(s6 - s14);
     896           0 :   x15 = fdct_round_shift(s7 - s15);
     897             : 
     898             :   // stage 2
     899           0 :   s0 = x0;
     900           0 :   s1 = x1;
     901           0 :   s2 = x2;
     902           0 :   s3 = x3;
     903           0 :   s4 = x4;
     904           0 :   s5 = x5;
     905           0 :   s6 = x6;
     906           0 :   s7 = x7;
     907           0 :   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
     908           0 :   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
     909           0 :   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
     910           0 :   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
     911           0 :   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
     912           0 :   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
     913           0 :   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
     914           0 :   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
     915             : 
                         // As in stage 1, only the values that stage 3 multiplies again
                         // (x4..x7 and x12..x15) are rounded here.
     916           0 :   x0 = s0 + s4;
     917           0 :   x1 = s1 + s5;
     918           0 :   x2 = s2 + s6;
     919           0 :   x3 = s3 + s7;
     920           0 :   x4 = fdct_round_shift(s0 - s4);
     921           0 :   x5 = fdct_round_shift(s1 - s5);
     922           0 :   x6 = fdct_round_shift(s2 - s6);
     923           0 :   x7 = fdct_round_shift(s3 - s7);
     924             : 
     925           0 :   x8 = s8 + s12;
     926           0 :   x9 = s9 + s13;
     927           0 :   x10 = s10 + s14;
     928           0 :   x11 = s11 + s15;
     929           0 :   x12 = fdct_round_shift(s8 - s12);
     930           0 :   x13 = fdct_round_shift(s9 - s13);
     931           0 :   x14 = fdct_round_shift(s10 - s14);
     932           0 :   x15 = fdct_round_shift(s11 - s15);
     933             : 
     934             :   // stage 3
     935           0 :   s0 = x0;
     936           0 :   s1 = x1;
     937           0 :   s2 = x2;
     938           0 :   s3 = x3;
     939           0 :   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
     940           0 :   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
     941           0 :   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
     942           0 :   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
     943           0 :   s8 = x8;
     944           0 :   s9 = x9;
     945           0 :   s10 = x10;
     946           0 :   s11 = x11;
     947           0 :   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
     948           0 :   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
     949           0 :   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
     950           0 :   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
     951             : 
                         // Every value is rounded at the end of stage 3.
     952           0 :   x0 = fdct_round_shift(s0 + s2);
     953           0 :   x1 = fdct_round_shift(s1 + s3);
     954           0 :   x2 = fdct_round_shift(s0 - s2);
     955           0 :   x3 = fdct_round_shift(s1 - s3);
     956             : 
     957           0 :   x4 = fdct_round_shift(s4 + s6);
     958           0 :   x5 = fdct_round_shift(s5 + s7);
     959           0 :   x6 = fdct_round_shift(s4 - s6);
     960           0 :   x7 = fdct_round_shift(s5 - s7);
     961             : 
     962           0 :   x8 = fdct_round_shift(s8 + s10);
     963           0 :   x9 = fdct_round_shift(s9 + s11);
     964           0 :   x10 = fdct_round_shift(s8 - s10);
     965           0 :   x11 = fdct_round_shift(s9 - s11);
     966             : 
     967           0 :   x12 = fdct_round_shift(s12 + s14);
     968           0 :   x13 = fdct_round_shift(s13 + s15);
     969           0 :   x14 = fdct_round_shift(s12 - s14);
     970           0 :   x15 = fdct_round_shift(s13 - s15);
     971             : 
     972             :   // stage 4
                         // Only x2, x3, x6, x7, x10, x11, x14 and x15 are modified in this
                         // stage; the remaining values go to the output as left by stage 3.
     973           0 :   s2 = (-cospi_16_64) * (x2 + x3);
     974           0 :   s3 = cospi_16_64 * (x2 - x3);
     975           0 :   s6 = cospi_16_64 * (x6 + x7);
     976           0 :   s7 = cospi_16_64 * (-x6 + x7);
     977           0 :   s10 = cospi_16_64 * (x10 + x11);
     978           0 :   s11 = cospi_16_64 * (-x10 + x11);
     979           0 :   s14 = (-cospi_16_64) * (x14 + x15);
     980           0 :   s15 = cospi_16_64 * (x14 - x15);
     981             : 
     982           0 :   x2 = fdct_round_shift(s2);
     983           0 :   x3 = fdct_round_shift(s3);
     984           0 :   x6 = fdct_round_shift(s6);
     985           0 :   x7 = fdct_round_shift(s7);
     986           0 :   x10 = fdct_round_shift(s10);
     987           0 :   x11 = fdct_round_shift(s11);
     988           0 :   x14 = fdct_round_shift(s14);
     989           0 :   x15 = fdct_round_shift(s15);
     990             : 
                         // Results are stored in permuted order; only outputs 1, 3, 13 and 15
                         // are negated here (the other sign changes are folded into the
                         // stage 4 multipliers above).
     991           0 :   output[0] = (tran_low_t)x0;
     992           0 :   output[1] = (tran_low_t)-x8;
     993           0 :   output[2] = (tran_low_t)x12;
     994           0 :   output[3] = (tran_low_t)-x4;
     995           0 :   output[4] = (tran_low_t)x6;
     996           0 :   output[5] = (tran_low_t)x14;
     997           0 :   output[6] = (tran_low_t)x10;
     998           0 :   output[7] = (tran_low_t)x2;
     999           0 :   output[8] = (tran_low_t)x3;
    1000           0 :   output[9] = (tran_low_t)x11;
    1001           0 :   output[10] = (tran_low_t)x15;
    1002           0 :   output[11] = (tran_low_t)x7;
    1003           0 :   output[12] = (tran_low_t)x5;
    1004           0 :   output[13] = (tran_low_t)-x13;
    1005           0 :   output[14] = (tran_low_t)x9;
    1006           0 :   output[15] = (tran_low_t)-x1;
    1007           0 : }
    1008             : 
    1009             : // For use in lieu of ADST
    1010           0 : static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
                         // 32-point 1-D transform: reads input[0..31] and writes
                         // output[16..31] directly, then calls fdct16 with output as its
                         // destination (presumably filling output[0..15]; fdct16 is defined
                         // outside this view).
                         // NOTE(review): input and output must not alias. output[16..31] is
                         // written by the first loop before input[16..31] is read by the
                         // second one.
    1011             :   int i;
    1012             :   tran_low_t inputhalf[16];
                         // The first 16 inputs are copied to the upper half of the output,
                         // scaled by 4.
    1013           0 :   for (i = 0; i < 16; ++i) {
    1014           0 :     output[16 + i] = input[i] * 4;
    1015             :   }
    1016             :   // Multiply input by sqrt(2)
    1017           0 :   for (i = 0; i < 16; ++i) {
    1018           0 :     inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
    1019             :   }
                         // The last 16 inputs, pre-scaled above, go through the 16-point DCT.
    1020           0 :   fdct16(inputhalf, output);
    1021             :   // Note overall scaling factor is 4 times orthogonal
    1022           0 : }
    1023             : 
    1024             : #if CONFIG_EXT_TX
    1025             : // TODO(sarahparker) these functions will be removed once the highbitdepth
    1026             : // codepath works properly for rectangular transforms. They have almost
    1027             : // identical versions in av1_fwd_txfm1d.c, but those are currently only
    1028             : // being used for square transforms.
    1029           0 : static void fidtx4(const tran_low_t *input, tran_low_t *output) {
                         // 4-point identity transform: each of the 4 outputs is the matching
                         // input multiplied by Sqrt2 and passed through fdct_round_shift.
    1030             :   int i;
    1031           0 :   for (i = 0; i < 4; ++i)
    1032           0 :     output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
    1033           0 : }
    1034             : 
    1035           0 : static void fidtx8(const tran_low_t *input, tran_low_t *output) {
                         // 8-point identity transform: each of the 8 outputs is the matching
                         // input multiplied by 2 (no rounding step is needed).
    1036             :   int i;
    1037           0 :   for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
    1038           0 : }
    1039             : 
    1040           0 : static void fidtx16(const tran_low_t *input, tran_low_t *output) {
                         // 16-point identity transform: each of the 16 outputs is the
                         // matching input multiplied by 2 * Sqrt2 and passed through
                         // fdct_round_shift.
    1041             :   int i;
    1042           0 :   for (i = 0; i < 16; ++i)
    1043           0 :     output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
    1044           0 : }
    1045             : 
    1046           0 : static void fidtx32(const tran_low_t *input, tran_low_t *output) {
                         // 32-point identity transform: each of the 32 outputs is the
                         // matching input multiplied by 4 (no rounding step is needed).
    1047             :   int i;
    1048           0 :   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
    1049           0 : }
    1050             : 
    1051           0 : static void copy_block(const int16_t *src, int src_stride, int l, int w,
    1052             :                        int16_t *dest, int dest_stride) {
                         // Copies an l-row by w-column block of int16_t samples from src to
                         // dest. src_stride and dest_stride are the distances, in elements,
                         // between the starts of consecutive rows in each buffer.
                         // Rows are copied with memcpy, so a source row must not overlap the
                         // destination row it is copied to.
    1053             :   int i;
    1054           0 :   for (i = 0; i < l; ++i) {
    1055           0 :     memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t));
    1056             :   }
    1057           0 : }
    1058             : 
    1059           0 : static void fliplr(int16_t *dest, int stride, int l, int w) {
                         // Mirrors an l-row by w-column block left-to-right, in place.
                         // stride is the distance, in elements, between consecutive rows.
    1060             :   int i, j;
    1061           0 :   for (i = 0; i < l; ++i) {
                           // Swap column j with column w - 1 - j; when w is odd the middle
                           // column stays where it is.
    1062           0 :     for (j = 0; j < w / 2; ++j) {
    1063           0 :       const int16_t tmp = dest[i * stride + j];
    1064           0 :       dest[i * stride + j] = dest[i * stride + w - 1 - j];
    1065           0 :       dest[i * stride + w - 1 - j] = tmp;
    1066             :     }
    1067             :   }
    1068           0 : }
    1069             : 
    1070           0 : static void flipud(int16_t *dest, int stride, int l, int w) {
                         // Mirrors an l-row by w-column block top-to-bottom, in place.
                         // stride is the distance, in elements, between consecutive rows.
    1071             :   int i, j;
    1072           0 :   for (j = 0; j < w; ++j) {
                           // Swap row i with row l - 1 - i within column j; when l is odd the
                           // middle row stays where it is.
    1073           0 :     for (i = 0; i < l / 2; ++i) {
    1074           0 :       const int16_t tmp = dest[i * stride + j];
    1075           0 :       dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
    1076           0 :       dest[(l - 1 - i) * stride + j] = tmp;
    1077             :     }
    1078             :   }
    1079           0 : }
    1080             : 
    1081           0 : static void fliplrud(int16_t *dest, int stride, int l, int w) {
                         // Mirrors an l-row by w-column block both left-to-right and
                         // top-to-bottom, in place, by swapping element (i, j) with element
                         // (l - 1 - i, w - 1 - j). stride is the distance, in elements,
                         // between consecutive rows.
                         // NOTE(review): only rows 0 .. l / 2 - 1 are visited, so for an odd l
                         // the middle row would be left unreversed. The caller visible in
                         // this file passes l = 4.
    1082             :   int i, j;
    1083           0 :   for (i = 0; i < l / 2; ++i) {
    1084           0 :     for (j = 0; j < w; ++j) {
    1085           0 :       const int16_t tmp = dest[i * stride + j];
    1086           0 :       dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
    1087           0 :       dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
    1088             :     }
    1089             :   }
    1090           0 : }
    1091             : 
    1092           0 : static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
    1093             :                         int16_t *dest, int dest_stride) {
                         // Copies the l x w block at src into dest, then mirrors the copy
                         // left-to-right in place; src itself is not modified.
    1094           0 :   copy_block(src, src_stride, l, w, dest, dest_stride);
    1095           0 :   fliplr(dest, dest_stride, l, w);
    1096           0 : }
    1097             : 
    1098           0 : static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
    1099             :                         int16_t *dest, int dest_stride) {
                         // Copies the l x w block at src into dest, then mirrors the copy
                         // top-to-bottom in place; src itself is not modified.
    1100           0 :   copy_block(src, src_stride, l, w, dest, dest_stride);
    1101           0 :   flipud(dest, dest_stride, l, w);
    1102           0 : }
    1103             : 
    1104           0 : static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
    1105             :                           int16_t *dest, int dest_stride) {
                         // Copies the l x w block at src into dest, then applies fliplrud to
                         // the copy (mirroring in both directions); src itself is not
                         // modified.
    1106           0 :   copy_block(src, src_stride, l, w, dest, dest_stride);
    1107           0 :   fliplrud(dest, dest_stride, l, w);
    1108           0 : }
    1109             : 
    1110           0 : static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
    1111             :                              int16_t *buff, int tx_type) {
                         // For the FLIPADST transform types, writes a flipped copy of the
                         // l x w input block into buff and redirects *src / *src_stride to
                         // that copy (the copy is stored with a row stride of w). For every
                         // other known tx_type the arguments are left untouched.
                         // buff must have room for l * w int16_t values; the visible caller
                         // passes a 4 * 4 array for l = w = 4.
    1112           0 :   switch (tx_type) {
                           // No flip needed for these types.
    1113             :     case DCT_DCT:
    1114             :     case ADST_DCT:
    1115             :     case DCT_ADST:
    1116             :     case ADST_ADST:
    1117             :     case IDTX:
    1118             :     case V_DCT:
    1119             :     case H_DCT:
    1120             :     case V_ADST:
    1121           0 :     case H_ADST: break;
                           // Flip top-to-bottom.
    1122             :     case FLIPADST_DCT:
    1123             :     case FLIPADST_ADST:
    1124             :     case V_FLIPADST:
    1125           0 :       copy_flipud(*src, *src_stride, l, w, buff, w);
    1126           0 :       *src = buff;
    1127           0 :       *src_stride = w;
    1128           0 :       break;
                           // Flip left-to-right.
    1129             :     case DCT_FLIPADST:
    1130             :     case ADST_FLIPADST:
    1131             :     case H_FLIPADST:
    1132           0 :       copy_fliplr(*src, *src_stride, l, w, buff, w);
    1133           0 :       *src = buff;
    1134           0 :       *src_stride = w;
    1135           0 :       break;
                           // Flip in both directions.
    1136             :     case FLIPADST_FLIPADST:
    1137           0 :       copy_fliplrud(*src, *src_stride, l, w, buff, w);
    1138           0 :       *src = buff;
    1139           0 :       *src_stride = w;
    1140           0 :       break;
                           // Unknown tx_type: trips the assert in debug builds; with NDEBUG
                           // the input is left unflipped.
    1141           0 :     default: assert(0); break;
    1142             :   }
    1143           0 : }
    1144             : #endif  // CONFIG_EXT_TX
    1145             : 
    1146           0 : void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
    1147             :                   int tx_type) {
    1148           0 :   if (tx_type == DCT_DCT) {
    1149           0 :     aom_fdct4x4_c(input, output, stride);
    1150             :   } else {
    1151             :     static const transform_2d FHT[] = {
    1152             :       { fdct4, fdct4 },    // DCT_DCT
    1153             :       { fadst4, fdct4 },   // ADST_DCT
    1154             :       { fdct4, fadst4 },   // DCT_ADST
    1155             :       { fadst4, fadst4 },  // ADST_ADST
    1156             : #if CONFIG_EXT_TX
    1157             :       { fadst4, fdct4 },   // FLIPADST_DCT
    1158             :       { fdct4, fadst4 },   // DCT_FLIPADST
    1159             :       { fadst4, fadst4 },  // FLIPADST_FLIPADST
    1160             :       { fadst4, fadst4 },  // ADST_FLIPADST
    1161             :       { fadst4, fadst4 },  // FLIPADST_ADST
    1162             :       { fidtx4, fidtx4 },  // IDTX
    1163             :       { fdct4, fidtx4 },   // V_DCT
    1164             :       { fidtx4, fdct4 },   // H_DCT
    1165             :       { fadst4, fidtx4 },  // V_ADST
    1166             :       { fidtx4, fadst4 },  // H_ADST
    1167             :       { fadst4, fidtx4 },  // V_FLIPADST
    1168             :       { fidtx4, fadst4 },  // H_FLIPADST
    1169             : #endif                     // CONFIG_EXT_TX
    1170             :     };
    1171           0 :     const transform_2d ht = FHT[tx_type];
    1172             :     tran_low_t out[4 * 4];
    1173             :     int i, j;
    1174             :     tran_low_t temp_in[4], temp_out[4];
    1175             : 
    1176             : #if CONFIG_EXT_TX
    1177             :     int16_t flipped_input[4 * 4];
    1178           0 :     maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
    1179             : #endif
    1180             : 
    1181             :     // Columns
    1182           0 :     for (i = 0; i < 4; ++i) {
    1183           0 :       for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
    1184           0 :       if (i == 0 && temp_in[0]) temp_in[0] += 1;
    1185           0 :       ht.cols(temp_in, temp_out);
    1186           0 :       for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
    1187             :     }
    1188             : 
    1189             :     // Rows
    1190           0 :     for (i = 0; i < 4; ++i) {
    1191           0 :       for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
    1192           0 :       ht.rows(temp_in, temp_out);
    1193           0 :       for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
    1194             :     }
    1195             :   }
    1196           0 : }
    1197             : 
    1198           0 : void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
    1199             :                   int tx_type) {
    1200             :   static const transform_2d FHT[] = {
    1201             :     { fdct8, fdct4 },    // DCT_DCT
    1202             :     { fadst8, fdct4 },   // ADST_DCT
    1203             :     { fdct8, fadst4 },   // DCT_ADST
    1204             :     { fadst8, fadst4 },  // ADST_ADST
    1205             : #if CONFIG_EXT_TX
    1206             :     { fadst8, fdct4 },   // FLIPADST_DCT
    1207             :     { fdct8, fadst4 },   // DCT_FLIPADST
    1208             :     { fadst8, fadst4 },  // FLIPADST_FLIPADST
    1209             :     { fadst8, fadst4 },  // ADST_FLIPADST
    1210             :     { fadst8, fadst4 },  // FLIPADST_ADST
    1211             :     { fidtx8, fidtx4 },  // IDTX
    1212             :     { fdct8, fidtx4 },   // V_DCT
    1213             :     { fidtx8, fdct4 },   // H_DCT
    1214             :     { fadst8, fidtx4 },  // V_ADST
    1215             :     { fidtx8, fadst4 },  // H_ADST
    1216             :     { fadst8, fidtx4 },  // V_FLIPADST
    1217             :     { fidtx8, fadst4 },  // H_FLIPADST
    1218             : #endif
    1219             :   };
    1220           0 :   const transform_2d ht = FHT[tx_type];
    1221           0 :   const int n = 4;
    1222           0 :   const int n2 = 8;
    1223             :   tran_low_t out[8 * 4];
    1224             :   tran_low_t temp_in[8], temp_out[8];
    1225             :   int i, j;
    1226             : #if CONFIG_EXT_TX
    1227             :   int16_t flipped_input[8 * 4];
    1228           0 :   maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
    1229             : #endif
    1230             : 
    1231             :   // Rows
    1232           0 :   for (i = 0; i < n2; ++i) {
    1233           0 :     for (j = 0; j < n; ++j)
    1234           0 :       temp_in[j] =
    1235           0 :           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
    1236           0 :     ht.rows(temp_in, temp_out);
    1237           0 :     for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
    1238             :   }
    1239             : 
    1240             :   // Columns
    1241           0 :   for (i = 0; i < n; ++i) {
    1242           0 :     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    1243           0 :     ht.cols(temp_in, temp_out);
    1244           0 :     for (j = 0; j < n2; ++j)
    1245           0 :       output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    1246             :   }
    1247             :   // Note: overall scale factor of transform is 8 times unitary
    1248           0 : }
    1249             : 
    1250           0 : void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
    1251             :                   int tx_type) {
    1252             :   static const transform_2d FHT[] = {
    1253             :     { fdct4, fdct8 },    // DCT_DCT
    1254             :     { fadst4, fdct8 },   // ADST_DCT
    1255             :     { fdct4, fadst8 },   // DCT_ADST
    1256             :     { fadst4, fadst8 },  // ADST_ADST
    1257             : #if CONFIG_EXT_TX
    1258             :     { fadst4, fdct8 },   // FLIPADST_DCT
    1259             :     { fdct4, fadst8 },   // DCT_FLIPADST
    1260             :     { fadst4, fadst8 },  // FLIPADST_FLIPADST
    1261             :     { fadst4, fadst8 },  // ADST_FLIPADST
    1262             :     { fadst4, fadst8 },  // FLIPADST_ADST
    1263             :     { fidtx4, fidtx8 },  // IDTX
    1264             :     { fdct4, fidtx8 },   // V_DCT
    1265             :     { fidtx4, fdct8 },   // H_DCT
    1266             :     { fadst4, fidtx8 },  // V_ADST
    1267             :     { fidtx4, fadst8 },  // H_ADST
    1268             :     { fadst4, fidtx8 },  // V_FLIPADST
    1269             :     { fidtx4, fadst8 },  // H_FLIPADST
    1270             : #endif
    1271             :   };
    1272           0 :   const transform_2d ht = FHT[tx_type];
    1273           0 :   const int n = 4;
    1274           0 :   const int n2 = 8;
    1275             :   tran_low_t out[8 * 4];
    1276             :   tran_low_t temp_in[8], temp_out[8];
    1277             :   int i, j;
    1278             : #if CONFIG_EXT_TX
    1279             :   int16_t flipped_input[8 * 4];
    1280           0 :   maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
    1281             : #endif
    1282             : 
    1283             :   // Columns
    1284           0 :   for (i = 0; i < n2; ++i) {
    1285           0 :     for (j = 0; j < n; ++j)
    1286           0 :       temp_in[j] =
    1287           0 :           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
    1288           0 :     ht.cols(temp_in, temp_out);
    1289           0 :     for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
    1290             :   }
    1291             : 
    1292             :   // Rows
    1293           0 :   for (i = 0; i < n; ++i) {
    1294           0 :     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    1295           0 :     ht.rows(temp_in, temp_out);
    1296           0 :     for (j = 0; j < n2; ++j)
    1297           0 :       output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    1298             :   }
    1299             :   // Note: overall scale factor of transform is 8 times unitary
    1300           0 : }
    1301             : 
    1302           0 : void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
    1303             :                    int tx_type) {
    1304             :   static const transform_2d FHT[] = {
    1305             :     { fdct16, fdct4 },    // DCT_DCT
    1306             :     { fadst16, fdct4 },   // ADST_DCT
    1307             :     { fdct16, fadst4 },   // DCT_ADST
    1308             :     { fadst16, fadst4 },  // ADST_ADST
    1309             : #if CONFIG_EXT_TX
    1310             :     { fadst16, fdct4 },   // FLIPADST_DCT
    1311             :     { fdct16, fadst4 },   // DCT_FLIPADST
    1312             :     { fadst16, fadst4 },  // FLIPADST_FLIPADST
    1313             :     { fadst16, fadst4 },  // ADST_FLIPADST
    1314             :     { fadst16, fadst4 },  // FLIPADST_ADST
    1315             :     { fidtx16, fidtx4 },  // IDTX
    1316             :     { fdct16, fidtx4 },   // V_DCT
    1317             :     { fidtx16, fdct4 },   // H_DCT
    1318             :     { fadst16, fidtx4 },  // V_ADST
    1319             :     { fidtx16, fadst4 },  // H_ADST
    1320             :     { fadst16, fidtx4 },  // V_FLIPADST
    1321             :     { fidtx16, fadst4 },  // H_FLIPADST
    1322             : #endif
    1323             :   };
    1324           0 :   const transform_2d ht = FHT[tx_type];
    1325           0 :   const int n = 4;
    1326           0 :   const int n4 = 16;
    1327             :   tran_low_t out[16 * 4];
    1328             :   tran_low_t temp_in[16], temp_out[16];
    1329             :   int i, j;
    1330             : #if CONFIG_EXT_TX
    1331             :   int16_t flipped_input[16 * 4];
    1332           0 :   maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
    1333             : #endif
    1334             : 
    1335             :   // Rows
    1336           0 :   for (i = 0; i < n4; ++i) {
    1337           0 :     for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
    1338           0 :     ht.rows(temp_in, temp_out);
    1339           0 :     for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
    1340             :   }
    1341             : 
    1342             :   // Columns
    1343           0 :   for (i = 0; i < n; ++i) {
    1344           0 :     for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
    1345           0 :     ht.cols(temp_in, temp_out);
    1346           0 :     for (j = 0; j < n4; ++j)
    1347           0 :       output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    1348             :   }
    1349             :   // Note: overall scale factor of transform is 8 times unitary
    1350           0 : }
    1351             : 
    1352           0 : void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
    1353             :                    int tx_type) {
    1354             :   static const transform_2d FHT[] = {
    1355             :     { fdct4, fdct16 },    // DCT_DCT
    1356             :     { fadst4, fdct16 },   // ADST_DCT
    1357             :     { fdct4, fadst16 },   // DCT_ADST
    1358             :     { fadst4, fadst16 },  // ADST_ADST
    1359             : #if CONFIG_EXT_TX
    1360             :     { fadst4, fdct16 },   // FLIPADST_DCT
    1361             :     { fdct4, fadst16 },   // DCT_FLIPADST
    1362             :     { fadst4, fadst16 },  // FLIPADST_FLIPADST
    1363             :     { fadst4, fadst16 },  // ADST_FLIPADST
    1364             :     { fadst4, fadst16 },  // FLIPADST_ADST
    1365             :     { fidtx4, fidtx16 },  // IDTX
    1366             :     { fdct4, fidtx16 },   // V_DCT
    1367             :     { fidtx4, fdct16 },   // H_DCT
    1368             :     { fadst4, fidtx16 },  // V_ADST
    1369             :     { fidtx4, fadst16 },  // H_ADST
    1370             :     { fadst4, fidtx16 },  // V_FLIPADST
    1371             :     { fidtx4, fadst16 },  // H_FLIPADST
    1372             : #endif
    1373             :   };
    1374           0 :   const transform_2d ht = FHT[tx_type];
    1375           0 :   const int n = 4;
    1376           0 :   const int n4 = 16;
    1377             :   tran_low_t out[16 * 4];
    1378             :   tran_low_t temp_in[16], temp_out[16];
    1379             :   int i, j;
    1380             : #if CONFIG_EXT_TX
    1381             :   int16_t flipped_input[16 * 4];
    1382           0 :   maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
    1383             : #endif
    1384             : 
    1385             :   // Columns
    1386           0 :   for (i = 0; i < n4; ++i) {
    1387           0 :     for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
    1388           0 :     ht.cols(temp_in, temp_out);
    1389           0 :     for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
    1390             :   }
    1391             : 
    1392             :   // Rows
    1393           0 :   for (i = 0; i < n; ++i) {
    1394           0 :     for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
    1395           0 :     ht.rows(temp_in, temp_out);
    1396           0 :     for (j = 0; j < n4; ++j)
    1397           0 :       output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    1398             :   }
    1399             :   // Note: overall scale factor of transform is 8 times unitary
    1400           0 : }
    1401             : 
    1402           0 : void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
    1403             :                    int tx_type) {
    1404             :   static const transform_2d FHT[] = {
    1405             :     { fdct16, fdct8 },    // DCT_DCT
    1406             :     { fadst16, fdct8 },   // ADST_DCT
    1407             :     { fdct16, fadst8 },   // DCT_ADST
    1408             :     { fadst16, fadst8 },  // ADST_ADST
    1409             : #if CONFIG_EXT_TX
    1410             :     { fadst16, fdct8 },   // FLIPADST_DCT
    1411             :     { fdct16, fadst8 },   // DCT_FLIPADST
    1412             :     { fadst16, fadst8 },  // FLIPADST_FLIPADST
    1413             :     { fadst16, fadst8 },  // ADST_FLIPADST
    1414             :     { fadst16, fadst8 },  // FLIPADST_ADST
    1415             :     { fidtx16, fidtx8 },  // IDTX
    1416             :     { fdct16, fidtx8 },   // V_DCT
    1417             :     { fidtx16, fdct8 },   // H_DCT
    1418             :     { fadst16, fidtx8 },  // V_ADST
    1419             :     { fidtx16, fadst8 },  // H_ADST
    1420             :     { fadst16, fidtx8 },  // V_FLIPADST
    1421             :     { fidtx16, fadst8 },  // H_FLIPADST
    1422             : #endif
    1423             :   };
    1424           0 :   const transform_2d ht = FHT[tx_type];
    1425           0 :   const int n = 8;
    1426           0 :   const int n2 = 16;
    1427             :   tran_low_t out[16 * 8];
    1428             :   tran_low_t temp_in[16], temp_out[16];
    1429             :   int i, j;
    1430             : #if CONFIG_EXT_TX
    1431             :   int16_t flipped_input[16 * 8];
    1432           0 :   maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
    1433             : #endif
    1434             : 
    1435             :   // Rows
    1436           0 :   for (i = 0; i < n2; ++i) {
    1437           0 :     for (j = 0; j < n; ++j)
    1438           0 :       temp_in[j] =
    1439           0 :           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
    1440           0 :     ht.rows(temp_in, temp_out);
    1441           0 :     for (j = 0; j < n; ++j)
    1442           0 :       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
    1443             :   }
    1444             : 
    1445             :   // Columns
    1446           0 :   for (i = 0; i < n; ++i) {
    1447           0 :     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    1448           0 :     ht.cols(temp_in, temp_out);
    1449           0 :     for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
    1450             :   }
    1451             :   // Note: overall scale factor of transform is 8 times unitary
    1452           0 : }
    1453             : 
    1454           0 : void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
    1455             :                    int tx_type) {
    1456             :   static const transform_2d FHT[] = {
    1457             :     { fdct8, fdct16 },    // DCT_DCT
    1458             :     { fadst8, fdct16 },   // ADST_DCT
    1459             :     { fdct8, fadst16 },   // DCT_ADST
    1460             :     { fadst8, fadst16 },  // ADST_ADST
    1461             : #if CONFIG_EXT_TX
    1462             :     { fadst8, fdct16 },   // FLIPADST_DCT
    1463             :     { fdct8, fadst16 },   // DCT_FLIPADST
    1464             :     { fadst8, fadst16 },  // FLIPADST_FLIPADST
    1465             :     { fadst8, fadst16 },  // ADST_FLIPADST
    1466             :     { fadst8, fadst16 },  // FLIPADST_ADST
    1467             :     { fidtx8, fidtx16 },  // IDTX
    1468             :     { fdct8, fidtx16 },   // V_DCT
    1469             :     { fidtx8, fdct16 },   // H_DCT
    1470             :     { fadst8, fidtx16 },  // V_ADST
    1471             :     { fidtx8, fadst16 },  // H_ADST
    1472             :     { fadst8, fidtx16 },  // V_FLIPADST
    1473             :     { fidtx8, fadst16 },  // H_FLIPADST
    1474             : #endif
    1475             :   };
    1476           0 :   const transform_2d ht = FHT[tx_type];
    1477           0 :   const int n = 8;
    1478           0 :   const int n2 = 16;
    1479             :   tran_low_t out[16 * 8];
    1480             :   tran_low_t temp_in[16], temp_out[16];
    1481             :   int i, j;
    1482             : #if CONFIG_EXT_TX
    1483             :   int16_t flipped_input[16 * 8];
    1484           0 :   maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
    1485             : #endif
    1486             : 
    1487             :   // Columns
    1488           0 :   for (i = 0; i < n2; ++i) {
    1489           0 :     for (j = 0; j < n; ++j)
    1490           0 :       temp_in[j] =
    1491           0 :           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
    1492           0 :     ht.cols(temp_in, temp_out);
    1493           0 :     for (j = 0; j < n; ++j)
    1494           0 :       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
    1495             :   }
    1496             : 
    1497             :   // Rows
    1498           0 :   for (i = 0; i < n; ++i) {
    1499           0 :     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    1500           0 :     ht.rows(temp_in, temp_out);
    1501           0 :     for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
    1502             :   }
    1503             :   // Note: overall scale factor of transform is 8 times unitary
    1504           0 : }
    1505             : 
    1506           0 : void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
    1507             :                    int tx_type) {
    1508             :   static const transform_2d FHT[] = {
    1509             :     { fdct32, fdct8 },         // DCT_DCT
    1510             :     { fhalfright32, fdct8 },   // ADST_DCT
    1511             :     { fdct32, fadst8 },        // DCT_ADST
    1512             :     { fhalfright32, fadst8 },  // ADST_ADST
    1513             : #if CONFIG_EXT_TX
    1514             :     { fhalfright32, fdct8 },   // FLIPADST_DCT
    1515             :     { fdct32, fadst8 },        // DCT_FLIPADST
    1516             :     { fhalfright32, fadst8 },  // FLIPADST_FLIPADST
    1517             :     { fhalfright32, fadst8 },  // ADST_FLIPADST
    1518             :     { fhalfright32, fadst8 },  // FLIPADST_ADST
    1519             :     { fidtx32, fidtx8 },       // IDTX
    1520             :     { fdct32, fidtx8 },        // V_DCT
    1521             :     { fidtx32, fdct8 },        // H_DCT
    1522             :     { fhalfright32, fidtx8 },  // V_ADST
    1523             :     { fidtx32, fadst8 },       // H_ADST
    1524             :     { fhalfright32, fidtx8 },  // V_FLIPADST
    1525             :     { fidtx32, fadst8 },       // H_FLIPADST
    1526             : #endif
    1527             :   };
    1528           0 :   const transform_2d ht = FHT[tx_type];
    1529           0 :   const int n = 8;
    1530           0 :   const int n4 = 32;
    1531             :   tran_low_t out[32 * 8];
    1532             :   tran_low_t temp_in[32], temp_out[32];
    1533             :   int i, j;
    1534             : #if CONFIG_EXT_TX
    1535             :   int16_t flipped_input[32 * 8];
    1536           0 :   maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
    1537             : #endif
    1538             : 
    1539             :   // Rows
    1540           0 :   for (i = 0; i < n4; ++i) {
    1541           0 :     for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
    1542           0 :     ht.rows(temp_in, temp_out);
    1543           0 :     for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
    1544             :   }
    1545             : 
    1546             :   // Columns
    1547           0 :   for (i = 0; i < n; ++i) {
    1548           0 :     for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
    1549           0 :     ht.cols(temp_in, temp_out);
    1550           0 :     for (j = 0; j < n4; ++j)
    1551           0 :       output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
    1552             :   }
    1553             :   // Note: overall scale factor of transform is 4 times unitary
    1554           0 : }
    1555             : 
    1556           0 : void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
    1557             :                    int tx_type) {
    1558             :   static const transform_2d FHT[] = {
    1559             :     { fdct8, fdct32 },         // DCT_DCT
    1560             :     { fadst8, fdct32 },        // ADST_DCT
    1561             :     { fdct8, fhalfright32 },   // DCT_ADST
    1562             :     { fadst8, fhalfright32 },  // ADST_ADST
    1563             : #if CONFIG_EXT_TX
    1564             :     { fadst8, fdct32 },        // FLIPADST_DCT
    1565             :     { fdct8, fhalfright32 },   // DCT_FLIPADST
    1566             :     { fadst8, fhalfright32 },  // FLIPADST_FLIPADST
    1567             :     { fadst8, fhalfright32 },  // ADST_FLIPADST
    1568             :     { fadst8, fhalfright32 },  // FLIPADST_ADST
    1569             :     { fidtx8, fidtx32 },       // IDTX
    1570             :     { fdct8, fidtx32 },        // V_DCT
    1571             :     { fidtx8, fdct32 },        // H_DCT
    1572             :     { fadst8, fidtx32 },       // V_ADST
    1573             :     { fidtx8, fhalfright32 },  // H_ADST
    1574             :     { fadst8, fidtx32 },       // V_FLIPADST
    1575             :     { fidtx8, fhalfright32 },  // H_FLIPADST
    1576             : #endif
    1577             :   };
    1578           0 :   const transform_2d ht = FHT[tx_type];
    1579           0 :   const int n = 8;
    1580           0 :   const int n4 = 32;
    1581             :   tran_low_t out[32 * 8];
    1582             :   tran_low_t temp_in[32], temp_out[32];
    1583             :   int i, j;
    1584             : #if CONFIG_EXT_TX
    1585             :   int16_t flipped_input[32 * 8];
    1586           0 :   maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
    1587             : #endif
    1588             : 
    1589             :   // Columns
    1590           0 :   for (i = 0; i < n4; ++i) {
    1591           0 :     for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
    1592           0 :     ht.cols(temp_in, temp_out);
    1593           0 :     for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
    1594             :   }
    1595             : 
    1596             :   // Rows
    1597           0 :   for (i = 0; i < n; ++i) {
    1598           0 :     for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
    1599           0 :     ht.rows(temp_in, temp_out);
    1600           0 :     for (j = 0; j < n4; ++j)
    1601           0 :       output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
    1602             :   }
    1603             :   // Note: overall scale factor of transform is 4 times unitary
    1604           0 : }
    1605             : 
    1606           0 : void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
    1607             :                     int tx_type) {
    1608             :   static const transform_2d FHT[] = {
    1609             :     { fdct32, fdct16 },         // DCT_DCT
    1610             :     { fhalfright32, fdct16 },   // ADST_DCT
    1611             :     { fdct32, fadst16 },        // DCT_ADST
    1612             :     { fhalfright32, fadst16 },  // ADST_ADST
    1613             : #if CONFIG_EXT_TX
    1614             :     { fhalfright32, fdct16 },   // FLIPADST_DCT
    1615             :     { fdct32, fadst16 },        // DCT_FLIPADST
    1616             :     { fhalfright32, fadst16 },  // FLIPADST_FLIPADST
    1617             :     { fhalfright32, fadst16 },  // ADST_FLIPADST
    1618             :     { fhalfright32, fadst16 },  // FLIPADST_ADST
    1619             :     { fidtx32, fidtx16 },       // IDTX
    1620             :     { fdct32, fidtx16 },        // V_DCT
    1621             :     { fidtx32, fdct16 },        // H_DCT
    1622             :     { fhalfright32, fidtx16 },  // V_ADST
    1623             :     { fidtx32, fadst16 },       // H_ADST
    1624             :     { fhalfright32, fidtx16 },  // V_FLIPADST
    1625             :     { fidtx32, fadst16 },       // H_FLIPADST
    1626             : #endif
    1627             :   };
    1628           0 :   const transform_2d ht = FHT[tx_type];
    1629           0 :   const int n = 16;
    1630           0 :   const int n2 = 32;
    1631             :   tran_low_t out[32 * 16];
    1632             :   tran_low_t temp_in[32], temp_out[32];
    1633             :   int i, j;
    1634             : #if CONFIG_EXT_TX
    1635             :   int16_t flipped_input[32 * 16];
    1636           0 :   maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
    1637             : #endif
    1638             : 
    1639             :   // Rows
    1640           0 :   for (i = 0; i < n2; ++i) {
    1641           0 :     for (j = 0; j < n; ++j)
    1642           0 :       temp_in[j] =
    1643           0 :           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
    1644           0 :     ht.rows(temp_in, temp_out);
    1645           0 :     for (j = 0; j < n; ++j)
    1646           0 :       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
    1647             :   }
    1648             : 
    1649             :   // Columns
    1650           0 :   for (i = 0; i < n; ++i) {
    1651           0 :     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    1652           0 :     ht.cols(temp_in, temp_out);
    1653           0 :     for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
    1654             :   }
    1655             :   // Note: overall scale factor of transform is 4 times unitary
    1656           0 : }
    1657             : 
    1658           0 : void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
    1659             :                     int tx_type) {
    1660             :   static const transform_2d FHT[] = {
    1661             :     { fdct16, fdct32 },         // DCT_DCT
    1662             :     { fadst16, fdct32 },        // ADST_DCT
    1663             :     { fdct16, fhalfright32 },   // DCT_ADST
    1664             :     { fadst16, fhalfright32 },  // ADST_ADST
    1665             : #if CONFIG_EXT_TX
    1666             :     { fadst16, fdct32 },        // FLIPADST_DCT
    1667             :     { fdct16, fhalfright32 },   // DCT_FLIPADST
    1668             :     { fadst16, fhalfright32 },  // FLIPADST_FLIPADST
    1669             :     { fadst16, fhalfright32 },  // ADST_FLIPADST
    1670             :     { fadst16, fhalfright32 },  // FLIPADST_ADST
    1671             :     { fidtx16, fidtx32 },       // IDTX
    1672             :     { fdct16, fidtx32 },        // V_DCT
    1673             :     { fidtx16, fdct32 },        // H_DCT
    1674             :     { fadst16, fidtx32 },       // V_ADST
    1675             :     { fidtx16, fhalfright32 },  // H_ADST
    1676             :     { fadst16, fidtx32 },       // V_FLIPADST
    1677             :     { fidtx16, fhalfright32 },  // H_FLIPADST
    1678             : #endif
    1679             :   };
    1680           0 :   const transform_2d ht = FHT[tx_type];
    1681           0 :   const int n = 16;
    1682           0 :   const int n2 = 32;
    1683             :   tran_low_t out[32 * 16];
    1684             :   tran_low_t temp_in[32], temp_out[32];
    1685             :   int i, j;
    1686             : #if CONFIG_EXT_TX
    1687             :   int16_t flipped_input[32 * 16];
    1688           0 :   maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
    1689             : #endif
    1690             : 
    1691             :   // Columns
    1692           0 :   for (i = 0; i < n2; ++i) {
    1693           0 :     for (j = 0; j < n; ++j)
    1694           0 :       temp_in[j] =
    1695           0 :           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
    1696           0 :     ht.cols(temp_in, temp_out);
    1697           0 :     for (j = 0; j < n; ++j)
    1698           0 :       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
    1699             :   }
    1700             : 
    1701             :   // Rows
    1702           0 :   for (i = 0; i < n; ++i) {
    1703           0 :     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
    1704           0 :     ht.rows(temp_in, temp_out);
    1705           0 :     for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
    1706             :   }
    1707             :   // Note: overall scale factor of transform is 4 times unitary
    1708           0 : }
    1709             : 
    1710           0 : void av1_fdct8x8_quant_c(const int16_t *input, int stride,  // 8x8 forward DCT of `input` (rows `stride` apart), followed by quantization of the result
    1711             :                          tran_low_t *coeff_ptr, intptr_t n_coeffs,  // coeff_ptr receives the 64 transform coefficients; n_coeffs bounds the quantization loop
    1712             :                          int skip_block, const int16_t *zbin_ptr,  // skip_block != 0: qcoeff/dqcoeff stay zeroed; zbin_ptr is unused
    1713             :                          const int16_t *round_ptr, const int16_t *quant_ptr,  // both are read at index [rc != 0], i.e. entries 0 and 1 only
    1714             :                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,  // quant_shift_ptr is unused
    1715             :                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,  // dequant_ptr is also read at index [rc != 0]
    1716             :                          uint16_t *eob_ptr, const int16_t *scan,  // *eob_ptr is written on every call; scan maps loop index -> coefficient index
    1717             :                          const int16_t *iscan  // unused
    1718             : #if CONFIG_AOM_QM
    1719             :                          ,
    1720             :                          const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr  // per-coefficient weights, indexed by rc
    1721             : #endif
    1722             :                          ) {
    1723           0 :   int eob = -1;  // last scan index with a nonzero quantized value; stays -1 if there is none
    1724             : 
    1725             :   int i, j;
    1726             :   tran_low_t intermediate[64];  // column-pass result, consumed by the row pass
    1727             : 
    1728             :   // Transform columns: one input column per iteration, written to one column of `intermediate`.
    1729             :   {
    1730           0 :     tran_low_t *output = intermediate;
    1731             :     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    1732             :     tran_high_t t0, t1, t2, t3;                  // needs32
    1733             :     tran_high_t x0, x1, x2, x3;                  // canbe16
    1734             : 
    1735           0 :     for (i = 0; i < 8; i++) {
    1736             :       // stage 1: sums (s0..s3) and differences (s4..s7) of mirrored rows, each scaled by 4
    1737           0 :       s0 = (input[0 * stride] + input[7 * stride]) * 4;
    1738           0 :       s1 = (input[1 * stride] + input[6 * stride]) * 4;
    1739           0 :       s2 = (input[2 * stride] + input[5 * stride]) * 4;
    1740           0 :       s3 = (input[3 * stride] + input[4 * stride]) * 4;
    1741           0 :       s4 = (input[3 * stride] - input[4 * stride]) * 4;
    1742           0 :       s5 = (input[2 * stride] - input[5 * stride]) * 4;
    1743           0 :       s6 = (input[1 * stride] - input[6 * stride]) * 4;
    1744           0 :       s7 = (input[0 * stride] - input[7 * stride]) * 4;
    1745             : 
    1746             :       // fdct4(step, step); -- inlined here: the sums produce rows 0, 2, 4 and 6 of the intermediate
    1747           0 :       x0 = s0 + s3;
    1748           0 :       x1 = s1 + s2;
    1749           0 :       x2 = s1 - s2;
    1750           0 :       x3 = s0 - s3;
    1751           0 :       t0 = (x0 + x1) * cospi_16_64;
    1752           0 :       t1 = (x0 - x1) * cospi_16_64;
    1753           0 :       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
    1754           0 :       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
    1755           0 :       output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
    1756           0 :       output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
    1757           0 :       output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
    1758           0 :       output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
    1759             : 
    1760             :       // stage 2: the differences s4..s7 produce rows 1, 3, 5 and 7 (stages 2-4)
    1761           0 :       t0 = (s6 - s5) * cospi_16_64;
    1762           0 :       t1 = (s6 + s5) * cospi_16_64;
    1763           0 :       t2 = fdct_round_shift(t0);
    1764           0 :       t3 = fdct_round_shift(t1);
    1765             : 
    1766             :       // stage 3
    1767           0 :       x0 = s4 + t2;
    1768           0 :       x1 = s4 - t2;
    1769           0 :       x2 = s7 - t3;
    1770           0 :       x3 = s7 + t3;
    1771             : 
    1772             :       // stage 4
    1773           0 :       t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    1774           0 :       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    1775           0 :       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    1776           0 :       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    1777           0 :       output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
    1778           0 :       output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
    1779           0 :       output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
    1780           0 :       output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
    1781           0 :       input++;  // next input column
    1782           0 :       output++;  // next intermediate column
    1783             :     }
    1784             :   }
    1785             : 
    1786             :   // Rows: fdct8 on each row of the intermediate, then every coefficient of that row is divided by 2.
    1787           0 :   for (i = 0; i < 8; ++i) {
    1788           0 :     fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
    1789           0 :     for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;  // C integer division: truncates toward zero
    1790             :   }
    1791             : 
    1792             :   // TODO(jingning) Decide the need of these arguments after the
    1793             :   // quantization process is completed.
    1794             :   (void)zbin_ptr;
    1795             :   (void)quant_shift_ptr;
    1796             :   (void)iscan;
    1797             : 
    1798           0 :   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // cleared up front so both outputs are all-zero when skip_block is set
    1799           0 :   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    1800             : 
    1801           0 :   if (!skip_block) {
    1802             :     // Quantization pass: visits the first n_coeffs positions in scan order.
    1803             :     // (This comment used to mention a `zero_flag`; no such variable exists in this function.)
    1804           0 :     for (i = 0; i < n_coeffs; i++) {
    1805           0 :       const int rc = scan[i];  // index into the coefficient arrays for scan position i
    1806           0 :       const int coeff = coeff_ptr[rc];
    1807             : #if CONFIG_AOM_QM
    1808             :       const qm_val_t wt = qm_ptr[rc];
    1809             :       const qm_val_t iwt = iqm_ptr[rc];
    1810             :       const int dequant =  // dequant value weighted by iwt, rounded and shifted down by AOM_QM_BITS
    1811             :           (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
    1812             :           AOM_QM_BITS;
    1813             : #endif
    1814           0 :       const int coeff_sign = (coeff >> 31);  // 0 for non-negative, -1 for negative; assumes 32-bit int and arithmetic right shift
    1815           0 :       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff| via xor/subtract with the sign mask
    1816             : 
    1817           0 :       int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);  // add the rounding offset, then clamp to the int16 range
    1818             :       int tmp32;
    1819             : #if CONFIG_AOM_QM
    1820             :       tmp32 = (int)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
    1821             :       qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
    1822             :       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
    1823             : #else
    1824           0 :       tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> 16);  // multiply by the quantizer entry and drop the low 16 bits
    1825           0 :       qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign of the original coefficient
    1826           0 :       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
    1827             : #endif
    1828             : 
    1829           0 :       if (tmp32) eob = i;  // track the last scan index whose quantized magnitude is nonzero
    1830             :     }
    1831             :   }
    1832           0 :   *eob_ptr = eob + 1;  // one past the last nonzero scan index; 0 when nothing was nonzero or the block was skipped
    1833           0 : }
    1834             : 
    1835           0 : void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,  // 8x8 forward hybrid transform: column pass then row pass with the 1-D pair chosen by tx_type
    1836             :                   int tx_type) {  // tx_type indexes FHT below without a range check
    1837           0 :   if (tx_type == DCT_DCT) {
    1838           0 :     aom_fdct8x8_c(input, output, stride);  // the pure-DCT case is delegated to aom_fdct8x8_c
    1839             :   } else {
    1840             :     static const transform_2d FHT[] = {  // one pair of 1-D transforms per tx_type; used below through ht.cols / ht.rows
    1841             :       { fdct8, fdct8 },    // DCT_DCT
    1842             :       { fadst8, fdct8 },   // ADST_DCT
    1843             :       { fdct8, fadst8 },   // DCT_ADST
    1844             :       { fadst8, fadst8 },  // ADST_ADST
    1845             : #if CONFIG_EXT_TX
    1846             :       { fadst8, fdct8 },   // FLIPADST_DCT
    1847             :       { fdct8, fadst8 },   // DCT_FLIPADST
    1848             :       { fadst8, fadst8 },  // FLIPADST_FLIPADST
    1849             :       { fadst8, fadst8 },  // ADST_FLIPADST
    1850             :       { fadst8, fadst8 },  // FLIPADST_ADST
    1851             :       { fidtx8, fidtx8 },  // IDTX
    1852             :       { fdct8, fidtx8 },   // V_DCT
    1853             :       { fidtx8, fdct8 },   // H_DCT
    1854             :       { fadst8, fidtx8 },  // V_ADST
    1855             :       { fidtx8, fadst8 },  // H_ADST
    1856             :       { fadst8, fidtx8 },  // V_FLIPADST
    1857             :       { fidtx8, fadst8 },  // H_FLIPADST
    1858             : #endif                     // CONFIG_EXT_TX
    1859             :     };
    1860           0 :     const transform_2d ht = FHT[tx_type];
    1861             :     tran_low_t out[64];  // column-pass result
    1862             :     int i, j;
    1863             :     tran_low_t temp_in[8], temp_out[8];  // scratch for one 8-sample line
    1864             : 
    1865             : #if CONFIG_EXT_TX
    1866             :     int16_t flipped_input[8 * 8];
    1867           0 :     maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);  // presumably redirects input/stride to flipped_input for the FLIPADST types -- confirm against the helper
    1868             : #endif
    1869             : 
    1870             :     // Columns: column i of the input, scaled by 4, is transformed into column i of `out`.
    1871           0 :     for (i = 0; i < 8; ++i) {
    1872           0 :       for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
    1873           0 :       ht.cols(temp_in, temp_out);
    1874           0 :       for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
    1875             :     }
    1876             : 
    1877             :     // Rows: row i of `out` is transformed and halved into row i of `output`.
    1878           0 :     for (i = 0; i < 8; ++i) {
    1879           0 :       for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
    1880           0 :       ht.rows(temp_in, temp_out);
    1881           0 :       for (j = 0; j < 8; ++j)
    1882           0 :         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;  // adds 1 to negative values before the shift, so the halving truncates toward zero (given an arithmetic right shift)
    1883             :     }
    1884             :   }
    1885           0 : }
    1886             : 
    1887             : /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    1888             :    pixel. Pass 0 reads each input column (samples `stride` apart) and writes it to the matching column of output; pass 1 then transforms each row of output in place and multiplies the results by UNIT_QUANT_FACTOR. */
    1889           0 : void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
    1890             :   int i;
    1891             :   tran_high_t a1, b1, c1, d1, e1;
    1892           0 :   const int16_t *ip_pass0 = input;  // read pointer for pass 0, advanced one column per iteration
    1893           0 :   const tran_low_t *ip = NULL;  // read pointer for pass 1, pointed at output once pass 0 is done
    1894           0 :   tran_low_t *op = output;
    1895             : 
    1896           0 :   for (i = 0; i < 4; i++) {  // pass 0: one column per iteration
    1897           0 :     a1 = ip_pass0[0 * stride];
    1898           0 :     b1 = ip_pass0[1 * stride];
    1899           0 :     c1 = ip_pass0[2 * stride];
    1900           0 :     d1 = ip_pass0[3 * stride];
    1901             : 
    1902           0 :     a1 += b1;
    1903           0 :     d1 = d1 - c1;
    1904           0 :     e1 = (a1 - d1) >> 1;
    1905           0 :     b1 = e1 - b1;
    1906           0 :     c1 = e1 - c1;
    1907           0 :     a1 -= c1;
    1908           0 :     d1 += b1;
    1909           0 :     op[0] = (tran_low_t)a1;  // results are stored down the column in the order a, c, d, b
    1910           0 :     op[4] = (tran_low_t)c1;
    1911           0 :     op[8] = (tran_low_t)d1;
    1912           0 :     op[12] = (tran_low_t)b1;
    1913             : 
    1914           0 :     ip_pass0++;
    1915           0 :     op++;
    1916             :   }
    1917           0 :   ip = output;  // pass 1 reads and writes the same buffer
    1918           0 :   op = output;
    1919             : 
    1920           0 :   for (i = 0; i < 4; i++) {  // pass 1: one row per iteration; all four values are loaded before any is overwritten
    1921           0 :     a1 = ip[0];
    1922           0 :     b1 = ip[1];
    1923           0 :     c1 = ip[2];
    1924           0 :     d1 = ip[3];
    1925             : 
    1926           0 :     a1 += b1;
    1927           0 :     d1 -= c1;
    1928           0 :     e1 = (a1 - d1) >> 1;
    1929           0 :     b1 = e1 - b1;
    1930           0 :     c1 = e1 - c1;
    1931           0 :     a1 -= c1;
    1932           0 :     d1 += b1;
    1933           0 :     op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);  // same a, c, d, b output order as pass 0
    1934           0 :     op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
    1935           0 :     op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
    1936           0 :     op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
    1937             : 
    1938           0 :     ip += 4;  // next row
    1939           0 :     op += 4;
    1940             :   }
    1941           0 : }
    1942             : 
    1943           0 : void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,  // 16x16 forward hybrid transform: column pass then row pass with the 1-D pair chosen by tx_type
    1944             :                     int tx_type) {  // tx_type indexes FHT below without a range check
    1945             :   static const transform_2d FHT[] = {  // one pair of 1-D transforms per tx_type; used below through ht.cols / ht.rows
    1946             :     { fdct16, fdct16 },    // DCT_DCT
    1947             :     { fadst16, fdct16 },   // ADST_DCT
    1948             :     { fdct16, fadst16 },   // DCT_ADST
    1949             :     { fadst16, fadst16 },  // ADST_ADST
    1950             : #if CONFIG_EXT_TX
    1951             :     { fadst16, fdct16 },   // FLIPADST_DCT
    1952             :     { fdct16, fadst16 },   // DCT_FLIPADST
    1953             :     { fadst16, fadst16 },  // FLIPADST_FLIPADST
    1954             :     { fadst16, fadst16 },  // ADST_FLIPADST
    1955             :     { fadst16, fadst16 },  // FLIPADST_ADST
    1956             :     { fidtx16, fidtx16 },  // IDTX
    1957             :     { fdct16, fidtx16 },   // V_DCT
    1958             :     { fidtx16, fdct16 },   // H_DCT
    1959             :     { fadst16, fidtx16 },  // V_ADST
    1960             :     { fidtx16, fadst16 },  // H_ADST
    1961             :     { fadst16, fidtx16 },  // V_FLIPADST
    1962             :     { fidtx16, fadst16 },  // H_FLIPADST
    1963             : #endif                     // CONFIG_EXT_TX
    1964             :   };
    1965             : 
    1966           0 :   const transform_2d ht = FHT[tx_type];
    1967             :   tran_low_t out[256];  // column-pass result
    1968             :   int i, j;
    1969             :   tran_low_t temp_in[16], temp_out[16];  // scratch for one 16-sample line
    1970             : 
    1971             : #if CONFIG_EXT_TX
    1972             :   int16_t flipped_input[16 * 16];
    1973           0 :   maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);  // presumably redirects input/stride to flipped_input for the FLIPADST types -- confirm against the helper
    1974             : #endif
    1975             : 
    1976             :   // Columns: column i of the input, scaled by 4, is transformed and scaled back down into column i of `out`.
    1977           0 :   for (i = 0; i < 16; ++i) {
    1978           0 :     for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
    1979           0 :     ht.cols(temp_in, temp_out);
    1980           0 :     for (j = 0; j < 16; ++j)
    1981           0 :       out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;  // adds 1 (2 for negative values), then shifts right by 2
    1982             :   }
    1983             : 
    1984             :   // Rows: row i of `out` is transformed straight into row i of `output`, with no further scaling.
    1985           0 :   for (i = 0; i < 16; ++i) {
    1986           0 :     for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
    1987           0 :     ht.rows(temp_in, temp_out);
    1988           0 :     for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
    1989             :   }
    1990           0 : }
    1991             : 
    1992             : #if CONFIG_HIGHBITDEPTH
    1993           0 : void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 4x4 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    1994             :                          int tx_type) {
    1995           0 :   av1_fht4x4_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht4x4_c
    1996           0 : }
    1997             : 
    1998           0 : void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 4x8 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    1999             :                          int tx_type) {
    2000           0 :   av1_fht4x8_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht4x8_c
    2001           0 : }
    2002             : 
    2003           0 : void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 8x4 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2004             :                          int tx_type) {
    2005           0 :   av1_fht8x4_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht8x4_c
    2006           0 : }
    2007             : 
    2008           0 : void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 8x16 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2009             :                           int tx_type) {
    2010           0 :   av1_fht8x16_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht8x16_c
    2011           0 : }
    2012             : 
    2013           0 : void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 16x8 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2014             :                           int tx_type) {
    2015           0 :   av1_fht16x8_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht16x8_c
    2016           0 : }
    2017             : 
    2018           0 : void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 16x32 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2019             :                            int tx_type) {
    2020           0 :   av1_fht16x32_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht16x32_c
    2021           0 : }
    2022             : 
    2023           0 : void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 32x16 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2024             :                            int tx_type) {
    2025           0 :   av1_fht32x16_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht32x16_c
    2026           0 : }
    2027             : 
    2028           0 : void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 4x16 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2029             :                           int tx_type) {
    2030           0 :   av1_fht4x16_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht4x16_c
    2031           0 : }
    2032             : 
    2033           0 : void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 16x4 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2034             :                           int tx_type) {
    2035           0 :   av1_fht16x4_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht16x4_c
    2036           0 : }
    2037             : 
    2038           0 : void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 8x32 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2039             :                           int tx_type) {
    2040           0 :   av1_fht8x32_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht8x32_c
    2041           0 : }
    2042             : 
    2043           0 : void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 32x8 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2044             :                           int tx_type) {
    2045           0 :   av1_fht32x8_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht32x8_c
    2046           0 : }
    2047             : 
    2048           0 : void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 8x8 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2049             :                          int tx_type) {
    2050           0 :   av1_fht8x8_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht8x8_c
    2051           0 : }
    2052             : 
    2053           0 : void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,  // highbd-named 4x4 forward Walsh-Hadamard transform (compiled under CONFIG_HIGHBITDEPTH)
    2054             :                           int stride) {
    2055           0 :   av1_fwht4x4_c(input, output, stride);  // forwards every argument unchanged to av1_fwht4x4_c
    2056           0 : }
    2057             : 
    2058           0 : void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 16x16 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2059             :                            int tx_type) {
    2060           0 :   av1_fht16x16_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht16x16_c
    2061           0 : }
    2062             : #endif  // CONFIG_HIGHBITDEPTH
    2063             : 
    2064           0 : void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,  // 32x32 forward hybrid transform: column pass then row pass with the 1-D pair chosen by tx_type
    2065             :                     int tx_type) {  // tx_type indexes FHT below without a range check
    2066             :   static const transform_2d FHT[] = {  // only the DCT_DCT entry exists unless CONFIG_EXT_TX is set; fhalfright32 stands in where the comments say ADST
    2067             :     { fdct32, fdct32 },  // DCT_DCT
    2068             : #if CONFIG_EXT_TX
    2069             :     { fhalfright32, fdct32 },        // ADST_DCT
    2070             :     { fdct32, fhalfright32 },        // DCT_ADST
    2071             :     { fhalfright32, fhalfright32 },  // ADST_ADST
    2072             :     { fhalfright32, fdct32 },        // FLIPADST_DCT
    2073             :     { fdct32, fhalfright32 },        // DCT_FLIPADST
    2074             :     { fhalfright32, fhalfright32 },  // FLIPADST_FLIPADST
    2075             :     { fhalfright32, fhalfright32 },  // ADST_FLIPADST
    2076             :     { fhalfright32, fhalfright32 },  // FLIPADST_ADST
    2077             :     { fidtx32, fidtx32 },            // IDTX
    2078             :     { fdct32, fidtx32 },             // V_DCT
    2079             :     { fidtx32, fdct32 },             // H_DCT
    2080             :     { fhalfright32, fidtx32 },       // V_ADST
    2081             :     { fidtx32, fhalfright32 },       // H_ADST
    2082             :     { fhalfright32, fidtx32 },       // V_FLIPADST
    2083             :     { fidtx32, fhalfright32 },       // H_FLIPADST
    2084             : #endif
    2085             :   };
    2086           0 :   const transform_2d ht = FHT[tx_type];
    2087             :   tran_low_t out[1024];  // column-pass result
    2088             :   int i, j;
    2089             :   tran_low_t temp_in[32], temp_out[32];  // scratch for one 32-sample line
    2090             : 
    2091             : #if CONFIG_EXT_TX
    2092             :   int16_t flipped_input[32 * 32];
    2093           0 :   maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);  // presumably redirects input/stride to flipped_input for the FLIPADST types -- confirm against the helper
    2094             : #endif
    2095             : 
    2096             :   // Columns: column i of the input, scaled by 4, is transformed and scaled back down into column i of `out`.
    2097           0 :   for (i = 0; i < 32; ++i) {
    2098           0 :     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
    2099           0 :     ht.cols(temp_in, temp_out);
    2100           0 :     for (j = 0; j < 32; ++j)
    2101           0 :       out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);  // macro defined elsewhere; presumably a sign-aware rounding shift by 4 bits -- confirm
    2102             :   }
    2103             : 
    2104             :   // Rows: row i of `out` is transformed straight into row i of `output`, with no further scaling.
    2105           0 :   for (i = 0; i < 32; ++i) {
    2106           0 :     for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
    2107           0 :     ht.rows(temp_in, temp_out);
    2108           0 :     for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
    2109             :   }
    2110           0 : }
    2111             : 
    2112             : #if CONFIG_TX64X64
    2113             : #if CONFIG_EXT_TX
    2114             : static void fidtx64(const tran_low_t *input, tran_low_t *output) {  // 64-point 1-D transform used for the IDTX / V_* / H_* entries: a per-sample scaling with no mixing between samples
    2115             :   int i;
    2116             :   for (i = 0; i < 64; ++i)
    2117             :     output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);  // each output depends only on the input at the same index
    2118             : }
    2119             : 
    2120             : // For use in lieu of ADST: the first 32 inputs are only rescaled into output[32..63]; the last 32 inputs go through fdct32.
    2121             : static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
    2122             :   int i;
    2123             :   tran_low_t inputhalf[32];  // scaled copy of input[32..63], fed to fdct32
    2124             :   for (i = 0; i < 32; ++i) {
    2125             :     output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);  // upper half of the output: lower half of the input, scaled
    2126             :   }
    2127             :   // Multiply input by sqrt(2)
    2128             :   for (i = 0; i < 32; ++i) {
    2129             :     inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
    2130             :   }
    2131             :   fdct32(inputhalf, output);  // presumably fills output[0..31]; output[32..63] was written above
    2132             :   // Note overall scaling factor is 2 times unitary
    2133             : }
    2134             : #endif  // CONFIG_EXT_TX
    2135             : 
    2136             : static void fdct64_col(const tran_low_t *input, tran_low_t *output) {  // adapter giving av1_fdct64_new the (input, output) shape used by the FHT tables, with the column-pass parameter tables
    2137             :   int32_t in[64], out[64];  // av1_fdct64_new is called on int32_t buffers, so samples are copied in and out
    2138             :   int i;
    2139             :   for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
    2140             :   av1_fdct64_new(in, out, fwd_cos_bit_col_dct_64, fwd_stage_range_col_dct_64);
    2141             :   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
    2142             : }
    2143             : 
    2144             : static void fdct64_row(const tran_low_t *input, tran_low_t *output) {  // same adapter as fdct64_col, but passes the row-pass parameter tables
    2145             :   int32_t in[64], out[64];  // av1_fdct64_new is called on int32_t buffers, so samples are copied in and out
    2146             :   int i;
    2147             :   for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
    2148             :   av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64);
    2149             :   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
    2150             : }
    2151             : 
    2152             : void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,  // 64x64 forward hybrid transform: column pass then row pass with the 1-D pair chosen by tx_type
    2153             :                     int tx_type) {  // tx_type indexes FHT below without a range check
    2154             :   static const transform_2d FHT[] = {  // only the DCT_DCT entry exists unless CONFIG_EXT_TX is set; DCT uses separate column/row wrappers
    2155             :     { fdct64_col, fdct64_row },  // DCT_DCT
    2156             : #if CONFIG_EXT_TX
    2157             :     { fhalfright64, fdct64_row },    // ADST_DCT
    2158             :     { fdct64_col, fhalfright64 },    // DCT_ADST
    2159             :     { fhalfright64, fhalfright64 },  // ADST_ADST
    2160             :     { fhalfright64, fdct64_row },    // FLIPADST_DCT
    2161             :     { fdct64_col, fhalfright64 },    // DCT_FLIPADST
    2162             :     { fhalfright64, fhalfright64 },  // FLIPADST_FLIPADST
    2163             :     { fhalfright64, fhalfright64 },  // ADST_FLIPADST
    2164             :     { fhalfright64, fhalfright64 },  // FLIPADST_ADST
    2165             :     { fidtx64, fidtx64 },            // IDTX
    2166             :     { fdct64_col, fidtx64 },         // V_DCT
    2167             :     { fidtx64, fdct64_row },         // H_DCT
    2168             :     { fhalfright64, fidtx64 },       // V_ADST
    2169             :     { fidtx64, fhalfright64 },       // H_ADST
    2170             :     { fhalfright64, fidtx64 },       // V_FLIPADST
    2171             :     { fidtx64, fhalfright64 },       // H_FLIPADST
    2172             : #endif
    2173             :   };
    2174             :   const transform_2d ht = FHT[tx_type];
    2175             :   tran_low_t out[4096];  // column-pass result (automatic storage)
    2176             :   int i, j;
    2177             :   tran_low_t temp_in[64], temp_out[64];  // scratch for one 64-sample line
    2178             : #if CONFIG_EXT_TX
    2179             :   int16_t flipped_input[64 * 64];
    2180             :   maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);  // presumably redirects input/stride to flipped_input for the FLIPADST types -- confirm against the helper
    2181             : #endif
    2182             :   // Columns: unlike the 8x8, 16x16 and 32x32 versions in this file, the input is not multiplied by 4 here.
    2183             :   for (i = 0; i < 64; ++i) {
    2184             :     for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
    2185             :     ht.cols(temp_in, temp_out);
    2186             :     for (j = 0; j < 64; ++j)
    2187             :       out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;  // adds 1 (2 for positive values), then shifts right by 2. NOTE(review): the row pass below biases negative values instead -- confirm the asymmetry is intended
    2188             :   }
    2189             : 
    2190             :   // Rows: row i of `out` is transformed and scaled down by a further rounding shift into row i of `output`.
    2191             :   for (i = 0; i < 64; ++i) {
    2192             :     for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
    2193             :     ht.rows(temp_in, temp_out);
    2194             :     for (j = 0; j < 64; ++j)
    2195             :       output[j + i * 64] =
    2196             :           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);  // adds 1 (2 for negative values), then shifts right by 2
    2197             :   }
    2198             : }
    2199             : #endif  // CONFIG_TX64X64
    2200             : 
    2201             : #if CONFIG_EXT_TX
    2202             : // Forward identity transform: copies a bs x bs block of src_diff into coeff, scaled by a power of two chosen from bs.
    2203           0 : void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,  // src_diff rows are `stride` apart; coeff rows are packed `bs` apart
    2204             :                     int bs, int tx_type) {
    2205             :   int r, c;
    2206           0 :   const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);  // scale of 8 for bs < 32, 4 for 32 <= bs < 64, 2 otherwise
    2207           0 :   if (tx_type == IDTX) {  // for any other tx_type nothing is written to coeff
    2208           0 :     for (r = 0; r < bs; ++r) {
    2209           0 :       for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);  // scaling is written as a multiply by 2^shift
    2210           0 :       src_diff += stride;
    2211           0 :       coeff += bs;
    2212             :     }
    2213             :   }
    2214           0 : }
    2215             : #endif  // CONFIG_EXT_TX
    2216             : 
    2217             : #if CONFIG_HIGHBITDEPTH
    2218           0 : void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 32x32 forward hybrid transform (compiled under CONFIG_HIGHBITDEPTH)
    2219             :                            int tx_type) {
    2220           0 :   av1_fht32x32_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht32x32_c
    2221           0 : }
    2222             : 
    2223             : #if CONFIG_TX64X64
    2224             : void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,  // highbd-named 64x64 forward hybrid transform (needs CONFIG_HIGHBITDEPTH and CONFIG_TX64X64)
    2225             :                            int tx_type) {
    2226             :   av1_fht64x64_c(input, output, stride, tx_type);  // forwards every argument unchanged to av1_fht64x64_c
    2227             : }
    2228             : #endif  // CONFIG_TX64X64
    2229             : #endif  // CONFIG_HIGHBITDEPTH
    2230             : 
    2231             : #if CONFIG_DPCM_INTRA
    2232             : void av1_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,  // 4-point 1-D forward transform of samples input[0], input[stride], ..., selected by tx_type
    2233             :                     tran_low_t *output) {
    2234             :   assert(tx_type < TX_TYPES_1D);  // guards the table lookup below; compiled out when NDEBUG is defined
    2235             :   static const transform_1d FHT[] = { fdct4, fadst4, fadst4, fidtx4 };  // indexed by tx_type; entries 1 and 2 are both fadst4
    2236             :   const transform_1d ft = FHT[tx_type];
    2237             :   tran_low_t temp_in[4];
    2238             :   for (int i = 0; i < 4; ++i)
    2239             :     temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 4 * Sqrt2);  // pre-scale each sample by 4 * Sqrt2 before the transform
    2240             :   ft(temp_in, output);
    2241             : }
    2242             : 
    2243             : void av1_dpcm_ft8_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,  // 8-point 1-D forward transform of samples input[0], input[stride], ..., selected by tx_type
    2244             :                     tran_low_t *output) {
    2245             :   assert(tx_type < TX_TYPES_1D);  // guards the table lookup below; compiled out when NDEBUG is defined
    2246             :   static const transform_1d FHT[] = { fdct8, fadst8, fadst8, fidtx8 };  // indexed by tx_type; entries 1 and 2 are both fadst8
    2247             :   const transform_1d ft = FHT[tx_type];
    2248             :   tran_low_t temp_in[8];
    2249             :   for (int i = 0; i < 8; ++i) temp_in[i] = input[i * stride] * 4;  // pre-scale each sample by 4 (no Sqrt2 factor at this size)
    2250             :   ft(temp_in, output);
    2251             : }
    2252             : 
    2253             : void av1_dpcm_ft16_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,  // 16-point 1-D forward transform of samples input[0], input[stride], ..., selected by tx_type
    2254             :                      tran_low_t *output) {
    2255             :   assert(tx_type < TX_TYPES_1D);  // guards the table lookup below; compiled out when NDEBUG is defined
    2256             :   static const transform_1d FHT[] = { fdct16, fadst16, fadst16, fidtx16 };  // indexed by tx_type; entries 1 and 2 are both fadst16
    2257             :   const transform_1d ft = FHT[tx_type];
    2258             :   tran_low_t temp_in[16];
    2259             :   for (int i = 0; i < 16; ++i)
    2260             :     temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 2 * Sqrt2);  // pre-scale each sample by 2 * Sqrt2 before the transform
    2261             :   ft(temp_in, output);
    2262             : }
    2263             : 
    2264             : void av1_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,  // 32-point 1-D forward transform of samples input[0], input[stride], ..., selected by tx_type
    2265             :                      tran_low_t *output) {
    2266             :   assert(tx_type < TX_TYPES_1D);  // guards the table lookup below; compiled out when NDEBUG is defined
    2267             :   static const transform_1d FHT[] = { fdct32, fhalfright32, fhalfright32,  // indexed by tx_type; entries 1 and 2 are both fhalfright32
    2268             :                                       fidtx32 };
    2269             :   const transform_1d ft = FHT[tx_type];
    2270             :   tran_low_t temp_in[32];
    2271             :   for (int i = 0; i < 32; ++i) temp_in[i] = input[i * stride];  // samples are gathered without any pre-scaling at this size
    2272             :   ft(temp_in, output);
    2273             : }
    2274             : #endif  // CONFIG_DPCM_INTRA
    2275             : #endif  // !AV1_DCT_GTEST

Generated by: LCOV version 1.13