LCOV - output.info - third_party/aom/aom_dsp/fwd

LCOV - code coverage report

Current view:	top level - third_party/aom/aom_dsp - fwd_txfm.c (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	593	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	20	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "aom_dsp/fwd_txfm.h"
      13             : #include <assert.h>
      14             : #include "./aom_dsp_rtcd.h"
      15             : 
      16           0 : void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
      17             :   // The 2D transform is done with two passes which are actually pretty
      18             :   // similar. In the first one, we transform the columns and transpose
      19             :   // the results. In the second one, we transform the rows. To achieve that,
      20             :   // as the first pass results are transposed, we transpose the columns (that
      21             :   // is the transposed rows) and transpose the results (so that it goes back
      22             :   // in normal/row positions).
      23             :   int pass;
      24             :   // We need an intermediate buffer between passes.
      25             :   tran_low_t intermediate[4 * 4];
      26           0 :   const tran_low_t *in_low = NULL;
      27           0 :   tran_low_t *out = intermediate;
      28             :   // Do the two transform/transpose passes
      29           0 :   for (pass = 0; pass < 2; ++pass) {
      30             :     tran_high_t in_high[4];    // canbe16
      31             :     tran_high_t step[4];       // canbe16
      32             :     tran_high_t temp1, temp2;  // needs32
      33             :     int i;
      34           0 :     for (i = 0; i < 4; ++i) {
      35             :       // Load inputs.
      36           0 :       if (pass == 0) {
      37           0 :         in_high[0] = input[0 * stride] * 16;
      38           0 :         in_high[1] = input[1 * stride] * 16;
      39           0 :         in_high[2] = input[2 * stride] * 16;
      40           0 :         in_high[3] = input[3 * stride] * 16;
      41           0 :         if (i == 0 && in_high[0]) {
      42           0 :           ++in_high[0];
      43             :         }
      44             :       } else {
      45           0 :         assert(in_low != NULL);
      46           0 :         in_high[0] = in_low[0 * 4];
      47           0 :         in_high[1] = in_low[1 * 4];
      48           0 :         in_high[2] = in_low[2 * 4];
      49           0 :         in_high[3] = in_low[3 * 4];
      50           0 :         ++in_low;
      51             :       }
      52             :       // Transform.
      53           0 :       step[0] = in_high[0] + in_high[3];
      54           0 :       step[1] = in_high[1] + in_high[2];
      55           0 :       step[2] = in_high[1] - in_high[2];
      56           0 :       step[3] = in_high[0] - in_high[3];
      57           0 :       temp1 = (step[0] + step[1]) * cospi_16_64;
      58           0 :       temp2 = (step[0] - step[1]) * cospi_16_64;
      59           0 :       out[0] = (tran_low_t)fdct_round_shift(temp1);
      60           0 :       out[2] = (tran_low_t)fdct_round_shift(temp2);
      61           0 :       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
      62           0 :       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
      63           0 :       out[1] = (tran_low_t)fdct_round_shift(temp1);
      64           0 :       out[3] = (tran_low_t)fdct_round_shift(temp2);
      65             :       // Do next column (which is a transposed row in second/horizontal pass)
      66           0 :       ++input;
      67           0 :       out += 4;
      68             :     }
      69             :     // Setup in/out for next pass.
      70           0 :     in_low = intermediate;
      71           0 :     out = output;
      72             :   }
      73             : 
      74             :   {
      75             :     int i, j;
      76           0 :     for (i = 0; i < 4; ++i) {
      77           0 :       for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
      78             :     }
      79             :   }
      80           0 : }
      81             : 
      82           0 : void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
      83             :   int r, c;
      84           0 :   tran_low_t sum = 0;
      85           0 :   for (r = 0; r < 4; ++r)
      86           0 :     for (c = 0; c < 4; ++c) sum += input[r * stride + c];
      87             : 
      88           0 :   output[0] = sum << 1;
      89           0 : }
      90             : 
      91           0 : void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
      92             :   int i, j;
      93             :   tran_low_t intermediate[64];
      94             :   int pass;
      95           0 :   tran_low_t *output = intermediate;
      96           0 :   const tran_low_t *in = NULL;
      97             : 
      98             :   // Transform columns
      99           0 :   for (pass = 0; pass < 2; ++pass) {
     100             :     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
     101             :     tran_high_t t0, t1, t2, t3;                  // needs32
     102             :     tran_high_t x0, x1, x2, x3;                  // canbe16
     103             : 
     104           0 :     for (i = 0; i < 8; i++) {
     105             :       // stage 1
     106           0 :       if (pass == 0) {
     107           0 :         s0 = (input[0 * stride] + input[7 * stride]) * 4;
     108           0 :         s1 = (input[1 * stride] + input[6 * stride]) * 4;
     109           0 :         s2 = (input[2 * stride] + input[5 * stride]) * 4;
     110           0 :         s3 = (input[3 * stride] + input[4 * stride]) * 4;
     111           0 :         s4 = (input[3 * stride] - input[4 * stride]) * 4;
     112           0 :         s5 = (input[2 * stride] - input[5 * stride]) * 4;
     113           0 :         s6 = (input[1 * stride] - input[6 * stride]) * 4;
     114           0 :         s7 = (input[0 * stride] - input[7 * stride]) * 4;
     115           0 :         ++input;
     116             :       } else {
     117           0 :         s0 = in[0 * 8] + in[7 * 8];
     118           0 :         s1 = in[1 * 8] + in[6 * 8];
     119           0 :         s2 = in[2 * 8] + in[5 * 8];
     120           0 :         s3 = in[3 * 8] + in[4 * 8];
     121           0 :         s4 = in[3 * 8] - in[4 * 8];
     122           0 :         s5 = in[2 * 8] - in[5 * 8];
     123           0 :         s6 = in[1 * 8] - in[6 * 8];
     124           0 :         s7 = in[0 * 8] - in[7 * 8];
     125           0 :         ++in;
     126             :       }
     127             : 
     128             :       // fdct4(step, step);
     129           0 :       x0 = s0 + s3;
     130           0 :       x1 = s1 + s2;
     131           0 :       x2 = s1 - s2;
     132           0 :       x3 = s0 - s3;
     133           0 :       t0 = (x0 + x1) * cospi_16_64;
     134           0 :       t1 = (x0 - x1) * cospi_16_64;
     135           0 :       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
     136           0 :       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
     137           0 :       output[0] = (tran_low_t)fdct_round_shift(t0);
     138           0 :       output[2] = (tran_low_t)fdct_round_shift(t2);
     139           0 :       output[4] = (tran_low_t)fdct_round_shift(t1);
     140           0 :       output[6] = (tran_low_t)fdct_round_shift(t3);
     141             : 
     142             :       // Stage 2
     143           0 :       t0 = (s6 - s5) * cospi_16_64;
     144           0 :       t1 = (s6 + s5) * cospi_16_64;
     145           0 :       t2 = fdct_round_shift(t0);
     146           0 :       t3 = fdct_round_shift(t1);
     147             : 
     148             :       // Stage 3
     149           0 :       x0 = s4 + t2;
     150           0 :       x1 = s4 - t2;
     151           0 :       x2 = s7 - t3;
     152           0 :       x3 = s7 + t3;
     153             : 
     154             :       // Stage 4
     155           0 :       t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
     156           0 :       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
     157           0 :       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     158           0 :       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
     159           0 :       output[1] = (tran_low_t)fdct_round_shift(t0);
     160           0 :       output[3] = (tran_low_t)fdct_round_shift(t2);
     161           0 :       output[5] = (tran_low_t)fdct_round_shift(t1);
     162           0 :       output[7] = (tran_low_t)fdct_round_shift(t3);
     163           0 :       output += 8;
     164             :     }
     165           0 :     in = intermediate;
     166           0 :     output = final_output;
     167             :   }
     168             : 
     169             :   // Rows
     170           0 :   for (i = 0; i < 8; ++i) {
     171           0 :     for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
     172             :   }
     173           0 : }
     174             : 
     175           0 : void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
     176             :   int r, c;
     177           0 :   tran_low_t sum = 0;
     178           0 :   for (r = 0; r < 8; ++r)
     179           0 :     for (c = 0; c < 8; ++c) sum += input[r * stride + c];
     180             : 
     181           0 :   output[0] = sum;
     182           0 : }
     183             : 
     184           0 : void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
     185             :   // The 2D transform is done with two passes which are actually pretty
     186             :   // similar. In the first one, we transform the columns and transpose
     187             :   // the results. In the second one, we transform the rows. To achieve that,
     188             :   // as the first pass results are transposed, we transpose the columns (that
     189             :   // is the transposed rows) and transpose the results (so that it goes back
     190             :   // in normal/row positions).
     191             :   int pass;
     192             :   // We need an intermediate buffer between passes.
     193             :   tran_low_t intermediate[256];
     194           0 :   const tran_low_t *in_low = NULL;
     195           0 :   tran_low_t *out = intermediate;
     196             :   // Do the two transform/transpose passes
     197           0 :   for (pass = 0; pass < 2; ++pass) {
     198             :     tran_high_t step1[8];      // canbe16
     199             :     tran_high_t step2[8];      // canbe16
     200             :     tran_high_t step3[8];      // canbe16
     201             :     tran_high_t in_high[8];    // canbe16
     202             :     tran_high_t temp1, temp2;  // needs32
     203             :     int i;
     204           0 :     for (i = 0; i < 16; i++) {
     205           0 :       if (0 == pass) {
     206             :         // Calculate input for the first 8 results.
     207           0 :         in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
     208           0 :         in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
     209           0 :         in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
     210           0 :         in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
     211           0 :         in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
     212           0 :         in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
     213           0 :         in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
     214           0 :         in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
     215             :         // Calculate input for the next 8 results.
     216           0 :         step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
     217           0 :         step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
     218           0 :         step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
     219           0 :         step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
     220           0 :         step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
     221           0 :         step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
     222           0 :         step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
     223           0 :         step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
     224             :       } else {
     225             :         // Calculate input for the first 8 results.
     226           0 :         assert(in_low != NULL);
     227           0 :         in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
     228           0 :         in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
     229           0 :         in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
     230           0 :         in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
     231           0 :         in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
     232           0 :         in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
     233           0 :         in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
     234           0 :         in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
     235             :         // Calculate input for the next 8 results.
     236           0 :         step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
     237           0 :         step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
     238           0 :         step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
     239           0 :         step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
     240           0 :         step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
     241           0 :         step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
     242           0 :         step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
     243           0 :         step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
     244           0 :         in_low++;
     245             :       }
     246             :       // Work on the first eight values; fdct8(input, even_results);
     247             :       {
     248             :         tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
     249             :         tran_high_t t0, t1, t2, t3;                  // needs32
     250             :         tran_high_t x0, x1, x2, x3;                  // canbe16
     251             : 
     252             :         // stage 1
     253           0 :         s0 = in_high[0] + in_high[7];
     254           0 :         s1 = in_high[1] + in_high[6];
     255           0 :         s2 = in_high[2] + in_high[5];
     256           0 :         s3 = in_high[3] + in_high[4];
     257           0 :         s4 = in_high[3] - in_high[4];
     258           0 :         s5 = in_high[2] - in_high[5];
     259           0 :         s6 = in_high[1] - in_high[6];
     260           0 :         s7 = in_high[0] - in_high[7];
     261             : 
     262             :         // fdct4(step, step);
     263           0 :         x0 = s0 + s3;
     264           0 :         x1 = s1 + s2;
     265           0 :         x2 = s1 - s2;
     266           0 :         x3 = s0 - s3;
     267           0 :         t0 = (x0 + x1) * cospi_16_64;
     268           0 :         t1 = (x0 - x1) * cospi_16_64;
     269           0 :         t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
     270           0 :         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
     271           0 :         out[0] = (tran_low_t)fdct_round_shift(t0);
     272           0 :         out[4] = (tran_low_t)fdct_round_shift(t2);
     273           0 :         out[8] = (tran_low_t)fdct_round_shift(t1);
     274           0 :         out[12] = (tran_low_t)fdct_round_shift(t3);
     275             : 
     276             :         // Stage 2
     277           0 :         t0 = (s6 - s5) * cospi_16_64;
     278           0 :         t1 = (s6 + s5) * cospi_16_64;
     279           0 :         t2 = fdct_round_shift(t0);
     280           0 :         t3 = fdct_round_shift(t1);
     281             : 
     282             :         // Stage 3
     283           0 :         x0 = s4 + t2;
     284           0 :         x1 = s4 - t2;
     285           0 :         x2 = s7 - t3;
     286           0 :         x3 = s7 + t3;
     287             : 
     288             :         // Stage 4
     289           0 :         t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
     290           0 :         t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
     291           0 :         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     292           0 :         t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
     293           0 :         out[2] = (tran_low_t)fdct_round_shift(t0);
     294           0 :         out[6] = (tran_low_t)fdct_round_shift(t2);
     295           0 :         out[10] = (tran_low_t)fdct_round_shift(t1);
     296           0 :         out[14] = (tran_low_t)fdct_round_shift(t3);
     297             :       }
     298             :       // Work on the next eight values; step1 -> odd_results
     299             :       {
     300             :         // step 2
     301           0 :         temp1 = (step1[5] - step1[2]) * cospi_16_64;
     302           0 :         temp2 = (step1[4] - step1[3]) * cospi_16_64;
     303           0 :         step2[2] = fdct_round_shift(temp1);
     304           0 :         step2[3] = fdct_round_shift(temp2);
     305           0 :         temp1 = (step1[4] + step1[3]) * cospi_16_64;
     306           0 :         temp2 = (step1[5] + step1[2]) * cospi_16_64;
     307           0 :         step2[4] = fdct_round_shift(temp1);
     308           0 :         step2[5] = fdct_round_shift(temp2);
     309             :         // step 3
     310           0 :         step3[0] = step1[0] + step2[3];
     311           0 :         step3[1] = step1[1] + step2[2];
     312           0 :         step3[2] = step1[1] - step2[2];
     313           0 :         step3[3] = step1[0] - step2[3];
     314           0 :         step3[4] = step1[7] - step2[4];
     315           0 :         step3[5] = step1[6] - step2[5];
     316           0 :         step3[6] = step1[6] + step2[5];
     317           0 :         step3[7] = step1[7] + step2[4];
     318             :         // step 4
     319           0 :         temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
     320           0 :         temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
     321           0 :         step2[1] = fdct_round_shift(temp1);
     322           0 :         step2[2] = fdct_round_shift(temp2);
     323           0 :         temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
     324           0 :         temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
     325           0 :         step2[5] = fdct_round_shift(temp1);
     326           0 :         step2[6] = fdct_round_shift(temp2);
     327             :         // step 5
     328           0 :         step1[0] = step3[0] + step2[1];
     329           0 :         step1[1] = step3[0] - step2[1];
     330           0 :         step1[2] = step3[3] + step2[2];
     331           0 :         step1[3] = step3[3] - step2[2];
     332           0 :         step1[4] = step3[4] - step2[5];
     333           0 :         step1[5] = step3[4] + step2[5];
     334           0 :         step1[6] = step3[7] - step2[6];
     335           0 :         step1[7] = step3[7] + step2[6];
     336             :         // step 6
     337           0 :         temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
     338           0 :         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
     339           0 :         out[1] = (tran_low_t)fdct_round_shift(temp1);
     340           0 :         out[9] = (tran_low_t)fdct_round_shift(temp2);
     341           0 :         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
     342           0 :         temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
     343           0 :         out[5] = (tran_low_t)fdct_round_shift(temp1);
     344           0 :         out[13] = (tran_low_t)fdct_round_shift(temp2);
     345           0 :         temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
     346           0 :         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
     347           0 :         out[3] = (tran_low_t)fdct_round_shift(temp1);
     348           0 :         out[11] = (tran_low_t)fdct_round_shift(temp2);
     349           0 :         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
     350           0 :         temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
     351           0 :         out[7] = (tran_low_t)fdct_round_shift(temp1);
     352           0 :         out[15] = (tran_low_t)fdct_round_shift(temp2);
     353             :       }
     354             :       // Do next column (which is a transposed row in second/horizontal pass)
     355           0 :       input++;
     356           0 :       out += 16;
     357             :     }
     358             :     // Setup in/out for next pass.
     359           0 :     in_low = intermediate;
     360           0 :     out = output;
     361             :   }
     362           0 : }
     363             : 
     364           0 : void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
     365             :   int r, c;
     366           0 :   int sum = 0;
     367           0 :   for (r = 0; r < 16; ++r)
     368           0 :     for (c = 0; c < 16; ++c) sum += input[r * stride + c];
     369             : 
     370           0 :   output[0] = (tran_low_t)(sum >> 1);
     371           0 : }
     372             : 
     373           0 : static INLINE tran_high_t dct_32_round(tran_high_t input) {
     374           0 :   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
     375             :   // TODO(debargha, peter.derivaz): Find new bounds for this assert,
     376             :   // and make the bounds consts.
     377             :   // assert(-131072 <= rv && rv <= 131071);
     378           0 :   return rv;
     379             : }
     380             : 
     381           0 : static INLINE tran_high_t half_round_shift(tran_high_t input) {
     382           0 :   tran_high_t rv = (input + 1 + (input < 0)) >> 2;
     383           0 :   return rv;
     384             : }
     385             : 
     386           0 : void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
     387             :   tran_high_t step[32];
     388             :   // Stage 1
     389           0 :   step[0] = input[0] + input[(32 - 1)];
     390           0 :   step[1] = input[1] + input[(32 - 2)];
     391           0 :   step[2] = input[2] + input[(32 - 3)];
     392           0 :   step[3] = input[3] + input[(32 - 4)];
     393           0 :   step[4] = input[4] + input[(32 - 5)];
     394           0 :   step[5] = input[5] + input[(32 - 6)];
     395           0 :   step[6] = input[6] + input[(32 - 7)];
     396           0 :   step[7] = input[7] + input[(32 - 8)];
     397           0 :   step[8] = input[8] + input[(32 - 9)];
     398           0 :   step[9] = input[9] + input[(32 - 10)];
     399           0 :   step[10] = input[10] + input[(32 - 11)];
     400           0 :   step[11] = input[11] + input[(32 - 12)];
     401           0 :   step[12] = input[12] + input[(32 - 13)];
     402           0 :   step[13] = input[13] + input[(32 - 14)];
     403           0 :   step[14] = input[14] + input[(32 - 15)];
     404           0 :   step[15] = input[15] + input[(32 - 16)];
     405           0 :   step[16] = -input[16] + input[(32 - 17)];
     406           0 :   step[17] = -input[17] + input[(32 - 18)];
     407           0 :   step[18] = -input[18] + input[(32 - 19)];
     408           0 :   step[19] = -input[19] + input[(32 - 20)];
     409           0 :   step[20] = -input[20] + input[(32 - 21)];
     410           0 :   step[21] = -input[21] + input[(32 - 22)];
     411           0 :   step[22] = -input[22] + input[(32 - 23)];
     412           0 :   step[23] = -input[23] + input[(32 - 24)];
     413           0 :   step[24] = -input[24] + input[(32 - 25)];
     414           0 :   step[25] = -input[25] + input[(32 - 26)];
     415           0 :   step[26] = -input[26] + input[(32 - 27)];
     416           0 :   step[27] = -input[27] + input[(32 - 28)];
     417           0 :   step[28] = -input[28] + input[(32 - 29)];
     418           0 :   step[29] = -input[29] + input[(32 - 30)];
     419           0 :   step[30] = -input[30] + input[(32 - 31)];
     420           0 :   step[31] = -input[31] + input[(32 - 32)];
     421             : 
     422             :   // Stage 2
     423           0 :   output[0] = step[0] + step[16 - 1];
     424           0 :   output[1] = step[1] + step[16 - 2];
     425           0 :   output[2] = step[2] + step[16 - 3];
     426           0 :   output[3] = step[3] + step[16 - 4];
     427           0 :   output[4] = step[4] + step[16 - 5];
     428           0 :   output[5] = step[5] + step[16 - 6];
     429           0 :   output[6] = step[6] + step[16 - 7];
     430           0 :   output[7] = step[7] + step[16 - 8];
     431           0 :   output[8] = -step[8] + step[16 - 9];
     432           0 :   output[9] = -step[9] + step[16 - 10];
     433           0 :   output[10] = -step[10] + step[16 - 11];
     434           0 :   output[11] = -step[11] + step[16 - 12];
     435           0 :   output[12] = -step[12] + step[16 - 13];
     436           0 :   output[13] = -step[13] + step[16 - 14];
     437           0 :   output[14] = -step[14] + step[16 - 15];
     438           0 :   output[15] = -step[15] + step[16 - 16];
     439             : 
     440           0 :   output[16] = step[16];
     441           0 :   output[17] = step[17];
     442           0 :   output[18] = step[18];
     443           0 :   output[19] = step[19];
     444             : 
     445           0 :   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
     446           0 :   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
     447           0 :   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
     448           0 :   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
     449             : 
     450           0 :   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
     451           0 :   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
     452           0 :   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
     453           0 :   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
     454             : 
     455           0 :   output[28] = step[28];
     456           0 :   output[29] = step[29];
     457           0 :   output[30] = step[30];
     458           0 :   output[31] = step[31];
     459             : 
     460             :   // dump the magnitude by 4, hence the intermediate values are within
     461             :   // the range of 16 bits.
     462           0 :   if (round) {
     463           0 :     output[0] = half_round_shift(output[0]);
     464           0 :     output[1] = half_round_shift(output[1]);
     465           0 :     output[2] = half_round_shift(output[2]);
     466           0 :     output[3] = half_round_shift(output[3]);
     467           0 :     output[4] = half_round_shift(output[4]);
     468           0 :     output[5] = half_round_shift(output[5]);
     469           0 :     output[6] = half_round_shift(output[6]);
     470           0 :     output[7] = half_round_shift(output[7]);
     471           0 :     output[8] = half_round_shift(output[8]);
     472           0 :     output[9] = half_round_shift(output[9]);
     473           0 :     output[10] = half_round_shift(output[10]);
     474           0 :     output[11] = half_round_shift(output[11]);
     475           0 :     output[12] = half_round_shift(output[12]);
     476           0 :     output[13] = half_round_shift(output[13]);
     477           0 :     output[14] = half_round_shift(output[14]);
     478           0 :     output[15] = half_round_shift(output[15]);
     479             : 
     480           0 :     output[16] = half_round_shift(output[16]);
     481           0 :     output[17] = half_round_shift(output[17]);
     482           0 :     output[18] = half_round_shift(output[18]);
     483           0 :     output[19] = half_round_shift(output[19]);
     484           0 :     output[20] = half_round_shift(output[20]);
     485           0 :     output[21] = half_round_shift(output[21]);
     486           0 :     output[22] = half_round_shift(output[22]);
     487           0 :     output[23] = half_round_shift(output[23]);
     488           0 :     output[24] = half_round_shift(output[24]);
     489           0 :     output[25] = half_round_shift(output[25]);
     490           0 :     output[26] = half_round_shift(output[26]);
     491           0 :     output[27] = half_round_shift(output[27]);
     492           0 :     output[28] = half_round_shift(output[28]);
     493           0 :     output[29] = half_round_shift(output[29]);
     494           0 :     output[30] = half_round_shift(output[30]);
     495           0 :     output[31] = half_round_shift(output[31]);
     496             :   }
     497             : 
     498             :   // Stage 3
     499           0 :   step[0] = output[0] + output[(8 - 1)];
     500           0 :   step[1] = output[1] + output[(8 - 2)];
     501           0 :   step[2] = output[2] + output[(8 - 3)];
     502           0 :   step[3] = output[3] + output[(8 - 4)];
     503           0 :   step[4] = -output[4] + output[(8 - 5)];
     504           0 :   step[5] = -output[5] + output[(8 - 6)];
     505           0 :   step[6] = -output[6] + output[(8 - 7)];
     506           0 :   step[7] = -output[7] + output[(8 - 8)];
     507           0 :   step[8] = output[8];
     508           0 :   step[9] = output[9];
     509           0 :   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
     510           0 :   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
     511           0 :   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
     512           0 :   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
     513           0 :   step[14] = output[14];
     514           0 :   step[15] = output[15];
     515             : 
     516           0 :   step[16] = output[16] + output[23];
     517           0 :   step[17] = output[17] + output[22];
     518           0 :   step[18] = output[18] + output[21];
     519           0 :   step[19] = output[19] + output[20];
     520           0 :   step[20] = -output[20] + output[19];
     521           0 :   step[21] = -output[21] + output[18];
     522           0 :   step[22] = -output[22] + output[17];
     523           0 :   step[23] = -output[23] + output[16];
     524           0 :   step[24] = -output[24] + output[31];
     525           0 :   step[25] = -output[25] + output[30];
     526           0 :   step[26] = -output[26] + output[29];
     527           0 :   step[27] = -output[27] + output[28];
     528           0 :   step[28] = output[28] + output[27];
     529           0 :   step[29] = output[29] + output[26];
     530           0 :   step[30] = output[30] + output[25];
     531           0 :   step[31] = output[31] + output[24];
     532             : 
     533             :   // Stage 4
     534           0 :   output[0] = step[0] + step[3];
     535           0 :   output[1] = step[1] + step[2];
     536           0 :   output[2] = -step[2] + step[1];
     537           0 :   output[3] = -step[3] + step[0];
     538           0 :   output[4] = step[4];
     539           0 :   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
     540           0 :   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
     541           0 :   output[7] = step[7];
     542           0 :   output[8] = step[8] + step[11];
     543           0 :   output[9] = step[9] + step[10];
     544           0 :   output[10] = -step[10] + step[9];
     545           0 :   output[11] = -step[11] + step[8];
     546           0 :   output[12] = -step[12] + step[15];
     547           0 :   output[13] = -step[13] + step[14];
     548           0 :   output[14] = step[14] + step[13];
     549           0 :   output[15] = step[15] + step[12];
     550             : 
     551           0 :   output[16] = step[16];
     552           0 :   output[17] = step[17];
     553           0 :   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
     554           0 :   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
     555           0 :   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
     556           0 :   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
     557           0 :   output[22] = step[22];
     558           0 :   output[23] = step[23];
     559           0 :   output[24] = step[24];
     560           0 :   output[25] = step[25];
     561           0 :   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
     562           0 :   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
     563           0 :   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
     564           0 :   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
     565           0 :   output[30] = step[30];
     566           0 :   output[31] = step[31];
     567             : 
     568             :   // Stage 5
     569           0 :   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
     570           0 :   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
     571           0 :   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
     572           0 :   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
     573           0 :   step[4] = output[4] + output[5];
     574           0 :   step[5] = -output[5] + output[4];
     575           0 :   step[6] = -output[6] + output[7];
     576           0 :   step[7] = output[7] + output[6];
     577           0 :   step[8] = output[8];
     578           0 :   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
     579           0 :   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
     580           0 :   step[11] = output[11];
     581           0 :   step[12] = output[12];
     582           0 :   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
     583           0 :   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
     584           0 :   step[15] = output[15];
     585             : 
     586           0 :   step[16] = output[16] + output[19];
     587           0 :   step[17] = output[17] + output[18];
     588           0 :   step[18] = -output[18] + output[17];
     589           0 :   step[19] = -output[19] + output[16];
     590           0 :   step[20] = -output[20] + output[23];
     591           0 :   step[21] = -output[21] + output[22];
     592           0 :   step[22] = output[22] + output[21];
     593           0 :   step[23] = output[23] + output[20];
     594           0 :   step[24] = output[24] + output[27];
     595           0 :   step[25] = output[25] + output[26];
     596           0 :   step[26] = -output[26] + output[25];
     597           0 :   step[27] = -output[27] + output[24];
     598           0 :   step[28] = -output[28] + output[31];
     599           0 :   step[29] = -output[29] + output[30];
     600           0 :   step[30] = output[30] + output[29];
     601           0 :   step[31] = output[31] + output[28];
     602             : 
     603             :   // Stage 6
     604           0 :   output[0] = step[0];
     605           0 :   output[1] = step[1];
     606           0 :   output[2] = step[2];
     607           0 :   output[3] = step[3];
     608           0 :   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
     609           0 :   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
     610           0 :   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
     611           0 :   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
     612           0 :   output[8] = step[8] + step[9];
     613           0 :   output[9] = -step[9] + step[8];
     614           0 :   output[10] = -step[10] + step[11];
     615           0 :   output[11] = step[11] + step[10];
     616           0 :   output[12] = step[12] + step[13];
     617           0 :   output[13] = -step[13] + step[12];
     618           0 :   output[14] = -step[14] + step[15];
     619           0 :   output[15] = step[15] + step[14];
     620             : 
     621           0 :   output[16] = step[16];
     622           0 :   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
     623           0 :   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
     624           0 :   output[19] = step[19];
     625           0 :   output[20] = step[20];
     626           0 :   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
     627           0 :   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
     628           0 :   output[23] = step[23];
     629           0 :   output[24] = step[24];
     630           0 :   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
     631           0 :   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
     632           0 :   output[27] = step[27];
     633           0 :   output[28] = step[28];
     634           0 :   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
     635           0 :   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
     636           0 :   output[31] = step[31];
     637             : 
     638             :   // Stage 7
     639           0 :   step[0] = output[0];
     640           0 :   step[1] = output[1];
     641           0 :   step[2] = output[2];
     642           0 :   step[3] = output[3];
     643           0 :   step[4] = output[4];
     644           0 :   step[5] = output[5];
     645           0 :   step[6] = output[6];
     646           0 :   step[7] = output[7];
     647           0 :   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
     648           0 :   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
     649           0 :   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
     650           0 :   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
     651           0 :   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
     652           0 :   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
     653           0 :   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
     654           0 :   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
     655             : 
     656           0 :   step[16] = output[16] + output[17];
     657           0 :   step[17] = -output[17] + output[16];
     658           0 :   step[18] = -output[18] + output[19];
     659           0 :   step[19] = output[19] + output[18];
     660           0 :   step[20] = output[20] + output[21];
     661           0 :   step[21] = -output[21] + output[20];
     662           0 :   step[22] = -output[22] + output[23];
     663           0 :   step[23] = output[23] + output[22];
     664           0 :   step[24] = output[24] + output[25];
     665           0 :   step[25] = -output[25] + output[24];
     666           0 :   step[26] = -output[26] + output[27];
     667           0 :   step[27] = output[27] + output[26];
     668           0 :   step[28] = output[28] + output[29];
     669           0 :   step[29] = -output[29] + output[28];
     670           0 :   step[30] = -output[30] + output[31];
     671           0 :   step[31] = output[31] + output[30];
     672             : 
     673             :   // Final stage --- outputs indices are bit-reversed.
     674           0 :   output[0] = step[0];
     675           0 :   output[16] = step[1];
     676           0 :   output[8] = step[2];
     677           0 :   output[24] = step[3];
     678           0 :   output[4] = step[4];
     679           0 :   output[20] = step[5];
     680           0 :   output[12] = step[6];
     681           0 :   output[28] = step[7];
     682           0 :   output[2] = step[8];
     683           0 :   output[18] = step[9];
     684           0 :   output[10] = step[10];
     685           0 :   output[26] = step[11];
     686           0 :   output[6] = step[12];
     687           0 :   output[22] = step[13];
     688           0 :   output[14] = step[14];
     689           0 :   output[30] = step[15];
     690             : 
     691           0 :   output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
     692           0 :   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
     693           0 :   output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
     694           0 :   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
     695           0 :   output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
     696           0 :   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
     697           0 :   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
     698           0 :   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
     699           0 :   output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
     700           0 :   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
     701           0 :   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
     702           0 :   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
     703           0 :   output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
     704           0 :   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
     705           0 :   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
     706           0 :   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
     707           0 : }
     708             : 
     709           0 : void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     710             :   int i, j;
     711             :   tran_high_t output[32 * 32];
     712             : 
     713             :   // Columns
     714           0 :   for (i = 0; i < 32; ++i) {
     715             :     tran_high_t temp_in[32], temp_out[32];
     716           0 :     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     717           0 :     aom_fdct32(temp_in, temp_out, 0);
     718           0 :     for (j = 0; j < 32; ++j)
     719           0 :       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
     720             :   }
     721             : 
     722             :   // Rows
     723           0 :   for (i = 0; i < 32; ++i) {
     724             :     tran_high_t temp_in[32], temp_out[32];
     725           0 :     for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
     726           0 :     aom_fdct32(temp_in, temp_out, 0);
     727           0 :     for (j = 0; j < 32; ++j)
     728           0 :       out[j + i * 32] =
     729           0 :           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
     730             :   }
     731           0 : }
     732             : 
     733             : // Note that although we use dct_32_round in dct32 computation flow,
     734             : // this 2d fdct32x32 for rate-distortion optimization loop is operating
     735             : // within 16 bits precision.
     736           0 : void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
     737             :   int i, j;
     738             :   tran_high_t output[32 * 32];
     739             : 
     740             :   // Columns
     741           0 :   for (i = 0; i < 32; ++i) {
     742             :     tran_high_t temp_in[32], temp_out[32];
     743           0 :     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     744           0 :     aom_fdct32(temp_in, temp_out, 0);
     745           0 :     for (j = 0; j < 32; ++j)
     746             :       // TODO(cd): see quality impact of only doing
     747             :       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
     748             :       //           PS: also change code in aom_dsp/x86/aom_dct_sse2.c
     749           0 :       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
     750             :   }
     751             : 
     752             :   // Rows
     753           0 :   for (i = 0; i < 32; ++i) {
     754             :     tran_high_t temp_in[32], temp_out[32];
     755           0 :     for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
     756           0 :     aom_fdct32(temp_in, temp_out, 1);
     757           0 :     for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
     758             :   }
     759           0 : }
     760             : 
     761           0 : void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
     762             :   int r, c;
     763           0 :   int sum = 0;
     764           0 :   for (r = 0; r < 32; ++r)
     765           0 :     for (c = 0; c < 32; ++c) sum += input[r * stride + c];
     766             : 
     767           0 :   output[0] = (tran_low_t)(sum >> 3);
     768           0 : }
     769             : 
     770             : #if CONFIG_HIGHBITDEPTH
     // aom_highbd_* entry point for the 4x4 forward transform. It does no work
     // of its own: input, output and stride are forwarded unchanged to
     // aom_fdct4x4_c(). Compiled only when CONFIG_HIGHBITDEPTH is non-zero.
     771           0 : void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
     772             :                           int stride) {
     773           0 :   aom_fdct4x4_c(input, output, stride);
     774           0 : }
     775             : 
     // aom_highbd_* entry point for the 8x8 forward transform. It does no work
     // of its own: input, final_output and stride are forwarded unchanged to
     // aom_fdct8x8_c(). Compiled only when CONFIG_HIGHBITDEPTH is non-zero.
     776           0 : void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
     777             :                           int stride) {
     778           0 :   aom_fdct8x8_c(input, final_output, stride);
     779           0 : }
     780             : 
     // aom_highbd_* entry point that forwards input, final_output and stride
     // unchanged to aom_fdct8x8_1_c(). Compiled only when CONFIG_HIGHBITDEPTH is
     // non-zero.
     781           0 : void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
     782             :                             int stride) {
     783           0 :   aom_fdct8x8_1_c(input, final_output, stride);
     784           0 : }
     785             : 
     // aom_highbd_* entry point for the 16x16 forward transform. It does no
     // work of its own: input, output and stride are forwarded unchanged to
     // aom_fdct16x16_c(). Compiled only when CONFIG_HIGHBITDEPTH is non-zero.
     786           0 : void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
     787             :                             int stride) {
     788           0 :   aom_fdct16x16_c(input, output, stride);
     789           0 : }
     790             : 
     // aom_highbd_* entry point that forwards input, output and stride
     // unchanged to aom_fdct16x16_1_c(). Compiled only when CONFIG_HIGHBITDEPTH
     // is non-zero.
     791           0 : void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
     792             :                               int stride) {
     793           0 :   aom_fdct16x16_1_c(input, output, stride);
     794           0 : }
     795             : 
     // aom_highbd_* entry point for the 32x32 forward transform. It does no
     // work of its own: input, out and stride are forwarded unchanged to
     // aom_fdct32x32_c(). Compiled only when CONFIG_HIGHBITDEPTH is non-zero.
     796           0 : void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     797           0 :   aom_fdct32x32_c(input, out, stride);
     798           0 : }
     799             : 
     // aom_highbd_* entry point that forwards input, out and stride unchanged
     // to aom_fdct32x32_rd_c(), the 32x32 variant used by the rate-distortion
     // optimization loop. Compiled only when CONFIG_HIGHBITDEPTH is non-zero.
     800           0 : void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
     801             :                                int stride) {
     802           0 :   aom_fdct32x32_rd_c(input, out, stride);
     803           0 : }
     804             : 
     // aom_highbd_* entry point that forwards input, out and stride unchanged
     // to aom_fdct32x32_1_c(), which writes only out[0]. Compiled only when
     // CONFIG_HIGHBITDEPTH is non-zero.
     805           0 : void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
     806             :                               int stride) {
     807           0 :   aom_fdct32x32_1_c(input, out, stride);
     808           0 : }
     809             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13