LCOV - output.info - media/libvpx/libvpx/vpx_dsp/fwd

LCOV - code coverage report

Current view:	top level - media/libvpx/libvpx/vpx_dsp - fwd_txfm.c (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	569	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	12	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include <assert.h>
      12             : #include "./vpx_dsp_rtcd.h"
      13             : #include "vpx_dsp/fwd_txfm.h"
      14             : 
      15           0 : void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
                         // Forward 4x4 transform. Reads a 4x4 block of samples from `input`
                         // (vertically adjacent samples are `stride` elements apart) and
                         // writes the 16 results to output[0..15].
                         //
      16             :   // The 2D transform is done with two passes which are actually pretty
      17             :   // similar. In the first one, we transform the columns and transpose
      18             :   // the results. In the second one, we transform the rows. To achieve that,
      19             :   // as the first pass results are transposed, we transpose the columns (that
      20             :   // is the transposed rows) and transpose the results (so that it goes back
      21             :   // in normal/row positions).
      22             :   int pass;
      23             :   // We need an intermediate buffer between passes.
      24             :   tran_low_t intermediate[4 * 4];
      25           0 :   const tran_low_t *in_low = NULL;
      26           0 :   tran_low_t *out = intermediate;
      27             :   // Do the two transform/transpose passes
      28           0 :   for (pass = 0; pass < 2; ++pass) {
      29             :     tran_high_t in_high[4];    // canbe16
      30             :     tran_high_t step[4];       // canbe16
      31             :     tran_high_t temp1, temp2;  // needs32
      32             :     int i;
      33           0 :     for (i = 0; i < 4; ++i) {
      34             :       // Load inputs.
      35           0 :       if (pass == 0) {
                               // First pass: read column i of the source block, scaled by 16.
      36           0 :         in_high[0] = input[0 * stride] * 16;
      37           0 :         in_high[1] = input[1 * stride] * 16;
      38           0 :         in_high[2] = input[2 * stride] * 16;
      39           0 :         in_high[3] = input[3 * stride] * 16;
                               // Only the very first sample (column 0, row 0) is bumped by one,
                               // and only when it is non-zero.
                               // NOTE(review): the reason is not visible in this file --
                               // presumably a rounding bias; confirm before changing.
      40           0 :         if (i == 0 && in_high[0]) {
      41           0 :           ++in_high[0];
      42             :         }
      43             :       } else {
                               // Second pass: read column i of the 4x4 intermediate buffer
                               // (elements 4 apart), then step to the next column.
      44           0 :         assert(in_low != NULL);
      45           0 :         in_high[0] = in_low[0 * 4];
      46           0 :         in_high[1] = in_low[1 * 4];
      47           0 :         in_high[2] = in_low[2 * 4];
      48           0 :         in_high[3] = in_low[3 * 4];
      49           0 :         ++in_low;
      50             :       }
      51             :       // Transform.
      52           0 :       step[0] = in_high[0] + in_high[3];
      53           0 :       step[1] = in_high[1] + in_high[2];
      54           0 :       step[2] = in_high[1] - in_high[2];
      55           0 :       step[3] = in_high[0] - in_high[3];
      56           0 :       temp1 = (step[0] + step[1]) * cospi_16_64;
      57           0 :       temp2 = (step[0] - step[1]) * cospi_16_64;
      58           0 :       out[0] = (tran_low_t)fdct_round_shift(temp1);
      59           0 :       out[2] = (tran_low_t)fdct_round_shift(temp2);
      60           0 :       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
      61           0 :       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
      62           0 :       out[1] = (tran_low_t)fdct_round_shift(temp1);
      63           0 :       out[3] = (tran_low_t)fdct_round_shift(temp2);
      64             :       // Do next column (which is a transposed row in second/horizontal pass)
                             // `input` is advanced in both passes but only dereferenced in pass 0.
                             // The four results of column i are stored contiguously, which is
                             // what transposes the data between passes.
      65           0 :       ++input;
      66           0 :       out += 4;
      67             :     }
      68             :     // Setup in/out for next pass.
      69           0 :     in_low = intermediate;
      70           0 :     out = output;
      71             :   }
      72             : 
                         // Final scaling: replace every coefficient x by (x + 1) >> 2.
      73             :   {
      74             :     int i, j;
      75           0 :     for (i = 0; i < 4; ++i) {
      76           0 :       for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
      77             :     }
      78             :   }
      79           0 : }
      80             : 
      81           0 : void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
                         // Sums the 16 samples of the 4x4 block at `input` (rows `stride`
                         // elements apart) and stores twice that sum in output[0]. No other
                         // element of `output` is written.
      82             :   int r, c;
      83           0 :   tran_low_t sum = 0;
      84           0 :   for (r = 0; r < 4; ++r)
      85           0 :     for (c = 0; c < 4; ++c) sum += input[r * stride + c];
      86             : 
                         // NOTE(review): `input` is signed, so `sum` can be negative, and
                         // left-shifting a negative value is undefined behaviour in C.
                         // `sum * 2` gives the same result without the UB.
      87           0 :   output[0] = sum << 1;
      88           0 : }
      89             : 
      90           0 : void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
                         // Forward 8x8 transform. Reads an 8x8 block of samples from `input`
                         // (vertically adjacent samples are `stride` elements apart) and
                         // writes the 64 results to final_output[0..63].
      91             :   int i, j;
      92             :   tran_low_t intermediate[64];
      93             :   int pass;
      94           0 :   tran_low_t *output = intermediate;
      95           0 :   const tran_low_t *in = NULL;
      96             : 
      97             :   // Transform columns
                         // Pass 0 reads columns of `input` and writes to `intermediate`;
                         // pass 1 reads columns of `intermediate` and writes to `final_output`.
                         // Each iteration stores its 8 results contiguously, so the data is
                         // transposed by each pass.
      98           0 :   for (pass = 0; pass < 2; ++pass) {
      99             :     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
     100             :     tran_high_t t0, t1, t2, t3;                  // needs32
     101             :     tran_high_t x0, x1, x2, x3;                  // canbe16
     102             : 
     103           0 :     for (i = 0; i < 8; i++) {
     104             :       // stage 1
     105           0 :       if (pass == 0) {
                               // First pass: butterflies on source samples, scaled by 4.
     106           0 :         s0 = (input[0 * stride] + input[7 * stride]) * 4;
     107           0 :         s1 = (input[1 * stride] + input[6 * stride]) * 4;
     108           0 :         s2 = (input[2 * stride] + input[5 * stride]) * 4;
     109           0 :         s3 = (input[3 * stride] + input[4 * stride]) * 4;
     110           0 :         s4 = (input[3 * stride] - input[4 * stride]) * 4;
     111           0 :         s5 = (input[2 * stride] - input[5 * stride]) * 4;
     112           0 :         s6 = (input[1 * stride] - input[6 * stride]) * 4;
     113           0 :         s7 = (input[0 * stride] - input[7 * stride]) * 4;
     114           0 :         ++input;
     115             :       } else {
                               // Second pass: same butterflies on the intermediate buffer
                               // (elements 8 apart), without extra scaling. `in` was set to
                               // `intermediate` at the end of pass 0.
     116           0 :         s0 = in[0 * 8] + in[7 * 8];
     117           0 :         s1 = in[1 * 8] + in[6 * 8];
     118           0 :         s2 = in[2 * 8] + in[5 * 8];
     119           0 :         s3 = in[3 * 8] + in[4 * 8];
     120           0 :         s4 = in[3 * 8] - in[4 * 8];
     121           0 :         s5 = in[2 * 8] - in[5 * 8];
     122           0 :         s6 = in[1 * 8] - in[6 * 8];
     123           0 :         s7 = in[0 * 8] - in[7 * 8];
     124           0 :         ++in;
     125             :       }
     126             : 
     127             :       // fdct4(step, step);
                             // s0..s3 produce the even-indexed outputs (0, 2, 4, 6).
     128           0 :       x0 = s0 + s3;
     129           0 :       x1 = s1 + s2;
     130           0 :       x2 = s1 - s2;
     131           0 :       x3 = s0 - s3;
     132           0 :       t0 = (x0 + x1) * cospi_16_64;
     133           0 :       t1 = (x0 - x1) * cospi_16_64;
     134           0 :       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
     135           0 :       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
     136           0 :       output[0] = (tran_low_t)fdct_round_shift(t0);
     137           0 :       output[2] = (tran_low_t)fdct_round_shift(t2);
     138           0 :       output[4] = (tran_low_t)fdct_round_shift(t1);
     139           0 :       output[6] = (tran_low_t)fdct_round_shift(t3);
     140             : 
     141             :       // Stage 2
                             // s4..s7 produce the odd-indexed outputs (1, 3, 5, 7).
     142           0 :       t0 = (s6 - s5) * cospi_16_64;
     143           0 :       t1 = (s6 + s5) * cospi_16_64;
     144           0 :       t2 = fdct_round_shift(t0);
     145           0 :       t3 = fdct_round_shift(t1);
     146             : 
     147             :       // Stage 3
     148           0 :       x0 = s4 + t2;
     149           0 :       x1 = s4 - t2;
     150           0 :       x2 = s7 - t3;
     151           0 :       x3 = s7 + t3;
     152             : 
     153             :       // Stage 4
     154           0 :       t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
     155           0 :       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
     156           0 :       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     157           0 :       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
     158           0 :       output[1] = (tran_low_t)fdct_round_shift(t0);
     159           0 :       output[3] = (tran_low_t)fdct_round_shift(t2);
     160           0 :       output[5] = (tran_low_t)fdct_round_shift(t1);
     161           0 :       output[7] = (tran_low_t)fdct_round_shift(t3);
     162           0 :       output += 8;
     163             :     }
     164           0 :     in = intermediate;
     165           0 :     output = final_output;
     166             :   }
     167             : 
     168             :   // Final scaling: halve every one of the 64 coefficients.
                         // Assuming tran_low_t is an integer type, `/= 2` truncates toward zero.
     169           0 :   for (i = 0; i < 8; ++i) {
     170           0 :     for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
     171             :   }
     172           0 : }
     173             : 
     174           0 : void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
                         // Sums the 64 samples of the 8x8 block at `input` (rows `stride`
                         // elements apart) and stores the unscaled sum in output[0]. No other
                         // element of `output` is written.
     175             :   int r, c;
     176           0 :   tran_low_t sum = 0;
     177           0 :   for (r = 0; r < 8; ++r)
     178           0 :     for (c = 0; c < 8; ++c) sum += input[r * stride + c];
     179             : 
     180           0 :   output[0] = sum;
     181           0 : }
     182             : 
     183           0 : void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
                         // Forward 16x16 transform. Reads a 16x16 block of samples from
                         // `input` (vertically adjacent samples are `stride` elements apart)
                         // and writes the 256 results to output[0..255].
                         //
     184             :   // The 2D transform is done with two passes which are actually pretty
     185             :   // similar. In the first one, we transform the columns and transpose
     186             :   // the results. In the second one, we transform the rows. To achieve that,
     187             :   // as the first pass results are transposed, we transpose the columns (that
     188             :   // is the transposed rows) and transpose the results (so that it goes back
     189             :   // in normal/row positions).
     190             :   int pass;
     191             :   // We need an intermediate buffer between passes.
     192             :   tran_low_t intermediate[256];
     193           0 :   const tran_low_t *in_low = NULL;
     194           0 :   tran_low_t *out = intermediate;
     195             :   // Do the two transform/transpose passes
     196           0 :   for (pass = 0; pass < 2; ++pass) {
     197             :     tran_high_t step1[8];      // canbe16
     198             :     tran_high_t step2[8];      // canbe16
     199             :     tran_high_t step3[8];      // canbe16
     200             :     tran_high_t in_high[8];    // canbe16
     201             :     tran_high_t temp1, temp2;  // needs32
     202             :     int i;
     203           0 :     for (i = 0; i < 16; i++) {
     204           0 :       if (0 == pass) {
                               // First pass: sums (in_high) and differences (step1) of mirrored
                               // source samples from column i, each scaled by 4.
     205             :         // Calculate input for the first 8 results.
     206           0 :         in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
     207           0 :         in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
     208           0 :         in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
     209           0 :         in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
     210           0 :         in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
     211           0 :         in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
     212           0 :         in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
     213           0 :         in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
     214             :         // Calculate input for the next 8 results.
     215           0 :         step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
     216           0 :         step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
     217           0 :         step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
     218           0 :         step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
     219           0 :         step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
     220           0 :         step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
     221           0 :         step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
     222           0 :         step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
     223             :       } else {
                               // Second pass: same sums and differences taken from column i of
                               // the intermediate buffer (elements 16 apart). Every value is
                               // first reduced with (x + 1) >> 2 as it is loaded.
     224             :         // Calculate input for the first 8 results.
     225           0 :         assert(in_low != NULL);
     226           0 :         in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
     227           0 :         in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
     228           0 :         in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
     229           0 :         in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
     230           0 :         in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
     231           0 :         in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
     232           0 :         in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
     233           0 :         in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
     234             :         // Calculate input for the next 8 results.
     235           0 :         step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
     236           0 :         step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
     237           0 :         step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
     238           0 :         step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
     239           0 :         step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
     240           0 :         step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
     241           0 :         step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
     242           0 :         step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
     243           0 :         in_low++;
     244             :       }
     245             :       // Work on the first eight values; fdct8(input, even_results);
                             // Results of this part land in the even-indexed slots of `out`.
     246             :       {
     247             :         tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
     248             :         tran_high_t t0, t1, t2, t3;                  // needs32
     249             :         tran_high_t x0, x1, x2, x3;                  // canbe16
     250             : 
     251             :         // stage 1
     252           0 :         s0 = in_high[0] + in_high[7];
     253           0 :         s1 = in_high[1] + in_high[6];
     254           0 :         s2 = in_high[2] + in_high[5];
     255           0 :         s3 = in_high[3] + in_high[4];
     256           0 :         s4 = in_high[3] - in_high[4];
     257           0 :         s5 = in_high[2] - in_high[5];
     258           0 :         s6 = in_high[1] - in_high[6];
     259           0 :         s7 = in_high[0] - in_high[7];
     260             : 
     261             :         // fdct4(step, step);
     262           0 :         x0 = s0 + s3;
     263           0 :         x1 = s1 + s2;
     264           0 :         x2 = s1 - s2;
     265           0 :         x3 = s0 - s3;
     266           0 :         t0 = (x0 + x1) * cospi_16_64;
     267           0 :         t1 = (x0 - x1) * cospi_16_64;
     268           0 :         t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
     269           0 :         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
     270           0 :         out[0] = (tran_low_t)fdct_round_shift(t0);
     271           0 :         out[4] = (tran_low_t)fdct_round_shift(t2);
     272           0 :         out[8] = (tran_low_t)fdct_round_shift(t1);
     273           0 :         out[12] = (tran_low_t)fdct_round_shift(t3);
     274             : 
     275             :         // Stage 2
     276           0 :         t0 = (s6 - s5) * cospi_16_64;
     277           0 :         t1 = (s6 + s5) * cospi_16_64;
     278           0 :         t2 = fdct_round_shift(t0);
     279           0 :         t3 = fdct_round_shift(t1);
     280             : 
     281             :         // Stage 3
     282           0 :         x0 = s4 + t2;
     283           0 :         x1 = s4 - t2;
     284           0 :         x2 = s7 - t3;
     285           0 :         x3 = s7 + t3;
     286             : 
     287             :         // Stage 4
     288           0 :         t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
     289           0 :         t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
     290           0 :         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     291           0 :         t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
     292           0 :         out[2] = (tran_low_t)fdct_round_shift(t0);
     293           0 :         out[6] = (tran_low_t)fdct_round_shift(t2);
     294           0 :         out[10] = (tran_low_t)fdct_round_shift(t1);
     295           0 :         out[14] = (tran_low_t)fdct_round_shift(t3);
     296             :       }
     297             :       // Work on the next eight values; step1 -> odd_results
                             // Results of this part land in the odd-indexed slots of `out`.
                             // step1 is reused: it holds the inputs on entry and is overwritten
                             // in step 5 with the values consumed by step 6.
     298             :       {
     299             :         // step 2
     300           0 :         temp1 = (step1[5] - step1[2]) * cospi_16_64;
     301           0 :         temp2 = (step1[4] - step1[3]) * cospi_16_64;
     302           0 :         step2[2] = fdct_round_shift(temp1);
     303           0 :         step2[3] = fdct_round_shift(temp2);
     304           0 :         temp1 = (step1[4] + step1[3]) * cospi_16_64;
     305           0 :         temp2 = (step1[5] + step1[2]) * cospi_16_64;
     306           0 :         step2[4] = fdct_round_shift(temp1);
     307           0 :         step2[5] = fdct_round_shift(temp2);
     308             :         // step 3
     309           0 :         step3[0] = step1[0] + step2[3];
     310           0 :         step3[1] = step1[1] + step2[2];
     311           0 :         step3[2] = step1[1] - step2[2];
     312           0 :         step3[3] = step1[0] - step2[3];
     313           0 :         step3[4] = step1[7] - step2[4];
     314           0 :         step3[5] = step1[6] - step2[5];
     315           0 :         step3[6] = step1[6] + step2[5];
     316           0 :         step3[7] = step1[7] + step2[4];
     317             :         // step 4
     318           0 :         temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
     319           0 :         temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
     320           0 :         step2[1] = fdct_round_shift(temp1);
     321           0 :         step2[2] = fdct_round_shift(temp2);
     322           0 :         temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
     323           0 :         temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
     324           0 :         step2[5] = fdct_round_shift(temp1);
     325           0 :         step2[6] = fdct_round_shift(temp2);
     326             :         // step 5
     327           0 :         step1[0] = step3[0] + step2[1];
     328           0 :         step1[1] = step3[0] - step2[1];
     329           0 :         step1[2] = step3[3] + step2[2];
     330           0 :         step1[3] = step3[3] - step2[2];
     331           0 :         step1[4] = step3[4] - step2[5];
     332           0 :         step1[5] = step3[4] + step2[5];
     333           0 :         step1[6] = step3[7] - step2[6];
     334           0 :         step1[7] = step3[7] + step2[6];
     335             :         // step 6
     336           0 :         temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
     337           0 :         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
     338           0 :         out[1] = (tran_low_t)fdct_round_shift(temp1);
     339           0 :         out[9] = (tran_low_t)fdct_round_shift(temp2);
     340           0 :         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
     341           0 :         temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
     342           0 :         out[5] = (tran_low_t)fdct_round_shift(temp1);
     343           0 :         out[13] = (tran_low_t)fdct_round_shift(temp2);
     344           0 :         temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
     345           0 :         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
     346           0 :         out[3] = (tran_low_t)fdct_round_shift(temp1);
     347           0 :         out[11] = (tran_low_t)fdct_round_shift(temp2);
     348           0 :         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
     349           0 :         temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
     350           0 :         out[7] = (tran_low_t)fdct_round_shift(temp1);
     351           0 :         out[15] = (tran_low_t)fdct_round_shift(temp2);
     352             :       }
     353             :       // Do next column (which is a transposed row in second/horizontal pass)
                             // `input` is advanced in both passes but only dereferenced in pass 0.
                             // The 16 results of column i are stored contiguously, which is what
                             // transposes the data between passes.
     354           0 :       input++;
     355           0 :       out += 16;
     356             :     }
     357             :     // Setup in/out for next pass.
     358           0 :     in_low = intermediate;
     359           0 :     out = output;
     360             :   }
     361           0 : }
     362             : 
     363           0 : void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
                         // Sums the 256 samples of the 16x16 block at `input` (rows `stride`
                         // elements apart) and stores the sum shifted right by one in
                         // output[0]. No other element of `output` is written.
     364             :   int r, c;
                         // The accumulator is a plain int here, not tran_low_t as in the 4x4
                         // and 8x8 variants.
     365           0 :   int sum = 0;
     366           0 :   for (r = 0; r < 16; ++r)
     367           0 :     for (c = 0; c < 16; ++c) sum += input[r * stride + c];
     368             : 
                         // NOTE(review): right-shifting a negative int is implementation-
                         // defined in C; this presumably relies on an arithmetic shift.
     369           0 :   output[0] = (tran_low_t)(sum >> 1);
     370           0 : }
     371             : 
     372           0 : static INLINE tran_high_t dct_32_round(tran_high_t input) {
                         // Returns ROUND_POWER_OF_TWO(input, DCT_CONST_BITS). Both the macro
                         // and the constant are defined outside this file. The range check
                         // below is currently disabled.
     373           0 :   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
     374             :   // TODO(debargha, peter.derivaz): Find new bounds for this assert,
     375             :   // and make the bounds consts.
     376             :   // assert(-131072 <= rv && rv <= 131071);
     377           0 :   return rv;
     378             : }
     379             : 
     380           0 : static INLINE tran_high_t half_round_shift(tran_high_t input) {
                         // Shifts `input` right by 2 after adding a bias: +1 for non-negative
                         // inputs and +2 for negative ones, since (input < 0) evaluates to 1.
     381           0 :   tran_high_t rv = (input + 1 + (input < 0)) >> 2;
     382           0 :   return rv;
     383             : }
     384             : 
     385           0 : void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
     386             :   tran_high_t step[32];
     387             :   // Stage 1
     388           0 :   step[0] = input[0] + input[(32 - 1)];
     389           0 :   step[1] = input[1] + input[(32 - 2)];
     390           0 :   step[2] = input[2] + input[(32 - 3)];
     391           0 :   step[3] = input[3] + input[(32 - 4)];
     392           0 :   step[4] = input[4] + input[(32 - 5)];
     393           0 :   step[5] = input[5] + input[(32 - 6)];
     394           0 :   step[6] = input[6] + input[(32 - 7)];
     395           0 :   step[7] = input[7] + input[(32 - 8)];
     396           0 :   step[8] = input[8] + input[(32 - 9)];
     397           0 :   step[9] = input[9] + input[(32 - 10)];
     398           0 :   step[10] = input[10] + input[(32 - 11)];
     399           0 :   step[11] = input[11] + input[(32 - 12)];
     400           0 :   step[12] = input[12] + input[(32 - 13)];
     401           0 :   step[13] = input[13] + input[(32 - 14)];
     402           0 :   step[14] = input[14] + input[(32 - 15)];
     403           0 :   step[15] = input[15] + input[(32 - 16)];
     404           0 :   step[16] = -input[16] + input[(32 - 17)];
     405           0 :   step[17] = -input[17] + input[(32 - 18)];
     406           0 :   step[18] = -input[18] + input[(32 - 19)];
     407           0 :   step[19] = -input[19] + input[(32 - 20)];
     408           0 :   step[20] = -input[20] + input[(32 - 21)];
     409           0 :   step[21] = -input[21] + input[(32 - 22)];
     410           0 :   step[22] = -input[22] + input[(32 - 23)];
     411           0 :   step[23] = -input[23] + input[(32 - 24)];
     412           0 :   step[24] = -input[24] + input[(32 - 25)];
     413           0 :   step[25] = -input[25] + input[(32 - 26)];
     414           0 :   step[26] = -input[26] + input[(32 - 27)];
     415           0 :   step[27] = -input[27] + input[(32 - 28)];
     416           0 :   step[28] = -input[28] + input[(32 - 29)];
     417           0 :   step[29] = -input[29] + input[(32 - 30)];
     418           0 :   step[30] = -input[30] + input[(32 - 31)];
     419           0 :   step[31] = -input[31] + input[(32 - 32)];
     420             : 
     421             :   // Stage 2
     422           0 :   output[0] = step[0] + step[16 - 1];
     423           0 :   output[1] = step[1] + step[16 - 2];
     424           0 :   output[2] = step[2] + step[16 - 3];
     425           0 :   output[3] = step[3] + step[16 - 4];
     426           0 :   output[4] = step[4] + step[16 - 5];
     427           0 :   output[5] = step[5] + step[16 - 6];
     428           0 :   output[6] = step[6] + step[16 - 7];
     429           0 :   output[7] = step[7] + step[16 - 8];
     430           0 :   output[8] = -step[8] + step[16 - 9];
     431           0 :   output[9] = -step[9] + step[16 - 10];
     432           0 :   output[10] = -step[10] + step[16 - 11];
     433           0 :   output[11] = -step[11] + step[16 - 12];
     434           0 :   output[12] = -step[12] + step[16 - 13];
     435           0 :   output[13] = -step[13] + step[16 - 14];
     436           0 :   output[14] = -step[14] + step[16 - 15];
     437           0 :   output[15] = -step[15] + step[16 - 16];
     438             : 
     439           0 :   output[16] = step[16];
     440           0 :   output[17] = step[17];
     441           0 :   output[18] = step[18];
     442           0 :   output[19] = step[19];
     443             : 
     444           0 :   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
     445           0 :   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
     446           0 :   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
     447           0 :   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
     448             : 
     449           0 :   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
     450           0 :   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
     451           0 :   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
     452           0 :   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
     453             : 
     454           0 :   output[28] = step[28];
     455           0 :   output[29] = step[29];
     456           0 :   output[30] = step[30];
     457           0 :   output[31] = step[31];
     458             : 
     459             :   // Reduce the magnitude by a factor of 4 (half_round_shift shifts right by
     460             :   // 2), hence the intermediate values are within the range of 16 bits.
     461           0 :   if (round) {
     462           0 :     output[0] = half_round_shift(output[0]);
     463           0 :     output[1] = half_round_shift(output[1]);
     464           0 :     output[2] = half_round_shift(output[2]);
     465           0 :     output[3] = half_round_shift(output[3]);
     466           0 :     output[4] = half_round_shift(output[4]);
     467           0 :     output[5] = half_round_shift(output[5]);
     468           0 :     output[6] = half_round_shift(output[6]);
     469           0 :     output[7] = half_round_shift(output[7]);
     470           0 :     output[8] = half_round_shift(output[8]);
     471           0 :     output[9] = half_round_shift(output[9]);
     472           0 :     output[10] = half_round_shift(output[10]);
     473           0 :     output[11] = half_round_shift(output[11]);
     474           0 :     output[12] = half_round_shift(output[12]);
     475           0 :     output[13] = half_round_shift(output[13]);
     476           0 :     output[14] = half_round_shift(output[14]);
     477           0 :     output[15] = half_round_shift(output[15]);
     478             : 
     479           0 :     output[16] = half_round_shift(output[16]);
     480           0 :     output[17] = half_round_shift(output[17]);
     481           0 :     output[18] = half_round_shift(output[18]);
     482           0 :     output[19] = half_round_shift(output[19]);
     483           0 :     output[20] = half_round_shift(output[20]);
     484           0 :     output[21] = half_round_shift(output[21]);
     485           0 :     output[22] = half_round_shift(output[22]);
     486           0 :     output[23] = half_round_shift(output[23]);
     487           0 :     output[24] = half_round_shift(output[24]);
     488           0 :     output[25] = half_round_shift(output[25]);
     489           0 :     output[26] = half_round_shift(output[26]);
     490           0 :     output[27] = half_round_shift(output[27]);
     491           0 :     output[28] = half_round_shift(output[28]);
     492           0 :     output[29] = half_round_shift(output[29]);
     493           0 :     output[30] = half_round_shift(output[30]);
     494           0 :     output[31] = half_round_shift(output[31]);
     495             :   }
     496             : 
     497             :   // Stage 3
     498           0 :   step[0] = output[0] + output[(8 - 1)];
     499           0 :   step[1] = output[1] + output[(8 - 2)];
     500           0 :   step[2] = output[2] + output[(8 - 3)];
     501           0 :   step[3] = output[3] + output[(8 - 4)];
     502           0 :   step[4] = -output[4] + output[(8 - 5)];
     503           0 :   step[5] = -output[5] + output[(8 - 6)];
     504           0 :   step[6] = -output[6] + output[(8 - 7)];
     505           0 :   step[7] = -output[7] + output[(8 - 8)];
     506           0 :   step[8] = output[8];
     507           0 :   step[9] = output[9];
     508           0 :   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
     509           0 :   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
     510           0 :   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
     511           0 :   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
     512           0 :   step[14] = output[14];
     513           0 :   step[15] = output[15];
     514             : 
     515           0 :   step[16] = output[16] + output[23];
     516           0 :   step[17] = output[17] + output[22];
     517           0 :   step[18] = output[18] + output[21];
     518           0 :   step[19] = output[19] + output[20];
     519           0 :   step[20] = -output[20] + output[19];
     520           0 :   step[21] = -output[21] + output[18];
     521           0 :   step[22] = -output[22] + output[17];
     522           0 :   step[23] = -output[23] + output[16];
     523           0 :   step[24] = -output[24] + output[31];
     524           0 :   step[25] = -output[25] + output[30];
     525           0 :   step[26] = -output[26] + output[29];
     526           0 :   step[27] = -output[27] + output[28];
     527           0 :   step[28] = output[28] + output[27];
     528           0 :   step[29] = output[29] + output[26];
     529           0 :   step[30] = output[30] + output[25];
     530           0 :   step[31] = output[31] + output[24];
     531             : 
     532             :   // Stage 4
     533           0 :   output[0] = step[0] + step[3];
     534           0 :   output[1] = step[1] + step[2];
     535           0 :   output[2] = -step[2] + step[1];
     536           0 :   output[3] = -step[3] + step[0];
     537           0 :   output[4] = step[4];
     538           0 :   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
     539           0 :   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
     540           0 :   output[7] = step[7];
     541           0 :   output[8] = step[8] + step[11];
     542           0 :   output[9] = step[9] + step[10];
     543           0 :   output[10] = -step[10] + step[9];
     544           0 :   output[11] = -step[11] + step[8];
     545           0 :   output[12] = -step[12] + step[15];
     546           0 :   output[13] = -step[13] + step[14];
     547           0 :   output[14] = step[14] + step[13];
     548           0 :   output[15] = step[15] + step[12];
     549             : 
     550           0 :   output[16] = step[16];
     551           0 :   output[17] = step[17];
     552           0 :   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
     553           0 :   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
     554           0 :   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
     555           0 :   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
     556           0 :   output[22] = step[22];
     557           0 :   output[23] = step[23];
     558           0 :   output[24] = step[24];
     559           0 :   output[25] = step[25];
     560           0 :   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
     561           0 :   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
     562           0 :   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
     563           0 :   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
     564           0 :   output[30] = step[30];
     565           0 :   output[31] = step[31];
     566             : 
     567             :   // Stage 5
     568           0 :   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
     569           0 :   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
     570           0 :   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
     571           0 :   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
     572           0 :   step[4] = output[4] + output[5];
     573           0 :   step[5] = -output[5] + output[4];
     574           0 :   step[6] = -output[6] + output[7];
     575           0 :   step[7] = output[7] + output[6];
     576           0 :   step[8] = output[8];
     577           0 :   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
     578           0 :   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
     579           0 :   step[11] = output[11];
     580           0 :   step[12] = output[12];
     581           0 :   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
     582           0 :   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
     583           0 :   step[15] = output[15];
     584             : 
     585           0 :   step[16] = output[16] + output[19];
     586           0 :   step[17] = output[17] + output[18];
     587           0 :   step[18] = -output[18] + output[17];
     588           0 :   step[19] = -output[19] + output[16];
     589           0 :   step[20] = -output[20] + output[23];
     590           0 :   step[21] = -output[21] + output[22];
     591           0 :   step[22] = output[22] + output[21];
     592           0 :   step[23] = output[23] + output[20];
     593           0 :   step[24] = output[24] + output[27];
     594           0 :   step[25] = output[25] + output[26];
     595           0 :   step[26] = -output[26] + output[25];
     596           0 :   step[27] = -output[27] + output[24];
     597           0 :   step[28] = -output[28] + output[31];
     598           0 :   step[29] = -output[29] + output[30];
     599           0 :   step[30] = output[30] + output[29];
     600           0 :   step[31] = output[31] + output[28];
     601             : 
     602             :   // Stage 6
     603           0 :   output[0] = step[0];
     604           0 :   output[1] = step[1];
     605           0 :   output[2] = step[2];
     606           0 :   output[3] = step[3];
     607           0 :   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
     608           0 :   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
     609           0 :   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
     610           0 :   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
     611           0 :   output[8] = step[8] + step[9];
     612           0 :   output[9] = -step[9] + step[8];
     613           0 :   output[10] = -step[10] + step[11];
     614           0 :   output[11] = step[11] + step[10];
     615           0 :   output[12] = step[12] + step[13];
     616           0 :   output[13] = -step[13] + step[12];
     617           0 :   output[14] = -step[14] + step[15];
     618           0 :   output[15] = step[15] + step[14];
     619             : 
     620           0 :   output[16] = step[16];
     621           0 :   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
     622           0 :   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
     623           0 :   output[19] = step[19];
     624           0 :   output[20] = step[20];
     625           0 :   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
     626           0 :   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
     627           0 :   output[23] = step[23];
     628           0 :   output[24] = step[24];
     629           0 :   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
     630           0 :   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
     631           0 :   output[27] = step[27];
     632           0 :   output[28] = step[28];
     633           0 :   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
     634           0 :   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
     635           0 :   output[31] = step[31];
     636             : 
     637             :   // Stage 7
     638           0 :   step[0] = output[0];
     639           0 :   step[1] = output[1];
     640           0 :   step[2] = output[2];
     641           0 :   step[3] = output[3];
     642           0 :   step[4] = output[4];
     643           0 :   step[5] = output[5];
     644           0 :   step[6] = output[6];
     645           0 :   step[7] = output[7];
     646           0 :   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
     647           0 :   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
     648           0 :   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
     649           0 :   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
     650           0 :   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
     651           0 :   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
     652           0 :   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
     653           0 :   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
     654             : 
     655           0 :   step[16] = output[16] + output[17];
     656           0 :   step[17] = -output[17] + output[16];
     657           0 :   step[18] = -output[18] + output[19];
     658           0 :   step[19] = output[19] + output[18];
     659           0 :   step[20] = output[20] + output[21];
     660           0 :   step[21] = -output[21] + output[20];
     661           0 :   step[22] = -output[22] + output[23];
     662           0 :   step[23] = output[23] + output[22];
     663           0 :   step[24] = output[24] + output[25];
     664           0 :   step[25] = -output[25] + output[24];
     665           0 :   step[26] = -output[26] + output[27];
     666           0 :   step[27] = output[27] + output[26];
     667           0 :   step[28] = output[28] + output[29];
     668           0 :   step[29] = -output[29] + output[28];
     669           0 :   step[30] = -output[30] + output[31];
     670           0 :   step[31] = output[31] + output[30];
     671             : 
     672             :   // Final stage --- outputs indices are bit-reversed.
     673           0 :   output[0] = step[0];
     674           0 :   output[16] = step[1];
     675           0 :   output[8] = step[2];
     676           0 :   output[24] = step[3];
     677           0 :   output[4] = step[4];
     678           0 :   output[20] = step[5];
     679           0 :   output[12] = step[6];
     680           0 :   output[28] = step[7];
     681           0 :   output[2] = step[8];
     682           0 :   output[18] = step[9];
     683           0 :   output[10] = step[10];
     684           0 :   output[26] = step[11];
     685           0 :   output[6] = step[12];
     686           0 :   output[22] = step[13];
     687           0 :   output[14] = step[14];
     688           0 :   output[30] = step[15];
     689             : 
     690           0 :   output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
     691           0 :   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
     692           0 :   output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
     693           0 :   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
     694           0 :   output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
     695           0 :   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
     696           0 :   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
     697           0 :   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
     698           0 :   output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
     699           0 :   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
     700           0 :   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
     701           0 :   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
     702           0 :   output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
     703           0 :   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
     704           0 :   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
     705           0 :   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
     706           0 : }
     707             : // 32x32 forward transform in two vpx_fdct32 passes: columns of the strided input first, then rows of the intermediate; writes 32 * 32 values to out.
     708           0 : void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     709             :   int i, j;
     710             :   tran_high_t output[32 * 32];  // Intermediate between the passes, indexed [row * 32 + column].
     711             : 
     712             :   // Columns: column i is gathered from input[j * stride + i] for j = 0..31.
     713           0 :   for (i = 0; i < 32; ++i) {
     714             :     tran_high_t temp_in[32], temp_out[32];
     715           0 :     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;  // Samples are scaled by 4 before the transform.
     716           0 :     vpx_fdct32(temp_in, temp_out, 0);  // NOTE(review): meaning of the third argument is defined in vpx_fdct32, outside this view.
     717           0 :     for (j = 0; j < 32; ++j)
     718           0 :       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;  // Bias is 2 for positive values, 1 otherwise, then shift right by 2.
     719             :   }
     720             : 
     721             :   // Rows: row i is read from output[i * 32 + j] and its result lands in out[i * 32 + j].
     722           0 :   for (i = 0; i < 32; ++i) {
     723             :     tran_high_t temp_in[32], temp_out[32];
     724           0 :     for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
     725           0 :     vpx_fdct32(temp_in, temp_out, 0);
     726           0 :     for (j = 0; j < 32; ++j)
     727           0 :       out[j + i * 32] =
     728           0 :           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);  // Here the bias is 2 for negative values, 1 otherwise.
     729             :   }
     730           0 : }
     731             : 
     732             : // Note: although the dct32 computation flow uses dct_32_round, this 2-D
     733             : // fdct32x32 variant, used in the rate-distortion optimization loop, operates
     734             : // within 16-bit precision.
     735           0 : void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
     736             :   int i, j;
     737             :   tran_high_t output[32 * 32];  // Intermediate between the passes, indexed [row * 32 + column].
     738             : 
     739             :   // Columns: same gather, scale-by-4 and biased shift as in vpx_fdct32x32_c.
     740           0 :   for (i = 0; i < 32; ++i) {
     741             :     tran_high_t temp_in[32], temp_out[32];
     742           0 :     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     743           0 :     vpx_fdct32(temp_in, temp_out, 0);
     744           0 :     for (j = 0; j < 32; ++j)
     745             :       // TODO(cd): see quality impact of only doing
     746             :       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
     747             :       //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
     748           0 :       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;  // Bias is 2 for positive values, 1 otherwise, then shift right by 2.
     749             :   }
     750             : 
     751             :   // Rows: unlike vpx_fdct32x32_c, vpx_fdct32 is called with 1 and its result is stored without a further shift.
     752           0 :   for (i = 0; i < 32; ++i) {
     753             :     tran_high_t temp_in[32], temp_out[32];
     754           0 :     for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
     755           0 :     vpx_fdct32(temp_in, temp_out, 1);  // NOTE(review): presumably enables the rounding step inside vpx_fdct32; confirm against its definition.
     756           0 :     for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
     757             :   }
     758           0 : }
     759             : // Sums all 32 * 32 input samples (row pitch = stride) and stores the sum shifted right by 3 in output[0]; no other element of output is written.
     760           0 : void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
     761             :   int r, c;
     762           0 :   int sum = 0;  // Accumulates 1024 int16_t samples in an int.
     763           0 :   for (r = 0; r < 32; ++r)
     764           0 :     for (c = 0; c < 32; ++c) sum += input[r * stride + c];
     765             : 
     766           0 :   output[0] = (tran_low_t)(sum >> 3);
     767           0 : }
     768             : 
     769             : #if CONFIG_VP9_HIGHBITDEPTH  // Each vpx_highbd_fdct*_c below forwards its arguments unchanged to the vpx_fdct*_c function of the same size.
     770             : void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
     771             :                           int stride) {
     772             :   vpx_fdct4x4_c(input, output, stride);
     773             : }
     774             : 
     775             : void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
     776             :                           int stride) {
     777             :   vpx_fdct8x8_c(input, final_output, stride);
     778             : }
     779             : 
     780             : void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
     781             :                             int stride) {
     782             :   vpx_fdct8x8_1_c(input, final_output, stride);
     783             : }
     784             : 
     785             : void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
     786             :                             int stride) {
     787             :   vpx_fdct16x16_c(input, output, stride);
     788             : }
     789             : 
     790             : void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
     791             :                               int stride) {
     792             :   vpx_fdct16x16_1_c(input, output, stride);
     793             : }
     794             : 
     795             : void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     796             :   vpx_fdct32x32_c(input, out, stride);
     797             : }
     798             : 
     799             : void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
     800             :                                int stride) {
     801             :   vpx_fdct32x32_rd_c(input, out, stride);
     802             : }
     803             : 
     804             : void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
     805             :                               int stride) {
     806             :   vpx_fdct32x32_1_c(input, out, stride);
     807             : }
     808             : #endif  // CONFIG_VP9_HIGHBITDEPTH

Generated by: LCOV version 1.13