LCOV - code coverage report
Current view: top level - third_party/aom/av1/common/x86 - av1_fwd_txfm1d_sse4.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 662 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 3 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : #include "av1/common/x86/av1_txfm1d_sse4.h"
       2             : 
       3           0 : void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
       4             :                            const int8_t *cos_bit, const int8_t *stage_range) {
       5           0 :   const int txfm_size = 32;
       6           0 :   const int num_per_128 = 4;
       7             :   const int32_t *cospi;
       8             :   __m128i buf0[32];
       9             :   __m128i buf1[32];
      10           0 :   int col_num = txfm_size / num_per_128;
      11             :   int bit;
      12             :   int col;
      13             :   (void)stage_range;
      14           0 :   for (col = 0; col < col_num; col++) {
      15             :     // stage 0;
      16           0 :     int32_t stage_idx = 0;
      17             :     int j;
      18           0 :     for (j = 0; j < 32; ++j) {
      19           0 :       buf0[j] = input[j * col_num + col];
      20             :     }
      21             : 
      22             :     // stage 1
      23           0 :     stage_idx++;
      24           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
      25           0 :     buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
      26           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
      27           0 :     buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
      28           0 :     buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
      29           0 :     buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
      30           0 :     buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
      31           0 :     buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
      32           0 :     buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
      33           0 :     buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
      34           0 :     buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
      35           0 :     buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
      36           0 :     buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
      37           0 :     buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
      38           0 :     buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
      39           0 :     buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
      40           0 :     buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
      41           0 :     buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
      42           0 :     buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
      43           0 :     buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
      44           0 :     buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
      45           0 :     buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
      46           0 :     buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
      47           0 :     buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
      48           0 :     buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
      49           0 :     buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
      50           0 :     buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
      51           0 :     buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
      52           0 :     buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
      53           0 :     buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
      54           0 :     buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
      55           0 :     buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
      56             : 
      57             :     // stage 2
      58           0 :     stage_idx++;
      59           0 :     bit = cos_bit[stage_idx];
      60           0 :     cospi = cospi_arr(bit);
      61           0 :     buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
      62           0 :     buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
      63           0 :     buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
      64           0 :     buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
      65           0 :     buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
      66           0 :     buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
      67           0 :     buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
      68           0 :     buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
      69           0 :     buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
      70           0 :     buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
      71           0 :     buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
      72           0 :     buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
      73           0 :     buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
      74           0 :     buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
      75           0 :     buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
      76           0 :     buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
      77           0 :     buf0[16] = buf1[16];
      78           0 :     buf0[17] = buf1[17];
      79           0 :     buf0[18] = buf1[18];
      80           0 :     buf0[19] = buf1[19];
      81           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
      82             :                         buf0[27], bit);
      83           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
      84             :                         buf0[26], bit);
      85           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
      86             :                         buf0[25], bit);
      87           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
      88             :                         buf0[24], bit);
      89           0 :     buf0[28] = buf1[28];
      90           0 :     buf0[29] = buf1[29];
      91           0 :     buf0[30] = buf1[30];
      92           0 :     buf0[31] = buf1[31];
      93             : 
      94             :     // stage 3
      95           0 :     stage_idx++;
      96           0 :     bit = cos_bit[stage_idx];
      97           0 :     cospi = cospi_arr(bit);
      98           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
      99           0 :     buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
     100           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
     101           0 :     buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
     102           0 :     buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
     103           0 :     buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
     104           0 :     buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
     105           0 :     buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
     106           0 :     buf1[8] = buf0[8];
     107           0 :     buf1[9] = buf0[9];
     108           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
     109             :                         buf1[13], bit);
     110           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
     111             :                         buf1[12], bit);
     112           0 :     buf1[14] = buf0[14];
     113           0 :     buf1[15] = buf0[15];
     114           0 :     buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
     115           0 :     buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
     116           0 :     buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
     117           0 :     buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
     118           0 :     buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
     119           0 :     buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
     120           0 :     buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
     121           0 :     buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
     122           0 :     buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
     123           0 :     buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
     124           0 :     buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
     125           0 :     buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
     126           0 :     buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
     127           0 :     buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
     128           0 :     buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
     129           0 :     buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
     130             : 
     131             :     // stage 4
     132           0 :     stage_idx++;
     133           0 :     bit = cos_bit[stage_idx];
     134           0 :     cospi = cospi_arr(bit);
     135           0 :     buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
     136           0 :     buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
     137           0 :     buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
     138           0 :     buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
     139           0 :     buf0[4] = buf1[4];
     140           0 :     btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
     141             :                         buf0[6], bit);
     142           0 :     buf0[7] = buf1[7];
     143           0 :     buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
     144           0 :     buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
     145           0 :     buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
     146           0 :     buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
     147           0 :     buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
     148           0 :     buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
     149           0 :     buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
     150           0 :     buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
     151           0 :     buf0[16] = buf1[16];
     152           0 :     buf0[17] = buf1[17];
     153           0 :     btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
     154             :                         buf0[29], bit);
     155           0 :     btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
     156             :                         buf0[28], bit);
     157           0 :     btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
     158             :                         buf0[27], bit);
     159           0 :     btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
     160             :                         buf0[26], bit);
     161           0 :     buf0[22] = buf1[22];
     162           0 :     buf0[23] = buf1[23];
     163           0 :     buf0[24] = buf1[24];
     164           0 :     buf0[25] = buf1[25];
     165           0 :     buf0[30] = buf1[30];
     166           0 :     buf0[31] = buf1[31];
     167             : 
     168             :     // stage 5
     169           0 :     stage_idx++;
     170           0 :     bit = cos_bit[stage_idx];
     171           0 :     cospi = cospi_arr(bit);
     172           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
     173             :                         buf1[1], bit);
     174           0 :     btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
     175             :                         buf1[3], bit);
     176           0 :     buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
     177           0 :     buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
     178           0 :     buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
     179           0 :     buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
     180           0 :     buf1[8] = buf0[8];
     181           0 :     btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
     182             :                         buf1[14], bit);
     183           0 :     btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
     184             :                         buf1[13], bit);
     185           0 :     buf1[11] = buf0[11];
     186           0 :     buf1[12] = buf0[12];
     187           0 :     buf1[15] = buf0[15];
     188           0 :     buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
     189           0 :     buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
     190           0 :     buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
     191           0 :     buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
     192           0 :     buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
     193           0 :     buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
     194           0 :     buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
     195           0 :     buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
     196           0 :     buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
     197           0 :     buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
     198           0 :     buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
     199           0 :     buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
     200           0 :     buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
     201           0 :     buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
     202           0 :     buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
     203           0 :     buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
     204             : 
     205             :     // stage 6
     206           0 :     stage_idx++;
     207           0 :     bit = cos_bit[stage_idx];
     208           0 :     cospi = cospi_arr(bit);
     209           0 :     buf0[0] = buf1[0];
     210           0 :     buf0[1] = buf1[1];
     211           0 :     buf0[2] = buf1[2];
     212           0 :     buf0[3] = buf1[3];
     213           0 :     btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
     214             :                         bit);
     215           0 :     btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
     216             :                         buf0[6], bit);
     217           0 :     buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
     218           0 :     buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
     219           0 :     buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
     220           0 :     buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
     221           0 :     buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
     222           0 :     buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
     223           0 :     buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
     224           0 :     buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
     225           0 :     buf0[16] = buf1[16];
     226           0 :     btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
     227             :                         buf0[30], bit);
     228           0 :     btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
     229             :                         buf0[29], bit);
     230           0 :     buf0[19] = buf1[19];
     231           0 :     buf0[20] = buf1[20];
     232           0 :     btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
     233             :                         buf0[26], bit);
     234           0 :     btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
     235             :                         buf0[25], bit);
     236           0 :     buf0[23] = buf1[23];
     237           0 :     buf0[24] = buf1[24];
     238           0 :     buf0[27] = buf1[27];
     239           0 :     buf0[28] = buf1[28];
     240           0 :     buf0[31] = buf1[31];
     241             : 
     242             :     // stage 7
     243           0 :     stage_idx++;
     244           0 :     bit = cos_bit[stage_idx];
     245           0 :     cospi = cospi_arr(bit);
     246           0 :     buf1[0] = buf0[0];
     247           0 :     buf1[1] = buf0[1];
     248           0 :     buf1[2] = buf0[2];
     249           0 :     buf1[3] = buf0[3];
     250           0 :     buf1[4] = buf0[4];
     251           0 :     buf1[5] = buf0[5];
     252           0 :     buf1[6] = buf0[6];
     253           0 :     buf1[7] = buf0[7];
     254           0 :     btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
     255             :                         buf1[15], bit);
     256           0 :     btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
     257             :                         buf1[14], bit);
     258           0 :     btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
     259             :                         buf1[13], bit);
     260           0 :     btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
     261             :                         buf1[12], bit);
     262           0 :     buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
     263           0 :     buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
     264           0 :     buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
     265           0 :     buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
     266           0 :     buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
     267           0 :     buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
     268           0 :     buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
     269           0 :     buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
     270           0 :     buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
     271           0 :     buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
     272           0 :     buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
     273           0 :     buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
     274           0 :     buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
     275           0 :     buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
     276           0 :     buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
     277           0 :     buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
     278             : 
     279             :     // stage 8
     280           0 :     stage_idx++;
     281           0 :     bit = cos_bit[stage_idx];
     282           0 :     cospi = cospi_arr(bit);
     283           0 :     buf0[0] = buf1[0];
     284           0 :     buf0[1] = buf1[1];
     285           0 :     buf0[2] = buf1[2];
     286           0 :     buf0[3] = buf1[3];
     287           0 :     buf0[4] = buf1[4];
     288           0 :     buf0[5] = buf1[5];
     289           0 :     buf0[6] = buf1[6];
     290           0 :     buf0[7] = buf1[7];
     291           0 :     buf0[8] = buf1[8];
     292           0 :     buf0[9] = buf1[9];
     293           0 :     buf0[10] = buf1[10];
     294           0 :     buf0[11] = buf1[11];
     295           0 :     buf0[12] = buf1[12];
     296           0 :     buf0[13] = buf1[13];
     297           0 :     buf0[14] = buf1[14];
     298           0 :     buf0[15] = buf1[15];
     299           0 :     btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
     300             :                         buf0[31], bit);
     301           0 :     btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
     302             :                         buf0[30], bit);
     303           0 :     btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
     304             :                         buf0[29], bit);
     305           0 :     btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
     306             :                         buf0[28], bit);
     307           0 :     btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
     308             :                         buf0[27], bit);
     309           0 :     btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
     310             :                         buf0[26], bit);
     311           0 :     btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
     312             :                         buf0[25], bit);
     313           0 :     btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
     314             :                         buf0[24], bit);
     315             : 
     316             :     // stage 9
     317           0 :     stage_idx++;
     318           0 :     buf1[0] = buf0[0];
     319           0 :     buf1[1] = buf0[16];
     320           0 :     buf1[2] = buf0[8];
     321           0 :     buf1[3] = buf0[24];
     322           0 :     buf1[4] = buf0[4];
     323           0 :     buf1[5] = buf0[20];
     324           0 :     buf1[6] = buf0[12];
     325           0 :     buf1[7] = buf0[28];
     326           0 :     buf1[8] = buf0[2];
     327           0 :     buf1[9] = buf0[18];
     328           0 :     buf1[10] = buf0[10];
     329           0 :     buf1[11] = buf0[26];
     330           0 :     buf1[12] = buf0[6];
     331           0 :     buf1[13] = buf0[22];
     332           0 :     buf1[14] = buf0[14];
     333           0 :     buf1[15] = buf0[30];
     334           0 :     buf1[16] = buf0[1];
     335           0 :     buf1[17] = buf0[17];
     336           0 :     buf1[18] = buf0[9];
     337           0 :     buf1[19] = buf0[25];
     338           0 :     buf1[20] = buf0[5];
     339           0 :     buf1[21] = buf0[21];
     340           0 :     buf1[22] = buf0[13];
     341           0 :     buf1[23] = buf0[29];
     342           0 :     buf1[24] = buf0[3];
     343           0 :     buf1[25] = buf0[19];
     344           0 :     buf1[26] = buf0[11];
     345           0 :     buf1[27] = buf0[27];
     346           0 :     buf1[28] = buf0[7];
     347           0 :     buf1[29] = buf0[23];
     348           0 :     buf1[30] = buf0[15];
     349           0 :     buf1[31] = buf0[31];
     350             : 
     351           0 :     for (j = 0; j < 32; ++j) {
     352           0 :       output[j * col_num + col] = buf1[j];
     353             :     }
     354             :   }
     355           0 : }
     356             : 
     357           0 : void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
     358             :                            const int8_t *cos_bit, const int8_t *stage_range) {
     359           0 :   const int txfm_size = 4;
     360           0 :   const int num_per_128 = 4;
     361             :   const int32_t *cospi;
     362             :   __m128i buf0[4];
     363             :   __m128i buf1[4];
     364           0 :   int col_num = txfm_size / num_per_128;
     365             :   int bit;
     366             :   int col;
     367             :   (void)stage_range;
     368           0 :   for (col = 0; col < col_num; col++) {
     369             :     // stage 0;
     370           0 :     int32_t stage_idx = 0;
     371             :     int j;
     372           0 :     for (j = 0; j < 4; ++j) {
     373           0 :       buf0[j] = input[j * col_num + col];
     374             :     }
     375             : 
     376             :     // stage 1
     377           0 :     stage_idx++;
     378           0 :     buf1[0] = buf0[3];
     379           0 :     buf1[1] = buf0[0];
     380           0 :     buf1[2] = buf0[1];
     381           0 :     buf1[3] = buf0[2];
     382             : 
     383             :     // stage 2
     384           0 :     stage_idx++;
     385           0 :     bit = cos_bit[stage_idx];
     386           0 :     cospi = cospi_arr(bit);
     387           0 :     btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
     388             :                         bit);
     389           0 :     btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
     390             :                         buf0[3], bit);
     391             : 
     392             :     // stage 3
     393           0 :     stage_idx++;
     394           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
     395           0 :     buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
     396           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
     397           0 :     buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
     398             : 
     399             :     // stage 4
     400           0 :     stage_idx++;
     401           0 :     bit = cos_bit[stage_idx];
     402           0 :     cospi = cospi_arr(bit);
     403           0 :     buf0[0] = buf1[0];
     404           0 :     buf0[1] = buf1[1];
     405           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
     406             :                         buf0[3], bit);
     407             : 
     408             :     // stage 5
     409           0 :     stage_idx++;
     410           0 :     buf1[0] = buf0[0];
     411           0 :     buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
     412           0 :     buf1[2] = buf0[3];
     413           0 :     buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
     414             : 
     415           0 :     for (j = 0; j < 4; ++j) {
     416           0 :       output[j * col_num + col] = buf1[j];
     417             :     }
     418             :   }
     419           0 : }
     420             : 
     421           0 : void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
     422             :                             const int8_t *cos_bit, const int8_t *stage_range) {
     423           0 :   const int txfm_size = 32;
     424           0 :   const int num_per_128 = 4;
     425             :   const int32_t *cospi;
     426             :   __m128i buf0[32];
     427             :   __m128i buf1[32];
     428           0 :   int col_num = txfm_size / num_per_128;
     429             :   int bit;
     430             :   int col;
     431             :   (void)stage_range;
     432           0 :   for (col = 0; col < col_num; col++) {
     433             :     // stage 0;
     434           0 :     int32_t stage_idx = 0;
     435             :     int j;
     436           0 :     for (j = 0; j < 32; ++j) {
     437           0 :       buf0[j] = input[j * col_num + col];
     438             :     }
     439             : 
     440             :     // stage 1
     441           0 :     stage_idx++;
     442           0 :     buf1[0] = buf0[31];
     443           0 :     buf1[1] = buf0[0];
     444           0 :     buf1[2] = buf0[29];
     445           0 :     buf1[3] = buf0[2];
     446           0 :     buf1[4] = buf0[27];
     447           0 :     buf1[5] = buf0[4];
     448           0 :     buf1[6] = buf0[25];
     449           0 :     buf1[7] = buf0[6];
     450           0 :     buf1[8] = buf0[23];
     451           0 :     buf1[9] = buf0[8];
     452           0 :     buf1[10] = buf0[21];
     453           0 :     buf1[11] = buf0[10];
     454           0 :     buf1[12] = buf0[19];
     455           0 :     buf1[13] = buf0[12];
     456           0 :     buf1[14] = buf0[17];
     457           0 :     buf1[15] = buf0[14];
     458           0 :     buf1[16] = buf0[15];
     459           0 :     buf1[17] = buf0[16];
     460           0 :     buf1[18] = buf0[13];
     461           0 :     buf1[19] = buf0[18];
     462           0 :     buf1[20] = buf0[11];
     463           0 :     buf1[21] = buf0[20];
     464           0 :     buf1[22] = buf0[9];
     465           0 :     buf1[23] = buf0[22];
     466           0 :     buf1[24] = buf0[7];
     467           0 :     buf1[25] = buf0[24];
     468           0 :     buf1[26] = buf0[5];
     469           0 :     buf1[27] = buf0[26];
     470           0 :     buf1[28] = buf0[3];
     471           0 :     buf1[29] = buf0[28];
     472           0 :     buf1[30] = buf0[1];
     473           0 :     buf1[31] = buf0[30];
     474             : 
     475             :     // stage 2
     476           0 :     stage_idx++;
     477           0 :     bit = cos_bit[stage_idx];
     478           0 :     cospi = cospi_arr(bit);
     479           0 :     btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
     480             :                         bit);
     481           0 :     btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
     482             :                         bit);
     483           0 :     btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
     484             :                         bit);
     485           0 :     btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
     486             :                         buf0[7], bit);
     487           0 :     btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
     488             :                         buf0[9], bit);
     489           0 :     btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
     490             :                         buf0[11], bit);
     491           0 :     btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
     492             :                         buf0[13], bit);
     493           0 :     btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
     494             :                         buf0[15], bit);
     495           0 :     btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
     496             :                         buf0[17], bit);
     497           0 :     btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
     498             :                         buf0[19], bit);
     499           0 :     btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
     500             :                         buf0[21], bit);
     501           0 :     btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
     502             :                         buf0[23], bit);
     503           0 :     btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
     504             :                         buf0[25], bit);
     505           0 :     btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
     506             :                         buf0[27], bit);
     507           0 :     btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
     508             :                         buf0[29], bit);
     509           0 :     btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
     510             :                         buf0[31], bit);
     511             : 
     512             :     // stage 3
     513           0 :     stage_idx++;
     514           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
     515           0 :     buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
     516           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
     517           0 :     buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
     518           0 :     buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
     519           0 :     buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
     520           0 :     buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
     521           0 :     buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
     522           0 :     buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
     523           0 :     buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
     524           0 :     buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
     525           0 :     buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
     526           0 :     buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
     527           0 :     buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
     528           0 :     buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
     529           0 :     buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
     530           0 :     buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
     531           0 :     buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
     532           0 :     buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
     533           0 :     buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
     534           0 :     buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
     535           0 :     buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
     536           0 :     buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
     537           0 :     buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
     538           0 :     buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
     539           0 :     buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
     540           0 :     buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
     541           0 :     buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
     542           0 :     buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
     543           0 :     buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
     544           0 :     buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
     545           0 :     buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
     546             : 
     547             :     // stage 4
     548           0 :     stage_idx++;
     549           0 :     bit = cos_bit[stage_idx];
     550           0 :     cospi = cospi_arr(bit);
     551           0 :     buf0[0] = buf1[0];
     552           0 :     buf0[1] = buf1[1];
     553           0 :     buf0[2] = buf1[2];
     554           0 :     buf0[3] = buf1[3];
     555           0 :     buf0[4] = buf1[4];
     556           0 :     buf0[5] = buf1[5];
     557           0 :     buf0[6] = buf1[6];
     558           0 :     buf0[7] = buf1[7];
     559           0 :     buf0[8] = buf1[8];
     560           0 :     buf0[9] = buf1[9];
     561           0 :     buf0[10] = buf1[10];
     562           0 :     buf0[11] = buf1[11];
     563           0 :     buf0[12] = buf1[12];
     564           0 :     buf0[13] = buf1[13];
     565           0 :     buf0[14] = buf1[14];
     566           0 :     buf0[15] = buf1[15];
     567           0 :     btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
     568             :                         buf0[17], bit);
     569           0 :     btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
     570             :                         buf0[19], bit);
     571           0 :     btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
     572             :                         buf0[21], bit);
     573           0 :     btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
     574             :                         buf0[23], bit);
     575           0 :     btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
     576             :                         buf0[25], bit);
     577           0 :     btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
     578             :                         buf0[27], bit);
     579           0 :     btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
     580             :                         buf0[29], bit);
     581           0 :     btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
     582             :                         buf0[31], bit);
     583             : 
     584             :     // stage 5
     585           0 :     stage_idx++;
     586           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
     587           0 :     buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
     588           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
     589           0 :     buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
     590           0 :     buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
     591           0 :     buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
     592           0 :     buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
     593           0 :     buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
     594           0 :     buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
     595           0 :     buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
     596           0 :     buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
     597           0 :     buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
     598           0 :     buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
     599           0 :     buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
     600           0 :     buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
     601           0 :     buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
     602           0 :     buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
     603           0 :     buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
     604           0 :     buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
     605           0 :     buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
     606           0 :     buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
     607           0 :     buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
     608           0 :     buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
     609           0 :     buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
     610           0 :     buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
     611           0 :     buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
     612           0 :     buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
     613           0 :     buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
     614           0 :     buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
     615           0 :     buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
     616           0 :     buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
     617           0 :     buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
     618             : 
     619             :     // stage 6
     620           0 :     stage_idx++;
     621           0 :     bit = cos_bit[stage_idx];
     622           0 :     cospi = cospi_arr(bit);
     623           0 :     buf0[0] = buf1[0];
     624           0 :     buf0[1] = buf1[1];
     625           0 :     buf0[2] = buf1[2];
     626           0 :     buf0[3] = buf1[3];
     627           0 :     buf0[4] = buf1[4];
     628           0 :     buf0[5] = buf1[5];
     629           0 :     buf0[6] = buf1[6];
     630           0 :     buf0[7] = buf1[7];
     631           0 :     btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
     632             :                         bit);
     633           0 :     btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
     634             :                         buf0[11], bit);
     635           0 :     btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
     636             :                         buf0[13], bit);
     637           0 :     btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
     638             :                         buf0[15], bit);
     639           0 :     buf0[16] = buf1[16];
     640           0 :     buf0[17] = buf1[17];
     641           0 :     buf0[18] = buf1[18];
     642           0 :     buf0[19] = buf1[19];
     643           0 :     buf0[20] = buf1[20];
     644           0 :     buf0[21] = buf1[21];
     645           0 :     buf0[22] = buf1[22];
     646           0 :     buf0[23] = buf1[23];
     647           0 :     btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
     648             :                         buf0[25], bit);
     649           0 :     btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
     650             :                         buf0[27], bit);
     651           0 :     btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
     652             :                         buf0[29], bit);
     653           0 :     btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
     654             :                         buf0[31], bit);
     655             : 
     656             :     // stage 7
     657           0 :     stage_idx++;
     658           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
     659           0 :     buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
     660           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
     661           0 :     buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
     662           0 :     buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
     663           0 :     buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
     664           0 :     buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
     665           0 :     buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
     666           0 :     buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
     667           0 :     buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
     668           0 :     buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
     669           0 :     buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
     670           0 :     buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
     671           0 :     buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
     672           0 :     buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
     673           0 :     buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
     674           0 :     buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
     675           0 :     buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
     676           0 :     buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
     677           0 :     buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
     678           0 :     buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
     679           0 :     buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
     680           0 :     buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
     681           0 :     buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
     682           0 :     buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
     683           0 :     buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
     684           0 :     buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
     685           0 :     buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
     686           0 :     buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
     687           0 :     buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
     688           0 :     buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
     689           0 :     buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
     690             : 
     691             :     // stage 8
     692           0 :     stage_idx++;
     693           0 :     bit = cos_bit[stage_idx];
     694           0 :     cospi = cospi_arr(bit);
     695           0 :     buf0[0] = buf1[0];
     696           0 :     buf0[1] = buf1[1];
     697           0 :     buf0[2] = buf1[2];
     698           0 :     buf0[3] = buf1[3];
     699           0 :     btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
     700             :                         buf0[5], bit);
     701           0 :     btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
     702             :                         buf0[7], bit);
     703           0 :     buf0[8] = buf1[8];
     704           0 :     buf0[9] = buf1[9];
     705           0 :     buf0[10] = buf1[10];
     706           0 :     buf0[11] = buf1[11];
     707           0 :     btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
     708             :                         buf0[13], bit);
     709           0 :     btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
     710             :                         buf0[15], bit);
     711           0 :     buf0[16] = buf1[16];
     712           0 :     buf0[17] = buf1[17];
     713           0 :     buf0[18] = buf1[18];
     714           0 :     buf0[19] = buf1[19];
     715           0 :     btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
     716             :                         buf0[21], bit);
     717           0 :     btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
     718             :                         buf0[23], bit);
     719           0 :     buf0[24] = buf1[24];
     720           0 :     buf0[25] = buf1[25];
     721           0 :     buf0[26] = buf1[26];
     722           0 :     buf0[27] = buf1[27];
     723           0 :     btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
     724             :                         buf0[29], bit);
     725           0 :     btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
     726             :                         buf0[31], bit);
     727             : 
     728             :     // stage 9
     729           0 :     stage_idx++;
     730           0 :     buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
     731           0 :     buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
     732           0 :     buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
     733           0 :     buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
     734           0 :     buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
     735           0 :     buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
     736           0 :     buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
     737           0 :     buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
     738           0 :     buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
     739           0 :     buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
     740           0 :     buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
     741           0 :     buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
     742           0 :     buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
     743           0 :     buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
     744           0 :     buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
     745           0 :     buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
     746           0 :     buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
     747           0 :     buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
     748           0 :     buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
     749           0 :     buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
     750           0 :     buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
     751           0 :     buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
     752           0 :     buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
     753           0 :     buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
     754           0 :     buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
     755           0 :     buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
     756           0 :     buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
     757           0 :     buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
     758           0 :     buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
     759           0 :     buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
     760           0 :     buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
     761           0 :     buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
     762             : 
     763             :     // stage 10
     764           0 :     stage_idx++;
     765           0 :     bit = cos_bit[stage_idx];
     766           0 :     cospi = cospi_arr(bit);
     767           0 :     buf0[0] = buf1[0];
     768           0 :     buf0[1] = buf1[1];
     769           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
     770             :                         buf0[3], bit);
     771           0 :     buf0[4] = buf1[4];
     772           0 :     buf0[5] = buf1[5];
     773           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
     774             :                         buf0[7], bit);
     775           0 :     buf0[8] = buf1[8];
     776           0 :     buf0[9] = buf1[9];
     777           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
     778             :                         buf0[11], bit);
     779           0 :     buf0[12] = buf1[12];
     780           0 :     buf0[13] = buf1[13];
     781           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
     782             :                         buf0[15], bit);
     783           0 :     buf0[16] = buf1[16];
     784           0 :     buf0[17] = buf1[17];
     785           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
     786             :                         buf0[19], bit);
     787           0 :     buf0[20] = buf1[20];
     788           0 :     buf0[21] = buf1[21];
     789           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
     790             :                         buf0[23], bit);
     791           0 :     buf0[24] = buf1[24];
     792           0 :     buf0[25] = buf1[25];
     793           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
     794             :                         buf0[27], bit);
     795           0 :     buf0[28] = buf1[28];
     796           0 :     buf0[29] = buf1[29];
     797           0 :     btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
     798             :                         buf0[31], bit);
     799             : 
     800             :     // stage 11
     801           0 :     stage_idx++;
     802           0 :     buf1[0] = buf0[0];
     803           0 :     buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
     804           0 :     buf1[2] = buf0[24];
     805           0 :     buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
     806           0 :     buf1[4] = buf0[12];
     807           0 :     buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
     808           0 :     buf1[6] = buf0[20];
     809           0 :     buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
     810           0 :     buf1[8] = buf0[6];
     811           0 :     buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
     812           0 :     buf1[10] = buf0[30];
     813           0 :     buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
     814           0 :     buf1[12] = buf0[10];
     815           0 :     buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
     816           0 :     buf1[14] = buf0[18];
     817           0 :     buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
     818           0 :     buf1[16] = buf0[3];
     819           0 :     buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
     820           0 :     buf1[18] = buf0[27];
     821           0 :     buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
     822           0 :     buf1[20] = buf0[15];
     823           0 :     buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
     824           0 :     buf1[22] = buf0[23];
     825           0 :     buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
     826           0 :     buf1[24] = buf0[5];
     827           0 :     buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
     828           0 :     buf1[26] = buf0[29];
     829           0 :     buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
     830           0 :     buf1[28] = buf0[9];
     831           0 :     buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
     832           0 :     buf1[30] = buf0[17];
     833           0 :     buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
     834             : 
     835           0 :     for (j = 0; j < 32; ++j) {
     836           0 :       output[j * col_num + col] = buf1[j];
     837             :     }
     838             :   }
     839           0 : }

Generated by: LCOV version 1.13