LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - fwd_dct32_8cols_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 749 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 1 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>  // SSE2
      13             : 
      14             : #include "aom_dsp/fwd_txfm.h"
      15             : #include "aom_dsp/txfm_common.h"
      16             : #include "aom_dsp/x86/txfm_common_sse2.h"
      17             : 
      18             : // Apply a 32-element forward DCT to 8 columns. This does not do any
      19             : // transposition of its output - the caller is expected to do that.
      20             : // The input buffers are the top and bottom halves of an 8x32 block.
      21           0 : void fdct32_8col(__m128i *in0, __m128i *in1) {
      22             :   // Constants
      23             :   //    When we use them, in one case, they are all the same. In all others
      24             :   //    it's a pair of them that we need to repeat four times. This is done
      25             :   //    by constructing the 32 bit constant corresponding to that pair.
      26           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
      27           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
      28           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
      29           0 :   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
      30           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
      31           0 :   const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
      32           0 :   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
      33           0 :   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
      34           0 :   const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
      35           0 :   const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
      36           0 :   const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
      37           0 :   const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
      38           0 :   const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
      39           0 :   const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
      40           0 :   const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
      41           0 :   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
      42           0 :   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
      43           0 :   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
      44           0 :   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
      45           0 :   const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
      46           0 :   const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
      47           0 :   const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
      48           0 :   const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
      49           0 :   const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
      50           0 :   const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
      51           0 :   const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
      52           0 :   const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
      53           0 :   const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
      54           0 :   const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
      55           0 :   const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
      56           0 :   const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
      57           0 :   const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
      58           0 :   const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
      59           0 :   const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
      60           0 :   const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
      61           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
      62             : 
      63             :   __m128i step1[32];
      64             :   __m128i step2[32];
      65             :   __m128i step3[32];
      66             :   __m128i out[32];
      67             :   // Stage 1
      68             :   {
      69           0 :     const __m128i *ina = in0;
      70           0 :     const __m128i *inb = in1 + 15;
      71           0 :     __m128i *step1a = &step1[0];
      72           0 :     __m128i *step1b = &step1[31];
      73           0 :     const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
      74           0 :     const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
      75           0 :     const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
      76           0 :     const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
      77           0 :     const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
      78           0 :     const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
      79           0 :     const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
      80           0 :     const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
      81           0 :     step1a[0] = _mm_add_epi16(ina0, inb0);
      82           0 :     step1a[1] = _mm_add_epi16(ina1, inb1);
      83           0 :     step1a[2] = _mm_add_epi16(ina2, inb2);
      84           0 :     step1a[3] = _mm_add_epi16(ina3, inb3);
      85           0 :     step1b[-3] = _mm_sub_epi16(ina3, inb3);
      86           0 :     step1b[-2] = _mm_sub_epi16(ina2, inb2);
      87           0 :     step1b[-1] = _mm_sub_epi16(ina1, inb1);
      88           0 :     step1b[-0] = _mm_sub_epi16(ina0, inb0);
      89             :   }
      90             :   {
      91           0 :     const __m128i *ina = in0 + 4;
      92           0 :     const __m128i *inb = in1 + 11;
      93           0 :     __m128i *step1a = &step1[4];
      94           0 :     __m128i *step1b = &step1[27];
      95           0 :     const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
      96           0 :     const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
      97           0 :     const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
      98           0 :     const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
      99           0 :     const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
     100           0 :     const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
     101           0 :     const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
     102           0 :     const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
     103           0 :     step1a[0] = _mm_add_epi16(ina0, inb0);
     104           0 :     step1a[1] = _mm_add_epi16(ina1, inb1);
     105           0 :     step1a[2] = _mm_add_epi16(ina2, inb2);
     106           0 :     step1a[3] = _mm_add_epi16(ina3, inb3);
     107           0 :     step1b[-3] = _mm_sub_epi16(ina3, inb3);
     108           0 :     step1b[-2] = _mm_sub_epi16(ina2, inb2);
     109           0 :     step1b[-1] = _mm_sub_epi16(ina1, inb1);
     110           0 :     step1b[-0] = _mm_sub_epi16(ina0, inb0);
     111             :   }
     112             :   {
     113           0 :     const __m128i *ina = in0 + 8;
     114           0 :     const __m128i *inb = in1 + 7;
     115           0 :     __m128i *step1a = &step1[8];
     116           0 :     __m128i *step1b = &step1[23];
     117           0 :     const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
     118           0 :     const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
     119           0 :     const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
     120           0 :     const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
     121           0 :     const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
     122           0 :     const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
     123           0 :     const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
     124           0 :     const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
     125           0 :     step1a[0] = _mm_add_epi16(ina0, inb0);
     126           0 :     step1a[1] = _mm_add_epi16(ina1, inb1);
     127           0 :     step1a[2] = _mm_add_epi16(ina2, inb2);
     128           0 :     step1a[3] = _mm_add_epi16(ina3, inb3);
     129           0 :     step1b[-3] = _mm_sub_epi16(ina3, inb3);
     130           0 :     step1b[-2] = _mm_sub_epi16(ina2, inb2);
     131           0 :     step1b[-1] = _mm_sub_epi16(ina1, inb1);
     132           0 :     step1b[-0] = _mm_sub_epi16(ina0, inb0);
     133             :   }
     134             :   {
     135           0 :     const __m128i *ina = in0 + 12;
     136           0 :     const __m128i *inb = in1 + 3;
     137           0 :     __m128i *step1a = &step1[12];
     138           0 :     __m128i *step1b = &step1[19];
     139           0 :     const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
     140           0 :     const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
     141           0 :     const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
     142           0 :     const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
     143           0 :     const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
     144           0 :     const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
     145           0 :     const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
     146           0 :     const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
     147           0 :     step1a[0] = _mm_add_epi16(ina0, inb0);
     148           0 :     step1a[1] = _mm_add_epi16(ina1, inb1);
     149           0 :     step1a[2] = _mm_add_epi16(ina2, inb2);
     150           0 :     step1a[3] = _mm_add_epi16(ina3, inb3);
     151           0 :     step1b[-3] = _mm_sub_epi16(ina3, inb3);
     152           0 :     step1b[-2] = _mm_sub_epi16(ina2, inb2);
     153           0 :     step1b[-1] = _mm_sub_epi16(ina1, inb1);
     154           0 :     step1b[-0] = _mm_sub_epi16(ina0, inb0);
     155             :   }
     156             :   // Stage 2
     157             :   {
     158           0 :     step2[0] = _mm_add_epi16(step1[0], step1[15]);
     159           0 :     step2[1] = _mm_add_epi16(step1[1], step1[14]);
     160           0 :     step2[2] = _mm_add_epi16(step1[2], step1[13]);
     161           0 :     step2[3] = _mm_add_epi16(step1[3], step1[12]);
     162           0 :     step2[4] = _mm_add_epi16(step1[4], step1[11]);
     163           0 :     step2[5] = _mm_add_epi16(step1[5], step1[10]);
     164           0 :     step2[6] = _mm_add_epi16(step1[6], step1[9]);
     165           0 :     step2[7] = _mm_add_epi16(step1[7], step1[8]);
     166           0 :     step2[8] = _mm_sub_epi16(step1[7], step1[8]);
     167           0 :     step2[9] = _mm_sub_epi16(step1[6], step1[9]);
     168           0 :     step2[10] = _mm_sub_epi16(step1[5], step1[10]);
     169           0 :     step2[11] = _mm_sub_epi16(step1[4], step1[11]);
     170           0 :     step2[12] = _mm_sub_epi16(step1[3], step1[12]);
     171           0 :     step2[13] = _mm_sub_epi16(step1[2], step1[13]);
     172           0 :     step2[14] = _mm_sub_epi16(step1[1], step1[14]);
     173           0 :     step2[15] = _mm_sub_epi16(step1[0], step1[15]);
     174             :   }
     175             :   {
     176           0 :     const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
     177           0 :     const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
     178           0 :     const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
     179           0 :     const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
     180           0 :     const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
     181           0 :     const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
     182           0 :     const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
     183           0 :     const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
     184           0 :     const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
     185           0 :     const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
     186           0 :     const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
     187           0 :     const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
     188           0 :     const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
     189           0 :     const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
     190           0 :     const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
     191           0 :     const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
     192           0 :     const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
     193           0 :     const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
     194           0 :     const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
     195           0 :     const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
     196           0 :     const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
     197           0 :     const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
     198           0 :     const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
     199           0 :     const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
     200             :     // dct_const_round_shift
     201           0 :     const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
     202           0 :     const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
     203           0 :     const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
     204           0 :     const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
     205           0 :     const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
     206           0 :     const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
     207           0 :     const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
     208           0 :     const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
     209           0 :     const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
     210           0 :     const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
     211           0 :     const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
     212           0 :     const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
     213           0 :     const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
     214           0 :     const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
     215           0 :     const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
     216           0 :     const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
     217           0 :     const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
     218           0 :     const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
     219           0 :     const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
     220           0 :     const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
     221           0 :     const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
     222           0 :     const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
     223           0 :     const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
     224           0 :     const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
     225           0 :     const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
     226           0 :     const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
     227           0 :     const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
     228           0 :     const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
     229           0 :     const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
     230           0 :     const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
     231           0 :     const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
     232           0 :     const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
     233             :     // Combine
     234           0 :     step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
     235           0 :     step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
     236           0 :     step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
     237           0 :     step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
     238           0 :     step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
     239           0 :     step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
     240           0 :     step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
     241           0 :     step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
     242             :   }
     243             :   // Stage 3
     244             :   {
     245           0 :     step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
     246           0 :     step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
     247           0 :     step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
     248           0 :     step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
     249           0 :     step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
     250           0 :     step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
     251           0 :     step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
     252           0 :     step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
     253             :   }
     254             :   {
     255           0 :     const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
     256           0 :     const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
     257           0 :     const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
     258           0 :     const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
     259           0 :     const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
     260           0 :     const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
     261           0 :     const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
     262           0 :     const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
     263           0 :     const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
     264           0 :     const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
     265           0 :     const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
     266           0 :     const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
     267             :     // dct_const_round_shift
     268           0 :     const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
     269           0 :     const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
     270           0 :     const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
     271           0 :     const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
     272           0 :     const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
     273           0 :     const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
     274           0 :     const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
     275           0 :     const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
     276           0 :     const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
     277           0 :     const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
     278           0 :     const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
     279           0 :     const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
     280           0 :     const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
     281           0 :     const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
     282           0 :     const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
     283           0 :     const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
     284             :     // Combine
     285           0 :     step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
     286           0 :     step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
     287           0 :     step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
     288           0 :     step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
     289             :   }
     290             :   {
     291           0 :     step3[16] = _mm_add_epi16(step2[23], step1[16]);
     292           0 :     step3[17] = _mm_add_epi16(step2[22], step1[17]);
     293           0 :     step3[18] = _mm_add_epi16(step2[21], step1[18]);
     294           0 :     step3[19] = _mm_add_epi16(step2[20], step1[19]);
     295           0 :     step3[20] = _mm_sub_epi16(step1[19], step2[20]);
     296           0 :     step3[21] = _mm_sub_epi16(step1[18], step2[21]);
     297           0 :     step3[22] = _mm_sub_epi16(step1[17], step2[22]);
     298           0 :     step3[23] = _mm_sub_epi16(step1[16], step2[23]);
     299           0 :     step3[24] = _mm_sub_epi16(step1[31], step2[24]);
     300           0 :     step3[25] = _mm_sub_epi16(step1[30], step2[25]);
     301           0 :     step3[26] = _mm_sub_epi16(step1[29], step2[26]);
     302           0 :     step3[27] = _mm_sub_epi16(step1[28], step2[27]);
     303           0 :     step3[28] = _mm_add_epi16(step2[27], step1[28]);
     304           0 :     step3[29] = _mm_add_epi16(step2[26], step1[29]);
     305           0 :     step3[30] = _mm_add_epi16(step2[25], step1[30]);
     306           0 :     step3[31] = _mm_add_epi16(step2[24], step1[31]);
     307             :   }
     308             : 
     309             :   // Stage 4
     310             :   {
     311           0 :     step1[0] = _mm_add_epi16(step3[3], step3[0]);
     312           0 :     step1[1] = _mm_add_epi16(step3[2], step3[1]);
     313           0 :     step1[2] = _mm_sub_epi16(step3[1], step3[2]);
     314           0 :     step1[3] = _mm_sub_epi16(step3[0], step3[3]);
     315           0 :     step1[8] = _mm_add_epi16(step3[11], step2[8]);
     316           0 :     step1[9] = _mm_add_epi16(step3[10], step2[9]);
     317           0 :     step1[10] = _mm_sub_epi16(step2[9], step3[10]);
     318           0 :     step1[11] = _mm_sub_epi16(step2[8], step3[11]);
     319           0 :     step1[12] = _mm_sub_epi16(step2[15], step3[12]);
     320           0 :     step1[13] = _mm_sub_epi16(step2[14], step3[13]);
     321           0 :     step1[14] = _mm_add_epi16(step3[13], step2[14]);
     322           0 :     step1[15] = _mm_add_epi16(step3[12], step2[15]);
     323             :   }
     324             :   {
     325           0 :     const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
     326           0 :     const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
     327           0 :     const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
     328           0 :     const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
     329           0 :     const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
     330           0 :     const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
     331             :     // dct_const_round_shift
     332           0 :     const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
     333           0 :     const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
     334           0 :     const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
     335           0 :     const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
     336           0 :     const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
     337           0 :     const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
     338           0 :     const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
     339           0 :     const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
     340             :     // Combine
     341           0 :     step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
     342           0 :     step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
     343             :   }
     344             :   {
     345           0 :     const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
     346           0 :     const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
     347           0 :     const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
     348           0 :     const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
     349           0 :     const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
     350           0 :     const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
     351           0 :     const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
     352           0 :     const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
     353           0 :     const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
     354           0 :     const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
     355           0 :     const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
     356           0 :     const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
     357           0 :     const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
     358           0 :     const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
     359           0 :     const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
     360           0 :     const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
     361           0 :     const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
     362           0 :     const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
     363           0 :     const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
     364           0 :     const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
     365           0 :     const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
     366           0 :     const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
     367           0 :     const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
     368           0 :     const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
     369             :     // dct_const_round_shift
     370           0 :     const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
     371           0 :     const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
     372           0 :     const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
     373           0 :     const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
     374           0 :     const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
     375           0 :     const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
     376           0 :     const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
     377           0 :     const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
     378           0 :     const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
     379           0 :     const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
     380           0 :     const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
     381           0 :     const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
     382           0 :     const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
     383           0 :     const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
     384           0 :     const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
     385           0 :     const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
     386           0 :     const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
     387           0 :     const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
     388           0 :     const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
     389           0 :     const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
     390           0 :     const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
     391           0 :     const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
     392           0 :     const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
     393           0 :     const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
     394           0 :     const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
     395           0 :     const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
     396           0 :     const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
     397           0 :     const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
     398           0 :     const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
     399           0 :     const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
     400           0 :     const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
     401           0 :     const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
     402             :     // Combine
     403           0 :     step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
     404           0 :     step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
     405           0 :     step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
     406           0 :     step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
     407           0 :     step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
     408           0 :     step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
     409           0 :     step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
     410           0 :     step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
     411             :   }
     412             :   // Stage 5
     413             :   {
     414           0 :     step2[4] = _mm_add_epi16(step1[5], step3[4]);
     415           0 :     step2[5] = _mm_sub_epi16(step3[4], step1[5]);
     416           0 :     step2[6] = _mm_sub_epi16(step3[7], step1[6]);
     417           0 :     step2[7] = _mm_add_epi16(step1[6], step3[7]);
     418             :   }
     419             :   {
     420           0 :     const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
     421           0 :     const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
     422           0 :     const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
     423           0 :     const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
     424           0 :     const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
     425           0 :     const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
     426           0 :     const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
     427           0 :     const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
     428           0 :     const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
     429           0 :     const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
     430           0 :     const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
     431           0 :     const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
     432             :     // dct_const_round_shift
     433           0 :     const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
     434           0 :     const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
     435           0 :     const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
     436           0 :     const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
     437           0 :     const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
     438           0 :     const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
     439           0 :     const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
     440           0 :     const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
     441           0 :     const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
     442           0 :     const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
     443           0 :     const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
     444           0 :     const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
     445           0 :     const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
     446           0 :     const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
     447           0 :     const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
     448           0 :     const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
     449             :     // Combine
     450           0 :     out[0] = _mm_packs_epi32(out_00_6, out_00_7);
     451           0 :     out[16] = _mm_packs_epi32(out_16_6, out_16_7);
     452           0 :     out[8] = _mm_packs_epi32(out_08_6, out_08_7);
     453           0 :     out[24] = _mm_packs_epi32(out_24_6, out_24_7);
     454             :   }
     455             :   {
     456           0 :     const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
     457           0 :     const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
     458           0 :     const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
     459           0 :     const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
     460           0 :     const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
     461           0 :     const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
     462           0 :     const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
     463           0 :     const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
     464           0 :     const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
     465           0 :     const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
     466           0 :     const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
     467           0 :     const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
     468             :     // dct_const_round_shift
     469           0 :     const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
     470           0 :     const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
     471           0 :     const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
     472           0 :     const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
     473           0 :     const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
     474           0 :     const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
     475           0 :     const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
     476           0 :     const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
     477           0 :     const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
     478           0 :     const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
     479           0 :     const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
     480           0 :     const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
     481           0 :     const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
     482           0 :     const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
     483           0 :     const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
     484           0 :     const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
     485             :     // Combine
     486           0 :     step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
     487           0 :     step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
     488           0 :     step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
     489           0 :     step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
     490             :   }
     491             :   {
     492           0 :     step2[16] = _mm_add_epi16(step1[19], step3[16]);
     493           0 :     step2[17] = _mm_add_epi16(step1[18], step3[17]);
     494           0 :     step2[18] = _mm_sub_epi16(step3[17], step1[18]);
     495           0 :     step2[19] = _mm_sub_epi16(step3[16], step1[19]);
     496           0 :     step2[20] = _mm_sub_epi16(step3[23], step1[20]);
     497           0 :     step2[21] = _mm_sub_epi16(step3[22], step1[21]);
     498           0 :     step2[22] = _mm_add_epi16(step1[21], step3[22]);
     499           0 :     step2[23] = _mm_add_epi16(step1[20], step3[23]);
     500           0 :     step2[24] = _mm_add_epi16(step1[27], step3[24]);
     501           0 :     step2[25] = _mm_add_epi16(step1[26], step3[25]);
     502           0 :     step2[26] = _mm_sub_epi16(step3[25], step1[26]);
     503           0 :     step2[27] = _mm_sub_epi16(step3[24], step1[27]);
     504           0 :     step2[28] = _mm_sub_epi16(step3[31], step1[28]);
     505           0 :     step2[29] = _mm_sub_epi16(step3[30], step1[29]);
     506           0 :     step2[30] = _mm_add_epi16(step1[29], step3[30]);
     507           0 :     step2[31] = _mm_add_epi16(step1[28], step3[31]);
     508             :   }
     509             :   // Stage 6
     510             :   {
     511           0 :     const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
     512           0 :     const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
     513           0 :     const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
     514           0 :     const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
     515           0 :     const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
     516           0 :     const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
     517           0 :     const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
     518           0 :     const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
     519           0 :     const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
     520           0 :     const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
     521           0 :     const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
     522           0 :     const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
     523           0 :     const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
     524           0 :     const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
     525           0 :     const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
     526           0 :     const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
     527             :     // dct_const_round_shift
     528           0 :     const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
     529           0 :     const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
     530           0 :     const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
     531           0 :     const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
     532           0 :     const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
     533           0 :     const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
     534           0 :     const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
     535           0 :     const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
     536           0 :     const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
     537           0 :     const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
     538           0 :     const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
     539           0 :     const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
     540           0 :     const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
     541           0 :     const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
     542           0 :     const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
     543           0 :     const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
     544             :     // Combine
     545           0 :     out[4] = _mm_packs_epi32(out_04_6, out_04_7);
     546           0 :     out[20] = _mm_packs_epi32(out_20_6, out_20_7);
     547           0 :     out[12] = _mm_packs_epi32(out_12_6, out_12_7);
     548           0 :     out[28] = _mm_packs_epi32(out_28_6, out_28_7);
     549             :   }
     550             :   {
     551           0 :     step3[8] = _mm_add_epi16(step2[9], step1[8]);
     552           0 :     step3[9] = _mm_sub_epi16(step1[8], step2[9]);
     553           0 :     step3[10] = _mm_sub_epi16(step1[11], step2[10]);
     554           0 :     step3[11] = _mm_add_epi16(step2[10], step1[11]);
     555           0 :     step3[12] = _mm_add_epi16(step2[13], step1[12]);
     556           0 :     step3[13] = _mm_sub_epi16(step1[12], step2[13]);
     557           0 :     step3[14] = _mm_sub_epi16(step1[15], step2[14]);
     558           0 :     step3[15] = _mm_add_epi16(step2[14], step1[15]);
     559             :   }
     560             :   {
     561           0 :     const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
     562           0 :     const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
     563           0 :     const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
     564           0 :     const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
     565           0 :     const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
     566           0 :     const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
     567           0 :     const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
     568           0 :     const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
     569           0 :     const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
     570           0 :     const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
     571           0 :     const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
     572           0 :     const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
     573           0 :     const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
     574           0 :     const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
     575           0 :     const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
     576           0 :     const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
     577           0 :     const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
     578           0 :     const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
     579           0 :     const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
     580           0 :     const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
     581           0 :     const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
     582           0 :     const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
     583           0 :     const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
     584           0 :     const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
     585             :     // dct_const_round_shift
     586           0 :     const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
     587           0 :     const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
     588           0 :     const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
     589           0 :     const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
     590           0 :     const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
     591           0 :     const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
     592           0 :     const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
     593           0 :     const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
     594           0 :     const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
     595           0 :     const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
     596           0 :     const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
     597           0 :     const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
     598           0 :     const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
     599           0 :     const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
     600           0 :     const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
     601           0 :     const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
     602           0 :     const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
     603           0 :     const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
     604           0 :     const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
     605           0 :     const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
     606           0 :     const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
     607           0 :     const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
     608           0 :     const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
     609           0 :     const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
     610           0 :     const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
     611           0 :     const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
     612           0 :     const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
     613           0 :     const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
     614           0 :     const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
     615           0 :     const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
     616           0 :     const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
     617           0 :     const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
     618             :     // Combine
     619           0 :     step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
     620           0 :     step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
     621           0 :     step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
     622           0 :     step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
     623             :     // Combine
     624           0 :     step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
     625           0 :     step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
     626           0 :     step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
     627           0 :     step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
     628             :   }
     629             :   // Stage 7
     630             :   {
     631           0 :     const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
     632           0 :     const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
     633           0 :     const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
     634           0 :     const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
     635           0 :     const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
     636           0 :     const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
     637           0 :     const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
     638           0 :     const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
     639           0 :     const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
     640           0 :     const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
     641           0 :     const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
     642           0 :     const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
     643           0 :     const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
     644           0 :     const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
     645           0 :     const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
     646           0 :     const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
     647           0 :     const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
     648           0 :     const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
     649           0 :     const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
     650           0 :     const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
     651           0 :     const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
     652           0 :     const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
     653           0 :     const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
     654           0 :     const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
     655             :     // dct_const_round_shift
     656           0 :     const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
     657           0 :     const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
     658           0 :     const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
     659           0 :     const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
     660           0 :     const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
     661           0 :     const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
     662           0 :     const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
     663           0 :     const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
     664           0 :     const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
     665           0 :     const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
     666           0 :     const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
     667           0 :     const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
     668           0 :     const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
     669           0 :     const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
     670           0 :     const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
     671           0 :     const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
     672           0 :     const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
     673           0 :     const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
     674           0 :     const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
     675           0 :     const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
     676           0 :     const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
     677           0 :     const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
     678           0 :     const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
     679           0 :     const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
     680           0 :     const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
     681           0 :     const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
     682           0 :     const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
     683           0 :     const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
     684           0 :     const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
     685           0 :     const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
     686           0 :     const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
     687           0 :     const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
     688             :     // Combine
     689           0 :     out[2] = _mm_packs_epi32(out_02_6, out_02_7);
     690           0 :     out[18] = _mm_packs_epi32(out_18_6, out_18_7);
     691           0 :     out[10] = _mm_packs_epi32(out_10_6, out_10_7);
     692           0 :     out[26] = _mm_packs_epi32(out_26_6, out_26_7);
     693           0 :     out[6] = _mm_packs_epi32(out_06_6, out_06_7);
     694           0 :     out[22] = _mm_packs_epi32(out_22_6, out_22_7);
     695           0 :     out[14] = _mm_packs_epi32(out_14_6, out_14_7);
     696           0 :     out[30] = _mm_packs_epi32(out_30_6, out_30_7);
     697             :   }
     698             :   {
     699           0 :     step1[16] = _mm_add_epi16(step3[17], step2[16]);
     700           0 :     step1[17] = _mm_sub_epi16(step2[16], step3[17]);
     701           0 :     step1[18] = _mm_sub_epi16(step2[19], step3[18]);
     702           0 :     step1[19] = _mm_add_epi16(step3[18], step2[19]);
     703           0 :     step1[20] = _mm_add_epi16(step3[21], step2[20]);
     704           0 :     step1[21] = _mm_sub_epi16(step2[20], step3[21]);
     705           0 :     step1[22] = _mm_sub_epi16(step2[23], step3[22]);
     706           0 :     step1[23] = _mm_add_epi16(step3[22], step2[23]);
     707           0 :     step1[24] = _mm_add_epi16(step3[25], step2[24]);
     708           0 :     step1[25] = _mm_sub_epi16(step2[24], step3[25]);
     709           0 :     step1[26] = _mm_sub_epi16(step2[27], step3[26]);
     710           0 :     step1[27] = _mm_add_epi16(step3[26], step2[27]);
     711           0 :     step1[28] = _mm_add_epi16(step3[29], step2[28]);
     712           0 :     step1[29] = _mm_sub_epi16(step2[28], step3[29]);
     713           0 :     step1[30] = _mm_sub_epi16(step2[31], step3[30]);
     714           0 :     step1[31] = _mm_add_epi16(step3[30], step2[31]);
     715             :   }
     716             :   // Final stage --- output indices are bit-reversed.
     717             :   {
     718           0 :     const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
     719           0 :     const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
     720           0 :     const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
     721           0 :     const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
     722           0 :     const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
     723           0 :     const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
     724           0 :     const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
     725           0 :     const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
     726           0 :     const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
     727           0 :     const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
     728           0 :     const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
     729           0 :     const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
     730           0 :     const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
     731           0 :     const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
     732           0 :     const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
     733           0 :     const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
     734           0 :     const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
     735           0 :     const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
     736           0 :     const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
     737           0 :     const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
     738           0 :     const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
     739           0 :     const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
     740           0 :     const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
     741           0 :     const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
     742             :     // dct_const_round_shift
     743           0 :     const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
     744           0 :     const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
     745           0 :     const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
     746           0 :     const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
     747           0 :     const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
     748           0 :     const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
     749           0 :     const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
     750           0 :     const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
     751           0 :     const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
     752           0 :     const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
     753           0 :     const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
     754           0 :     const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
     755           0 :     const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
     756           0 :     const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
     757           0 :     const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
     758           0 :     const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
     759           0 :     const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
     760           0 :     const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
     761           0 :     const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
     762           0 :     const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
     763           0 :     const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
     764           0 :     const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
     765           0 :     const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
     766           0 :     const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
     767           0 :     const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
     768           0 :     const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
     769           0 :     const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
     770           0 :     const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
     771           0 :     const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
     772           0 :     const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
     773           0 :     const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
     774           0 :     const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
     775             :     // Combine: pack 32-bit pairs into 16-bit vectors (signed saturation)
     776           0 :     out[1] = _mm_packs_epi32(out_01_6, out_01_7);
     777           0 :     out[17] = _mm_packs_epi32(out_17_6, out_17_7);
     778           0 :     out[9] = _mm_packs_epi32(out_09_6, out_09_7);
     779           0 :     out[25] = _mm_packs_epi32(out_25_6, out_25_7);
     780           0 :     out[7] = _mm_packs_epi32(out_07_6, out_07_7);
     781           0 :     out[23] = _mm_packs_epi32(out_23_6, out_23_7);
     782           0 :     out[15] = _mm_packs_epi32(out_15_6, out_15_7);
     783           0 :     out[31] = _mm_packs_epi32(out_31_6, out_31_7);
     784             :   }
     785             :   {
     786           0 :     const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
     787           0 :     const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
     788           0 :     const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
     789           0 :     const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
     790           0 :     const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
     791           0 :     const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
     792           0 :     const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
     793           0 :     const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
     794           0 :     const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
     795           0 :     const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
     796           0 :     const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
     797           0 :     const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
     798           0 :     const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
     799           0 :     const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
     800           0 :     const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
     801           0 :     const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
     802           0 :     const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
     803           0 :     const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
     804           0 :     const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
     805           0 :     const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
     806           0 :     const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
     807           0 :     const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
     808           0 :     const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
     809           0 :     const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
     810             :     // dct_const_round_shift: add rounding constant, then >> DCT_CONST_BITS
     811           0 :     const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
     812           0 :     const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
     813           0 :     const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
     814           0 :     const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
     815           0 :     const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
     816           0 :     const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
     817           0 :     const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
     818           0 :     const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
     819           0 :     const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
     820           0 :     const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
     821           0 :     const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
     822           0 :     const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
     823           0 :     const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
     824           0 :     const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
     825           0 :     const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
     826           0 :     const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
     827           0 :     const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
     828           0 :     const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
     829           0 :     const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
     830           0 :     const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
     831           0 :     const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
     832           0 :     const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
     833           0 :     const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
     834           0 :     const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
     835           0 :     const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
     836           0 :     const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
     837           0 :     const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
     838           0 :     const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
     839           0 :     const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
     840           0 :     const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
     841           0 :     const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
     842           0 :     const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
     843             :     // Combine: pack 32-bit pairs into 16-bit vectors (signed saturation)
     844           0 :     out[5] = _mm_packs_epi32(out_05_6, out_05_7);
     845           0 :     out[21] = _mm_packs_epi32(out_21_6, out_21_7);
     846           0 :     out[13] = _mm_packs_epi32(out_13_6, out_13_7);
     847           0 :     out[29] = _mm_packs_epi32(out_29_6, out_29_7);
     848           0 :     out[3] = _mm_packs_epi32(out_03_6, out_03_7);
     849           0 :     out[19] = _mm_packs_epi32(out_19_6, out_19_7);
     850           0 :     out[11] = _mm_packs_epi32(out_11_6, out_11_7);
     851           0 :     out[27] = _mm_packs_epi32(out_27_6, out_27_7);
     852             :   }
     853             : 
     854             :   // Output results: out[0..15] overwrite in0, out[16..31] overwrite in1
     855             :   {
     856             :     int j;
     857           0 :     for (j = 0; j < 16; ++j) {
     858           0 :       _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
     859           0 :       _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
     860             :     }
     861             :   }
     862           0 : }  // NOLINT

Generated by: LCOV version 1.13