LCOV - code coverage report
Current view: top level - media/libvpx/libvpx/vpx_dsp/x86 - inv_txfm_sse2.c (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18
Lines: 0 / 1913 (0.0 %)
Functions: 0 / 20 (0.0 %)
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include "./vpx_dsp_rtcd.h"
      12             : #include "vpx_dsp/x86/inv_txfm_sse2.h"
      13             : #include "vpx_dsp/x86/txfm_common_sse2.h"
      14             : 
      15             : #define RECON_AND_STORE4X4(dest, in_x)                    \
      16             :   {                                                       \
      17             :     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
      18             :     d0 = _mm_unpacklo_epi8(d0, zero);                     \
      19             :     d0 = _mm_add_epi16(in_x, d0);                         \
      20             :     d0 = _mm_packus_epi16(d0, d0);                        \
      21             :     *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
      22             :   }
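
RECON_AND_STORE4X4 widens four prediction bytes to 16 bits, adds the residual row held in in_x, saturates back to 8 bits, and writes the four bytes back. A minimal scalar sketch of the same operation (an illustration, not part of the file), assuming in points at the four int16_t residuals of one row:

    static void recon_and_store4x4_scalar(uint8_t *dest, const int16_t *in) {
      int i;
      for (i = 0; i < 4; ++i) {
        const int v = dest[i] + in[i];                        /* widen and add */
        dest[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);   /* packus clamp */
      }
    }
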
      23             : 
      24           0 : void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
      25             :                              int stride) {
      26           0 :   const __m128i zero = _mm_setzero_si128();
      27           0 :   const __m128i eight = _mm_set1_epi16(8);
      28           0 :   const __m128i cst = _mm_setr_epi16(
      29           0 :       (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
      30           0 :       (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
      31           0 :       (int16_t)cospi_8_64, (int16_t)cospi_24_64);
      32           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
      33             :   __m128i input0, input1, input2, input3;
      34             : 
      35             :   // Rows
      36           0 :   input0 = load_input_data(input);
      37           0 :   input2 = load_input_data(input + 8);
      38             : 
      39             :   // Construct i3, i1, i3, i1, i2, i0, i2, i0
      40           0 :   input0 = _mm_shufflelo_epi16(input0, 0xd8);
      41           0 :   input0 = _mm_shufflehi_epi16(input0, 0xd8);
      42           0 :   input2 = _mm_shufflelo_epi16(input2, 0xd8);
      43           0 :   input2 = _mm_shufflehi_epi16(input2, 0xd8);
      44             : 
      45           0 :   input1 = _mm_unpackhi_epi32(input0, input0);
      46           0 :   input0 = _mm_unpacklo_epi32(input0, input0);
      47           0 :   input3 = _mm_unpackhi_epi32(input2, input2);
      48           0 :   input2 = _mm_unpacklo_epi32(input2, input2);
      49             : 
      50             :   // Stage 1
      51           0 :   input0 = _mm_madd_epi16(input0, cst);
      52           0 :   input1 = _mm_madd_epi16(input1, cst);
      53           0 :   input2 = _mm_madd_epi16(input2, cst);
      54           0 :   input3 = _mm_madd_epi16(input3, cst);
      55             : 
      56           0 :   input0 = _mm_add_epi32(input0, rounding);
      57           0 :   input1 = _mm_add_epi32(input1, rounding);
      58           0 :   input2 = _mm_add_epi32(input2, rounding);
      59           0 :   input3 = _mm_add_epi32(input3, rounding);
      60             : 
      61           0 :   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
      62           0 :   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
      63           0 :   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
      64           0 :   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
      65             : 
      66             :   // Stage 2
      67           0 :   input0 = _mm_packs_epi32(input0, input1);
      68           0 :   input1 = _mm_packs_epi32(input2, input3);
      69             : 
      70             :   // Transpose
      71           0 :   input2 = _mm_unpacklo_epi16(input0, input1);
      72           0 :   input3 = _mm_unpackhi_epi16(input0, input1);
      73           0 :   input0 = _mm_unpacklo_epi32(input2, input3);
      74           0 :   input1 = _mm_unpackhi_epi32(input2, input3);
      75             : 
      76             :   // Switch column 2 and column 3; after this we get:
      77             :   // input2: column 1, column 0;  input3: column 2, column 3.
      78           0 :   input1 = _mm_shuffle_epi32(input1, 0x4e);
      79           0 :   input2 = _mm_add_epi16(input0, input1);
      80           0 :   input3 = _mm_sub_epi16(input0, input1);
      81             : 
      82             :   // Columns
      83             :   // Construct i3, i1, i3, i1, i2, i0, i2, i0
      84           0 :   input0 = _mm_unpacklo_epi32(input2, input2);
      85           0 :   input1 = _mm_unpackhi_epi32(input2, input2);
      86           0 :   input2 = _mm_unpackhi_epi32(input3, input3);
      87           0 :   input3 = _mm_unpacklo_epi32(input3, input3);
      88             : 
      89             :   // Stage 1
      90           0 :   input0 = _mm_madd_epi16(input0, cst);
      91           0 :   input1 = _mm_madd_epi16(input1, cst);
      92           0 :   input2 = _mm_madd_epi16(input2, cst);
      93           0 :   input3 = _mm_madd_epi16(input3, cst);
      94             : 
      95           0 :   input0 = _mm_add_epi32(input0, rounding);
      96           0 :   input1 = _mm_add_epi32(input1, rounding);
      97           0 :   input2 = _mm_add_epi32(input2, rounding);
      98           0 :   input3 = _mm_add_epi32(input3, rounding);
      99             : 
     100           0 :   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
     101           0 :   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
     102           0 :   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
     103           0 :   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
     104             : 
     105             :   // Stage 2
     106           0 :   input0 = _mm_packs_epi32(input0, input2);
     107           0 :   input1 = _mm_packs_epi32(input1, input3);
     108             : 
     109             :   // Transpose
     110           0 :   input2 = _mm_unpacklo_epi16(input0, input1);
     111           0 :   input3 = _mm_unpackhi_epi16(input0, input1);
     112           0 :   input0 = _mm_unpacklo_epi32(input2, input3);
     113           0 :   input1 = _mm_unpackhi_epi32(input2, input3);
     114             : 
      115             :   // Switch column 2 and column 3; after this we get:
      116             :   // input2: column 1, column 0;  input3: column 2, column 3.
     117           0 :   input1 = _mm_shuffle_epi32(input1, 0x4e);
     118           0 :   input2 = _mm_add_epi16(input0, input1);
     119           0 :   input3 = _mm_sub_epi16(input0, input1);
     120             : 
     121             :   // Final round and shift
     122           0 :   input2 = _mm_add_epi16(input2, eight);
     123           0 :   input3 = _mm_add_epi16(input3, eight);
     124             : 
     125           0 :   input2 = _mm_srai_epi16(input2, 4);
     126           0 :   input3 = _mm_srai_epi16(input3, 4);
     127             : 
     128             :   // Reconstruction and Store
     129             :   {
     130           0 :     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
     131           0 :     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
     132           0 :     d0 = _mm_unpacklo_epi32(d0,
     133           0 :                             _mm_cvtsi32_si128(*(const int *)(dest + stride)));
     134           0 :     d2 = _mm_unpacklo_epi32(
     135           0 :         _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
     136           0 :     d0 = _mm_unpacklo_epi8(d0, zero);
     137           0 :     d2 = _mm_unpacklo_epi8(d2, zero);
     138           0 :     d0 = _mm_add_epi16(d0, input2);
     139           0 :     d2 = _mm_add_epi16(d2, input3);
     140           0 :     d0 = _mm_packus_epi16(d0, d2);
     141             :     // store input0
     142           0 :     *(int *)dest = _mm_cvtsi128_si32(d0);
     143             :     // store input1
     144           0 :     d0 = _mm_srli_si128(d0, 4);
     145           0 :     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
     146             :     // store input2
     147           0 :     d0 = _mm_srli_si128(d0, 4);
     148           0 :     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
     149             :     // store input3
     150           0 :     d0 = _mm_srli_si128(d0, 4);
     151           0 :     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
     152             :   }
     153           0 : }
     154             : 
     155           0 : void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
     156             :                             int stride) {
     157             :   __m128i dc_value;
     158           0 :   const __m128i zero = _mm_setzero_si128();
     159             :   int a;
     160             : 
     161           0 :   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
     162           0 :   a = (int)dct_const_round_shift(a * cospi_16_64);
     163           0 :   a = ROUND_POWER_OF_TWO(a, 4);
     164             : 
     165           0 :   dc_value = _mm_set1_epi16(a);
     166             : 
     167           0 :   RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
     168           0 :   RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
     169           0 :   RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
     170           0 :   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
     171           0 : }
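
The DC-only variant above collapses the whole transform to a single offset: the DC coefficient is scaled twice by cospi_16_64 with dct_const_round_shift, rounded down by 4 bits, and that constant is added to every pixel of the 4x4 block. A rough scalar model (a hypothetical helper, using the same constants and macros the file already includes):

    static void idct4x4_1_add_model(const tran_low_t *input, uint8_t *dest,
                                    int stride) {
      int r, c;
      int a = (int)dct_const_round_shift(input[0] * cospi_16_64);
      a = (int)dct_const_round_shift(a * cospi_16_64);
      a = ROUND_POWER_OF_TWO(a, 4);  /* final 1/16 scaling */
      for (r = 0; r < 4; ++r) {
        for (c = 0; c < 4; ++c) {
          const int v = dest[r * stride + c] + a;
          dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
      }
    }
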
     172             : 
     173           0 : static INLINE void transpose_4x4(__m128i *res) {
     174           0 :   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
     175           0 :   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
     176             : 
     177           0 :   res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
     178           0 :   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
     179           0 : }
     180             : 
     181           0 : void idct4_sse2(__m128i *in) {
     182           0 :   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
     183           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     184           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
     185           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
     186           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     187             :   __m128i u[8], v[8];
     188             : 
     189           0 :   transpose_4x4(in);
     190             :   // stage 1
     191           0 :   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
     192           0 :   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
     193           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
     194           0 :   v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
     195           0 :   v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
     196           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
     197             : 
     198           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
     199           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
     200           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
     201           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
     202             : 
     203           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
     204           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
     205           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
     206           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
     207             : 
     208           0 :   u[0] = _mm_packs_epi32(v[0], v[1]);
     209           0 :   u[1] = _mm_packs_epi32(v[3], v[2]);
     210             : 
     211             :   // stage 2
     212           0 :   in[0] = _mm_add_epi16(u[0], u[1]);
     213           0 :   in[1] = _mm_sub_epi16(u[0], u[1]);
     214           0 :   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
     215           0 : }
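
idct4_sse2 runs two rows of the 1-D 4-point IDCT at once. Per element, the butterfly it vectorizes looks roughly like the scalar sketch below (for illustration only; the library's own C reference additionally wraps intermediate values):

    static void idct4_scalar_sketch(const int16_t in[4], int16_t out[4]) {
      /* packs_epi32 saturation is modeled here as a plain cast */
      const int16_t s0 =
          (int16_t)dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
      const int16_t s1 =
          (int16_t)dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
      const int16_t s2 = (int16_t)dct_const_round_shift(
          in[1] * cospi_24_64 - in[3] * cospi_8_64);
      const int16_t s3 = (int16_t)dct_const_round_shift(
          in[1] * cospi_8_64 + in[3] * cospi_24_64);
      out[0] = (int16_t)(s0 + s3);
      out[1] = (int16_t)(s1 + s2);
      out[2] = (int16_t)(s1 - s2);
      out[3] = (int16_t)(s0 - s3);
    }
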
     216             : 
     217           0 : void iadst4_sse2(__m128i *in) {
     218           0 :   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
     219           0 :   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
     220           0 :   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
     221           0 :   const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
     222           0 :   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
     223           0 :   const __m128i kZero = _mm_set1_epi16(0);
     224           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     225             :   __m128i u[8], v[8], in7;
     226             : 
     227           0 :   transpose_4x4(in);
     228           0 :   in7 = _mm_srli_si128(in[1], 8);
     229           0 :   in7 = _mm_add_epi16(in7, in[0]);
     230           0 :   in7 = _mm_sub_epi16(in7, in[1]);
     231             : 
     232           0 :   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
     233           0 :   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
     234           0 :   u[2] = _mm_unpacklo_epi16(in7, kZero);
     235           0 :   u[3] = _mm_unpackhi_epi16(in[0], kZero);
     236             : 
     237           0 :   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
     238           0 :   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
     239           0 :   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
     240           0 :   v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
     241           0 :   v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
     242           0 :   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
     243             : 
     244           0 :   u[0] = _mm_add_epi32(v[0], v[1]);
     245           0 :   u[1] = _mm_add_epi32(v[3], v[4]);
     246           0 :   u[2] = v[2];
     247           0 :   u[3] = _mm_add_epi32(u[0], u[1]);
     248           0 :   u[4] = _mm_slli_epi32(v[5], 2);
     249           0 :   u[5] = _mm_add_epi32(u[3], v[5]);
     250           0 :   u[6] = _mm_sub_epi32(u[5], u[4]);
     251             : 
     252           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
     253           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
     254           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
     255           0 :   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
     256             : 
     257           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
     258           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
     259           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
     260           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
     261             : 
     262           0 :   in[0] = _mm_packs_epi32(u[0], u[1]);
     263           0 :   in[1] = _mm_packs_epi32(u[2], u[3]);
     264           0 : }
     265             : 
     266             : #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
     267             :                       out2, out3, out4, out5, out6, out7)                 \
     268             :   {                                                                       \
     269             :     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
     270             :     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
     271             :     const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
     272             :     const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
     273             :     const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
     274             :     const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
     275             :     const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
     276             :     const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
     277             :                                                                           \
     278             :     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
     279             :     const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
     280             :     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
     281             :     const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
     282             :     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
     283             :     const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
     284             :     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
     285             :     const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
     286             :                                                                           \
     287             :     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
     288             :     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
     289             :     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
     290             :     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
     291             :     out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
     292             :     out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
     293             :     out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
     294             :     out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
     295             :   }
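
TRANSPOSE_8X8 realizes an 8x8 transpose of 16-bit values with three rounds of interleaves. A plain-C picture of the end result (a sketch, with rows modeled as arrays instead of __m128i registers):

    static void transpose_8x8_scalar(const int16_t in[8][8], int16_t out[8][8]) {
      int r, c;
      for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c) out[c][r] = in[r][c];
      }
    }
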
     296             : 
     297             : #define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
     298             :   {                                                                      \
     299             :     const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
     300             :     const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0);                \
     301             :     const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3);                \
     302             :     const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2);                \
     303             :                                                                          \
     304             :     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);              \
     305             :     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);              \
     306             :     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);              \
     307             :     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);              \
     308             :                                                                          \
     309             :     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                             \
     310             :     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                             \
     311             :     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                             \
     312             :     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                             \
     313             :   }
     314             : 
     315             : #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
     316             :   {                                                      \
     317             :     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
     318             :     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
     319             :     out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
     320             :     out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
     321             :   }
     322             : 
      323             : // Macro for multiplying elements by constants and adding the results together.
     324             : #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
     325             :                                res0, res1, res2, res3)                         \
     326             :   {                                                                            \
     327             :     tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
     328             :     tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
     329             :     tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
     330             :     tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
     331             :     tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
     332             :     tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
     333             :     tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
     334             :     tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
     335             :                                                                                \
     336             :     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
     337             :     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
     338             :     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
     339             :     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
     340             :     tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
     341             :     tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
     342             :     tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
     343             :     tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
     344             :                                                                                \
     345             :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
     346             :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
     347             :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
     348             :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
     349             :     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
     350             :     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
     351             :     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
     352             :     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
     353             :                                                                                \
     354             :     res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
     355             :     res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
     356             :     res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
     357             :     res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
     358             :   }
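
Per 16-bit lane, MULTIPLICATION_AND_ADD computes a two-term dot product against the interleaved cosine constants, then rounds and shifts it back to 16 bits. One lane of that work, written out in scalar form (an illustrative helper, not part of the file):

    static int16_t butterfly_lane(int16_t x, int16_t y, int c0, int c1) {
      int32_t t = x * c0 + y * c1;            /* _mm_madd_epi16 on (x, y) */
      t += DCT_CONST_ROUNDING;                /* _mm_add_epi32(t, rounding) */
      return (int16_t)(t >> DCT_CONST_BITS);  /* srai; packs saturation modeled
                                                 as a plain cast */
    }
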
     359             : 
     360             : #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
     361             :   {                                                                  \
     362             :     tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
     363             :     tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
     364             :     tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
     365             :     tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
     366             :                                                                      \
     367             :     tmp0 = _mm_add_epi32(tmp0, rounding);                            \
     368             :     tmp1 = _mm_add_epi32(tmp1, rounding);                            \
     369             :     tmp2 = _mm_add_epi32(tmp2, rounding);                            \
     370             :     tmp3 = _mm_add_epi32(tmp3, rounding);                            \
     371             :                                                                      \
     372             :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
     373             :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
     374             :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
     375             :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
     376             :                                                                      \
     377             :     res0 = _mm_packs_epi32(tmp0, tmp1);                              \
     378             :     res1 = _mm_packs_epi32(tmp2, tmp3);                              \
     379             :   }
     380             : 
     381             : #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
     382             :               out4, out5, out6, out7)                                         \
     383             :   {                                                                           \
     384             :     /* Stage1 */                                                              \
     385             :     {                                                                         \
     386             :       const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
     387             :       const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
     388             :       const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
     389             :       const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
     390             :                                                                               \
     391             :       MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
     392             :                              stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
     393             :     }                                                                         \
     394             :                                                                               \
     395             :     /* Stage2 */                                                              \
     396             :     {                                                                         \
     397             :       const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
     398             :       const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
     399             :       const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
     400             :       const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
     401             :                                                                               \
     402             :       MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
     403             :                              stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
     404             :                                                                               \
     405             :       stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
     406             :       stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
     407             :       stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
     408             :       stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
     409             :     }                                                                         \
     410             :                                                                               \
     411             :     /* Stage3 */                                                              \
     412             :     {                                                                         \
     413             :       const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
     414             :       const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
     415             :                                                                               \
     416             :       stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
     417             :       stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
     418             :       stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
     419             :       stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
     420             :                                                                               \
     421             :       tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
     422             :       tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
     423             :       tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
     424             :       tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
     425             :                                                                               \
     426             :       tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
     427             :       tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
     428             :       tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
     429             :       tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
     430             :                                                                               \
     431             :       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
     432             :       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
     433             :       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
     434             :       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
     435             :                                                                               \
     436             :       stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
     437             :       stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
     438             :     }                                                                         \
     439             :                                                                               \
     440             :     /* Stage4  */                                                             \
     441             :     out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \
     442             :     out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
     443             :     out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
     444             :     out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \
     445             :     out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \
     446             :     out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
     447             :     out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
     448             :     out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \
     449             :   }
     450             : 
     451           0 : void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
     452             :                              int stride) {
     453           0 :   const __m128i zero = _mm_setzero_si128();
     454           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
     455           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
     456           0 :   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
     457           0 :   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
     458           0 :   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     459           0 :   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
     460           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
     461           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     462           0 :   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
     463           0 :   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
     464             : 
     465             :   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
     466             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
     467             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
     468             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     469             :   int i;
     470             : 
     471             :   // Load input data.
     472           0 :   in0 = load_input_data(input);
     473           0 :   in1 = load_input_data(input + 8 * 1);
     474           0 :   in2 = load_input_data(input + 8 * 2);
     475           0 :   in3 = load_input_data(input + 8 * 3);
     476           0 :   in4 = load_input_data(input + 8 * 4);
     477           0 :   in5 = load_input_data(input + 8 * 5);
     478           0 :   in6 = load_input_data(input + 8 * 6);
     479           0 :   in7 = load_input_data(input + 8 * 7);
     480             : 
     481             :   // 2-D
     482           0 :   for (i = 0; i < 2; i++) {
     483             :     // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
     484           0 :     TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
     485             :                   in4, in5, in6, in7);
     486             : 
     487             :     // 4-stage 1D idct8x8
     488           0 :     IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
     489             :           in6, in7);
     490             :   }
     491             : 
     492             :   // Final rounding and shift
     493           0 :   in0 = _mm_adds_epi16(in0, final_rounding);
     494           0 :   in1 = _mm_adds_epi16(in1, final_rounding);
     495           0 :   in2 = _mm_adds_epi16(in2, final_rounding);
     496           0 :   in3 = _mm_adds_epi16(in3, final_rounding);
     497           0 :   in4 = _mm_adds_epi16(in4, final_rounding);
     498           0 :   in5 = _mm_adds_epi16(in5, final_rounding);
     499           0 :   in6 = _mm_adds_epi16(in6, final_rounding);
     500           0 :   in7 = _mm_adds_epi16(in7, final_rounding);
     501             : 
     502           0 :   in0 = _mm_srai_epi16(in0, 5);
     503           0 :   in1 = _mm_srai_epi16(in1, 5);
     504           0 :   in2 = _mm_srai_epi16(in2, 5);
     505           0 :   in3 = _mm_srai_epi16(in3, 5);
     506           0 :   in4 = _mm_srai_epi16(in4, 5);
     507           0 :   in5 = _mm_srai_epi16(in5, 5);
     508           0 :   in6 = _mm_srai_epi16(in6, 5);
     509           0 :   in7 = _mm_srai_epi16(in7, 5);
     510             : 
     511           0 :   RECON_AND_STORE(dest + 0 * stride, in0);
     512           0 :   RECON_AND_STORE(dest + 1 * stride, in1);
     513           0 :   RECON_AND_STORE(dest + 2 * stride, in2);
     514           0 :   RECON_AND_STORE(dest + 3 * stride, in3);
     515           0 :   RECON_AND_STORE(dest + 4 * stride, in4);
     516           0 :   RECON_AND_STORE(dest + 5 * stride, in5);
     517           0 :   RECON_AND_STORE(dest + 6 * stride, in6);
     518           0 :   RECON_AND_STORE(dest + 7 * stride, in7);
     519           0 : }
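
After the two 1-D passes, the 8x8 result is rounded by 1 << 4, shifted right by 5 (a combined 1/32 scaling), and added back onto the prediction via RECON_AND_STORE. The equivalent scalar step, as a hedged sketch with the residual modeled as an 8x8 int16_t array:

    static void round_shift_recon_8x8(uint8_t *dest, int stride,
                                      const int16_t res[8][8]) {
      int r, c;
      for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c) {
          const int v =
              dest[r * stride + c] + ROUND_POWER_OF_TWO(res[r][c], 5);
          dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
      }
    }
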
     520             : 
     521           0 : void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
     522             :                             int stride) {
     523             :   __m128i dc_value;
     524           0 :   const __m128i zero = _mm_setzero_si128();
     525             :   int a;
     526             : 
     527           0 :   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
     528           0 :   a = (int)dct_const_round_shift(a * cospi_16_64);
     529           0 :   a = ROUND_POWER_OF_TWO(a, 5);
     530             : 
     531           0 :   dc_value = _mm_set1_epi16(a);
     532             : 
     533           0 :   RECON_AND_STORE(dest + 0 * stride, dc_value);
     534           0 :   RECON_AND_STORE(dest + 1 * stride, dc_value);
     535           0 :   RECON_AND_STORE(dest + 2 * stride, dc_value);
     536           0 :   RECON_AND_STORE(dest + 3 * stride, dc_value);
     537           0 :   RECON_AND_STORE(dest + 4 * stride, dc_value);
     538           0 :   RECON_AND_STORE(dest + 5 * stride, dc_value);
     539           0 :   RECON_AND_STORE(dest + 6 * stride, dc_value);
     540           0 :   RECON_AND_STORE(dest + 7 * stride, dc_value);
     541           0 : }
     542             : 
     543           0 : void idct8_sse2(__m128i *in) {
     544           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
     545           0 :   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
     546           0 :   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
     547           0 :   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     548           0 :   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
     549           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
     550           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     551           0 :   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
     552           0 :   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
     553             : 
     554             :   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
     555             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
     556             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
     557             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     558             : 
     559             :   // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
     560           0 :   TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
     561             :                 in1, in2, in3, in4, in5, in6, in7);
     562             : 
     563             :   // 4-stage 1D idct8x8
     564           0 :   IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
     565             :         in[4], in[5], in[6], in[7]);
     566           0 : }
     567             : 
     568           0 : void iadst8_sse2(__m128i *in) {
     569           0 :   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
     570           0 :   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
     571           0 :   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
     572           0 :   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
     573           0 :   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
     574           0 :   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
     575           0 :   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
     576           0 :   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
     577           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
     578           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
     579           0 :   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
     580           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     581           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
     582           0 :   const __m128i k__const_0 = _mm_set1_epi16(0);
     583           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     584             : 
     585             :   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
     586             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
     587             :   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
     588             :   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
     589             :   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
     590             : 
     591             :   // transpose
     592           0 :   array_transpose_8x8(in, in);
     593             : 
     594             :   // properly aligned for butterfly input
     595           0 :   in0 = in[7];
     596           0 :   in1 = in[0];
     597           0 :   in2 = in[5];
     598           0 :   in3 = in[2];
     599           0 :   in4 = in[3];
     600           0 :   in5 = in[4];
     601           0 :   in6 = in[1];
     602           0 :   in7 = in[6];
     603             : 
     604             :   // column transformation
     605             :   // stage 1
      606             :   // interleave and multiply/add into 32-bit integers
     607           0 :   s0 = _mm_unpacklo_epi16(in0, in1);
     608           0 :   s1 = _mm_unpackhi_epi16(in0, in1);
     609           0 :   s2 = _mm_unpacklo_epi16(in2, in3);
     610           0 :   s3 = _mm_unpackhi_epi16(in2, in3);
     611           0 :   s4 = _mm_unpacklo_epi16(in4, in5);
     612           0 :   s5 = _mm_unpackhi_epi16(in4, in5);
     613           0 :   s6 = _mm_unpacklo_epi16(in6, in7);
     614           0 :   s7 = _mm_unpackhi_epi16(in6, in7);
     615             : 
     616           0 :   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
     617           0 :   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
     618           0 :   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
     619           0 :   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
     620           0 :   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
     621           0 :   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
     622           0 :   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
     623           0 :   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
     624           0 :   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
     625           0 :   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
     626           0 :   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
     627           0 :   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
     628           0 :   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
     629           0 :   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
     630           0 :   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
     631           0 :   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
     632             : 
     633             :   // addition
     634           0 :   w0 = _mm_add_epi32(u0, u8);
     635           0 :   w1 = _mm_add_epi32(u1, u9);
     636           0 :   w2 = _mm_add_epi32(u2, u10);
     637           0 :   w3 = _mm_add_epi32(u3, u11);
     638           0 :   w4 = _mm_add_epi32(u4, u12);
     639           0 :   w5 = _mm_add_epi32(u5, u13);
     640           0 :   w6 = _mm_add_epi32(u6, u14);
     641           0 :   w7 = _mm_add_epi32(u7, u15);
     642           0 :   w8 = _mm_sub_epi32(u0, u8);
     643           0 :   w9 = _mm_sub_epi32(u1, u9);
     644           0 :   w10 = _mm_sub_epi32(u2, u10);
     645           0 :   w11 = _mm_sub_epi32(u3, u11);
     646           0 :   w12 = _mm_sub_epi32(u4, u12);
     647           0 :   w13 = _mm_sub_epi32(u5, u13);
     648           0 :   w14 = _mm_sub_epi32(u6, u14);
     649           0 :   w15 = _mm_sub_epi32(u7, u15);
     650             : 
     651             :   // shift and rounding
     652           0 :   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
     653           0 :   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
     654           0 :   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
     655           0 :   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
     656           0 :   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
     657           0 :   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
     658           0 :   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
     659           0 :   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
     660           0 :   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
     661           0 :   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
     662           0 :   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
     663           0 :   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
     664           0 :   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
     665           0 :   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
     666           0 :   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
     667           0 :   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
     668             : 
     669           0 :   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     670           0 :   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     671           0 :   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     672           0 :   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     673           0 :   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     674           0 :   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     675           0 :   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     676           0 :   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     677           0 :   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
     678           0 :   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
     679           0 :   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
     680           0 :   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
     681           0 :   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
     682           0 :   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
     683           0 :   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
     684           0 :   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
     685             : 
     686             :   // back to 16-bit and pack 8 integers into __m128i
     687           0 :   in[0] = _mm_packs_epi32(u0, u1);
     688           0 :   in[1] = _mm_packs_epi32(u2, u3);
     689           0 :   in[2] = _mm_packs_epi32(u4, u5);
     690           0 :   in[3] = _mm_packs_epi32(u6, u7);
     691           0 :   in[4] = _mm_packs_epi32(u8, u9);
     692           0 :   in[5] = _mm_packs_epi32(u10, u11);
     693           0 :   in[6] = _mm_packs_epi32(u12, u13);
     694           0 :   in[7] = _mm_packs_epi32(u14, u15);
     695             : 
     696             :   // stage 2
     697           0 :   s0 = _mm_add_epi16(in[0], in[2]);
     698           0 :   s1 = _mm_add_epi16(in[1], in[3]);
     699           0 :   s2 = _mm_sub_epi16(in[0], in[2]);
     700           0 :   s3 = _mm_sub_epi16(in[1], in[3]);
     701           0 :   u0 = _mm_unpacklo_epi16(in[4], in[5]);
     702           0 :   u1 = _mm_unpackhi_epi16(in[4], in[5]);
     703           0 :   u2 = _mm_unpacklo_epi16(in[6], in[7]);
     704           0 :   u3 = _mm_unpackhi_epi16(in[6], in[7]);
     705             : 
     706           0 :   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
     707           0 :   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
     708           0 :   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
     709           0 :   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
     710           0 :   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
     711           0 :   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
     712           0 :   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
     713           0 :   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
     714             : 
     715           0 :   w0 = _mm_add_epi32(v0, v4);
     716           0 :   w1 = _mm_add_epi32(v1, v5);
     717           0 :   w2 = _mm_add_epi32(v2, v6);
     718           0 :   w3 = _mm_add_epi32(v3, v7);
     719           0 :   w4 = _mm_sub_epi32(v0, v4);
     720           0 :   w5 = _mm_sub_epi32(v1, v5);
     721           0 :   w6 = _mm_sub_epi32(v2, v6);
     722           0 :   w7 = _mm_sub_epi32(v3, v7);
     723             : 
     724           0 :   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
     725           0 :   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
     726           0 :   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
     727           0 :   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
     728           0 :   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
     729           0 :   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
     730           0 :   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
     731           0 :   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
     732             : 
     733           0 :   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     734           0 :   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     735           0 :   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     736           0 :   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     737           0 :   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     738           0 :   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     739           0 :   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     740           0 :   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     741             : 
      742             :   // back to 16-bit integers
     743           0 :   s4 = _mm_packs_epi32(u0, u1);
     744           0 :   s5 = _mm_packs_epi32(u2, u3);
     745           0 :   s6 = _mm_packs_epi32(u4, u5);
     746           0 :   s7 = _mm_packs_epi32(u6, u7);
     747             : 
     748             :   // stage 3
     749           0 :   u0 = _mm_unpacklo_epi16(s2, s3);
     750           0 :   u1 = _mm_unpackhi_epi16(s2, s3);
     751           0 :   u2 = _mm_unpacklo_epi16(s6, s7);
     752           0 :   u3 = _mm_unpackhi_epi16(s6, s7);
     753             : 
     754           0 :   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
     755           0 :   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
     756           0 :   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
     757           0 :   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
     758           0 :   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
     759           0 :   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
     760           0 :   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
     761           0 :   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
     762             : 
     763           0 :   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
     764           0 :   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
     765           0 :   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
     766           0 :   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
     767           0 :   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
     768           0 :   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
     769           0 :   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
     770           0 :   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
     771             : 
     772           0 :   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
     773           0 :   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
     774           0 :   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
     775           0 :   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
     776           0 :   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
     777           0 :   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
     778           0 :   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
     779           0 :   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
     780             : 
     781           0 :   s2 = _mm_packs_epi32(v0, v1);
     782           0 :   s3 = _mm_packs_epi32(v2, v3);
     783           0 :   s6 = _mm_packs_epi32(v4, v5);
     784           0 :   s7 = _mm_packs_epi32(v6, v7);
     785             : 
     786           0 :   in[0] = s0;
     787           0 :   in[1] = _mm_sub_epi16(k__const_0, s4);
     788           0 :   in[2] = s6;
     789           0 :   in[3] = _mm_sub_epi16(k__const_0, s2);
     790           0 :   in[4] = s3;
     791           0 :   in[5] = _mm_sub_epi16(k__const_0, s7);
     792           0 :   in[6] = s5;
     793           0 :   in[7] = _mm_sub_epi16(k__const_0, s1);
     794           0 : }
     795             : 
     796           0 : void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
     797             :                              int stride) {
     798           0 :   const __m128i zero = _mm_setzero_si128();
     799           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
     800           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
     801           0 :   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
     802           0 :   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
     803           0 :   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     804           0 :   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
     805           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
     806           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     807           0 :   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
     808           0 :   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
     809           0 :   const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
     810             : 
     811             :   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
     812             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
     813             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
     814             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     815             : 
     816             :   // Rows. Load 4-row input data.
     817           0 :   in0 = load_input_data(input);
     818           0 :   in1 = load_input_data(input + 8 * 1);
     819           0 :   in2 = load_input_data(input + 8 * 2);
     820           0 :   in3 = load_input_data(input + 8 * 3);
     821             : 
     822             :   // 8x4 Transpose
     823           0 :   TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
     824             :   // Stage1
     825             :   {
     826           0 :     const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
     827           0 :     const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
     828             : 
     829           0 :     tmp0 = _mm_madd_epi16(lo_17, stg1_0);
     830           0 :     tmp2 = _mm_madd_epi16(lo_17, stg1_1);
     831           0 :     tmp4 = _mm_madd_epi16(lo_35, stg1_2);
     832           0 :     tmp6 = _mm_madd_epi16(lo_35, stg1_3);
     833             : 
     834           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
     835           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
     836           0 :     tmp4 = _mm_add_epi32(tmp4, rounding);
     837           0 :     tmp6 = _mm_add_epi32(tmp6, rounding);
     838           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
     839           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
     840           0 :     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
     841           0 :     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
     842             : 
     843           0 :     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
     844           0 :     stp1_5 = _mm_packs_epi32(tmp4, tmp6);
     845             :   }
     846             : 
     847             :   // Stage2
     848             :   {
     849           0 :     const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
     850           0 :     const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
     851             : 
     852           0 :     tmp0 = _mm_madd_epi16(lo_04, stg2_0);
     853           0 :     tmp2 = _mm_madd_epi16(lo_04, stg2_1);
     854           0 :     tmp4 = _mm_madd_epi16(lo_26, stg2_2);
     855           0 :     tmp6 = _mm_madd_epi16(lo_26, stg2_3);
     856             : 
     857           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
     858           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
     859           0 :     tmp4 = _mm_add_epi32(tmp4, rounding);
     860           0 :     tmp6 = _mm_add_epi32(tmp6, rounding);
     861           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
     862           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
     863           0 :     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
     864           0 :     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
     865             : 
     866           0 :     stp2_0 = _mm_packs_epi32(tmp0, tmp2);
     867           0 :     stp2_2 = _mm_packs_epi32(tmp6, tmp4);
     868             : 
     869           0 :     tmp0 = _mm_add_epi16(stp1_4, stp1_5);
     870           0 :     tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
     871             : 
     872           0 :     stp2_4 = tmp0;
     873           0 :     stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
     874           0 :     stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
     875             :   }
     876             : 
     877             :   // Stage3
     878             :   {
     879           0 :     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
     880             : 
     881           0 :     tmp4 = _mm_add_epi16(stp2_0, stp2_2);
     882           0 :     tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
     883             : 
     884           0 :     stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
     885           0 :     stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
     886             : 
     887           0 :     tmp0 = _mm_madd_epi16(lo_56, stg3_0);
     888           0 :     tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
     889             : 
     890           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
     891           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
     892           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
     893           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
     894             : 
     895           0 :     stp1_5 = _mm_packs_epi32(tmp0, tmp2);
     896             :   }
     897             : 
     898             :   // Stage4
     899           0 :   tmp0 = _mm_add_epi16(stp1_3, stp2_4);
     900           0 :   tmp1 = _mm_add_epi16(stp1_2, stp1_5);
     901           0 :   tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
     902           0 :   tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
     903             : 
     904           0 :   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
     905             : 
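  // Columns: run the full 8-point IDCT with the upper four inputs zeroed.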
     906           0 :   IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
     907             :         in5, in6, in7);
     908             :   // Final rounding and shift
     909           0 :   in0 = _mm_adds_epi16(in0, final_rounding);
     910           0 :   in1 = _mm_adds_epi16(in1, final_rounding);
     911           0 :   in2 = _mm_adds_epi16(in2, final_rounding);
     912           0 :   in3 = _mm_adds_epi16(in3, final_rounding);
     913           0 :   in4 = _mm_adds_epi16(in4, final_rounding);
     914           0 :   in5 = _mm_adds_epi16(in5, final_rounding);
     915           0 :   in6 = _mm_adds_epi16(in6, final_rounding);
     916           0 :   in7 = _mm_adds_epi16(in7, final_rounding);
     917             : 
     918           0 :   in0 = _mm_srai_epi16(in0, 5);
     919           0 :   in1 = _mm_srai_epi16(in1, 5);
     920           0 :   in2 = _mm_srai_epi16(in2, 5);
     921           0 :   in3 = _mm_srai_epi16(in3, 5);
     922           0 :   in4 = _mm_srai_epi16(in4, 5);
     923           0 :   in5 = _mm_srai_epi16(in5, 5);
     924           0 :   in6 = _mm_srai_epi16(in6, 5);
     925           0 :   in7 = _mm_srai_epi16(in7, 5);
     926             : 
     927           0 :   RECON_AND_STORE(dest + 0 * stride, in0);
     928           0 :   RECON_AND_STORE(dest + 1 * stride, in1);
     929           0 :   RECON_AND_STORE(dest + 2 * stride, in2);
     930           0 :   RECON_AND_STORE(dest + 3 * stride, in3);
     931           0 :   RECON_AND_STORE(dest + 4 * stride, in4);
     932           0 :   RECON_AND_STORE(dest + 5 * stride, in5);
     933           0 :   RECON_AND_STORE(dest + 6 * stride, in6);
     934           0 :   RECON_AND_STORE(dest + 7 * stride, in7);
     935           0 : }
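
/* Editor's note: a hedged scalar sketch of the final rounding and store above.
 * The vector code adds final_rounding = 1 << 4 and shifts right by 5, i.e.
 * ROUND_POWER_OF_TWO(x, 5); RECON_AND_STORE is assumed to add each residual
 * row to the 8 prediction bytes at dest and store with unsigned saturation. */
static void recon_row_sketch(const int16_t *res, uint8_t *dst) {
  int j;
  for (j = 0; j < 8; ++j) {
    const int v = dst[j] + ((res[j] + (1 << 4)) >> 5);    /* round, add pred */
    dst[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* packus clamp */
  }
}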
     936             : 
     937             : #define IDCT16                                                                 \
     938             :   /* Stage2 */                                                                 \
     939             :   {                                                                            \
     940             :     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
     941             :     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
     942             :     const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
     943             :     const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
     944             :     const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
     945             :     const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
     946             :     const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
     947             :     const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
     948             :                                                                                \
     949             :     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
     950             :                            stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
     951             :                                                                                \
     952             :     MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
     953             :                            stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
     954             :   }                                                                            \
     955             :                                                                                \
     956             :   /* Stage3 */                                                                 \
     957             :   {                                                                            \
     958             :     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
     959             :     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
     960             :     const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
     961             :     const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
     962             :                                                                                \
     963             :     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
     964             :                            stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
     965             :                                                                                \
     966             :     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
     967             :     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
     968             :     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
     969             :     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
     970             :                                                                                \
     971             :     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
     972             :     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
     973             :     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
     974             :     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
     975             :   }                                                                            \
     976             :                                                                                \
     977             :   /* Stage4 */                                                                 \
     978             :   {                                                                            \
     979             :     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
     980             :     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
     981             :     const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
     982             :     const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
     983             :                                                                                \
     984             :     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
     985             :     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
     986             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
     987             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
     988             :                                                                                \
     989             :     MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
     990             :                            stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
     991             :                                                                                \
     992             :     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
     993             :     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
     994             :     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
     995             :     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
     996             :                                                                                \
     997             :     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
     998             :                            stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
     999             :                            stp2_13)                                            \
    1000             :   }                                                                            \
    1001             :                                                                                \
    1002             :   /* Stage5 */                                                                 \
    1003             :   {                                                                            \
    1004             :     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    1005             :     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
    1006             :                                                                                \
    1007             :     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
    1008             :     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
    1009             :     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
    1010             :     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
    1011             :                                                                                \
    1012             :     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    1013             :     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    1014             :     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    1015             :     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
    1016             :                                                                                \
    1017             :     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    1018             :     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    1019             :     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    1020             :     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
    1021             :                                                                                \
    1022             :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    1023             :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    1024             :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    1025             :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
    1026             :                                                                                \
    1027             :     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    1028             :     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
    1029             :                                                                                \
    1030             :     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    1031             :     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    1032             :     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    1033             :     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
    1034             :                                                                                \
    1035             :     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    1036             :     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    1037             :     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    1038             :     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
    1039             :   }                                                                            \
    1040             :                                                                                \
    1041             :   /* Stage6 */                                                                 \
    1042             :   {                                                                            \
    1043             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    1044             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    1045             :     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    1046             :     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
    1047             :                                                                                \
    1048             :     stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    1049             :     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    1050             :     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    1051             :     stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    1052             :     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    1053             :     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    1054             :     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    1055             :     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
    1056             :                                                                                \
    1057             :     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
    1058             :                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
    1059             :                            stp2_12)                                            \
    1060             :   }
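
/* Editor's note: each MULTIPLICATION_AND_ADD step in IDCT16 is a pair of
 * butterflies built from _mm_unpack*_epi16 + _mm_madd_epi16: with
 * lo = unpacklo(a, b) and k = pair_set_epi16(c0, c1), every 32-bit lane of
 * _mm_madd_epi16(lo, k) holds a * c0 + b * c1. A scalar sketch of one such
 * butterfly (names illustrative, rounding assumes DCT_CONST_BITS == 14;
 * this is not the exact helper): */
static void butterfly_sketch(int16_t a, int16_t b, int c0, int c1,
                             int16_t *out0, int16_t *out1) {
  /* e.g. stg2_0 = pair(cospi_30_64, -cospi_2_64) and
   * stg2_1 = pair(cospi_2_64, cospi_30_64) give the rotation below. */
  *out0 = (int16_t)((a * c0 - b * c1 + (1 << 13)) >> 14);
  *out1 = (int16_t)((a * c1 + b * c0 + (1 << 13)) >> 14);
}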
    1061             : 
    1062             : #define IDCT16_10                                                              \
    1063             :   /* Stage2 */                                                                 \
    1064             :   {                                                                            \
    1065             :     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
    1066             :     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
    1067             :     const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    1068             :     const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
    1069             :                                                                                \
    1070             :     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
    1071             :                            stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
    1072             :                            stp1_12_0)                                          \
    1073             :   }                                                                            \
    1074             :                                                                                \
    1075             :   /* Stage3 */                                                                 \
    1076             :   {                                                                            \
    1077             :     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
    1078             :     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
    1079             :                                                                                \
    1080             :     MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
    1081             :                                                                                \
    1082             :     stp1_9 = stp1_8_0;                                                         \
    1083             :     stp1_10 = stp1_11;                                                         \
    1084             :                                                                                \
    1085             :     stp1_13 = stp1_12_0;                                                       \
    1086             :     stp1_14 = stp1_15;                                                         \
    1087             :   }                                                                            \
    1088             :                                                                                \
    1089             :   /* Stage4 */                                                                 \
    1090             :   {                                                                            \
    1091             :     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
    1092             :     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
    1093             :                                                                                \
    1094             :     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    1095             :     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    1096             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    1097             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    1098             :                                                                                \
    1099             :     MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
    1100             :     stp2_5 = stp2_4;                                                           \
    1101             :     stp2_6 = stp2_7;                                                           \
    1102             :                                                                                \
    1103             :     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
    1104             :                            stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
    1105             :                            stp2_13)                                            \
    1106             :   }                                                                            \
    1107             :                                                                                \
    1108             :   /* Stage5 */                                                                 \
    1109             :   {                                                                            \
    1110             :     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    1111             :     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
    1112             :                                                                                \
    1113             :     stp1_2 = stp1_1;                                                           \
    1114             :     stp1_3 = stp1_0;                                                           \
    1115             :                                                                                \
    1116             :     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    1117             :     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    1118             :     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    1119             :     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
    1120             :                                                                                \
    1121             :     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    1122             :     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    1123             :     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    1124             :     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
    1125             :                                                                                \
    1126             :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    1127             :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    1128             :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    1129             :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
    1130             :                                                                                \
    1131             :     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    1132             :     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
    1133             :                                                                                \
    1134             :     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    1135             :     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    1136             :     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    1137             :     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
    1138             :                                                                                \
    1139             :     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    1140             :     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    1141             :     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    1142             :     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
    1143             :   }                                                                            \
    1144             :                                                                                \
    1145             :   /* Stage6 */                                                                 \
    1146             :   {                                                                            \
    1147             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    1148             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    1149             :     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    1150             :     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
    1151             :                                                                                \
    1152             :     stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    1153             :     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    1154             :     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    1155             :     stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    1156             :     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    1157             :     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    1158             :     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    1159             :     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
    1160             :                                                                                \
    1161             :     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
    1162             :                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
    1163             :                            stp2_12)                                            \
    1164             :   }
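
/* Editor's note: IDCT16_10 only ever reads in[0..3]; every other input is the
 * zero register, so the unpack-against-zero pattern above collapses a full
 * butterfly into one multiply per lane: madd(unpacklo(a, zero), pair(c0, c1))
 * is just a * c0, and unpacklo(zero, b) keeps only b * c1. A scalar sketch,
 * assuming DCT_CONST_BITS == 14: */
static int16_t half_butterfly_sketch(int16_t a, int c) {
  return (int16_t)((a * c + (1 << 13)) >> 14);  /* zero operand drops one term */
}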
    1165             : 
    1166           0 : void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
    1167             :                                 int stride) {
    1168           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1169           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    1170           0 :   const __m128i zero = _mm_setzero_si128();
    1171             : 
    1172           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    1173           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    1174           0 :   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
    1175           0 :   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
    1176           0 :   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
    1177           0 :   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
    1178           0 :   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    1179           0 :   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    1180             : 
    1181           0 :   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    1182           0 :   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    1183           0 :   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    1184           0 :   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
    1185             : 
    1186           0 :   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    1187           0 :   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1188           0 :   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    1189           0 :   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
    1190           0 :   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    1191           0 :   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    1192           0 :   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    1193           0 :   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    1194             : 
    1195           0 :   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1196             : 
    1197             :   __m128i in[16], l[16], r[16], *curr1;
    1198             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
    1199             :       stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
    1200             :       stp1_8_0, stp1_12_0;
    1201             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
    1202             :       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
    1203             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    1204             :   int i;
    1205             : 
    1206           0 :   curr1 = l;
    1207           0 :   for (i = 0; i < 2; i++) {
    1208             :     // 1-D idct
    1209             : 
     1210             :     // Load input data: columns 0-7 of the 8 rows go to in[0..7], columns 8-15 to in[8..15].
    1211           0 :     in[0] = load_input_data(input);
    1212           0 :     in[8] = load_input_data(input + 8 * 1);
    1213           0 :     in[1] = load_input_data(input + 8 * 2);
    1214           0 :     in[9] = load_input_data(input + 8 * 3);
    1215           0 :     in[2] = load_input_data(input + 8 * 4);
    1216           0 :     in[10] = load_input_data(input + 8 * 5);
    1217           0 :     in[3] = load_input_data(input + 8 * 6);
    1218           0 :     in[11] = load_input_data(input + 8 * 7);
    1219           0 :     in[4] = load_input_data(input + 8 * 8);
    1220           0 :     in[12] = load_input_data(input + 8 * 9);
    1221           0 :     in[5] = load_input_data(input + 8 * 10);
    1222           0 :     in[13] = load_input_data(input + 8 * 11);
    1223           0 :     in[6] = load_input_data(input + 8 * 12);
    1224           0 :     in[14] = load_input_data(input + 8 * 13);
    1225           0 :     in[7] = load_input_data(input + 8 * 14);
    1226           0 :     in[15] = load_input_data(input + 8 * 15);
    1227             : 
    1228           0 :     array_transpose_8x8(in, in);
    1229           0 :     array_transpose_8x8(in + 8, in + 8);
    1230             : 
    1231           0 :     IDCT16
    1232             : 
    1233             :     // Stage7
    1234           0 :     curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    1235           0 :     curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    1236           0 :     curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    1237           0 :     curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    1238           0 :     curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    1239           0 :     curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    1240           0 :     curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    1241           0 :     curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    1242           0 :     curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    1243           0 :     curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    1244           0 :     curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    1245           0 :     curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    1246           0 :     curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    1247           0 :     curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    1248           0 :     curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    1249           0 :     curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
    1250             : 
    1251           0 :     curr1 = r;
    1252           0 :     input += 128;
    1253             :   }
    1254           0 :   for (i = 0; i < 2; i++) {
    1255             :     int j;
    1256             :     // 1-D idct
    1257           0 :     array_transpose_8x8(l + i * 8, in);
    1258           0 :     array_transpose_8x8(r + i * 8, in + 8);
    1259             : 
    1260           0 :     IDCT16
    1261             : 
     1262             :     // 2-D: Stage7 of the column pass produces the final output.
    1263           0 :     in[0] = _mm_add_epi16(stp2_0, stp1_15);
    1264           0 :     in[1] = _mm_add_epi16(stp2_1, stp1_14);
    1265           0 :     in[2] = _mm_add_epi16(stp2_2, stp2_13);
    1266           0 :     in[3] = _mm_add_epi16(stp2_3, stp2_12);
    1267           0 :     in[4] = _mm_add_epi16(stp2_4, stp2_11);
    1268           0 :     in[5] = _mm_add_epi16(stp2_5, stp2_10);
    1269           0 :     in[6] = _mm_add_epi16(stp2_6, stp1_9);
    1270           0 :     in[7] = _mm_add_epi16(stp2_7, stp1_8);
    1271           0 :     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    1272           0 :     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    1273           0 :     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    1274           0 :     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    1275           0 :     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    1276           0 :     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    1277           0 :     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    1278           0 :     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
    1279             : 
    1280           0 :     for (j = 0; j < 16; ++j) {
    1281             :       // Final rounding and shift
    1282           0 :       in[j] = _mm_adds_epi16(in[j], final_rounding);
    1283           0 :       in[j] = _mm_srai_epi16(in[j], 6);
    1284           0 :       RECON_AND_STORE(dest + j * stride, in[j]);
    1285             :     }
    1286             : 
    1287           0 :     dest += 8;
    1288             :   }
    1289           0 : }
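
/*
 * Editor's summary of the 2-D flow above: the first loop runs the 1-D IDCT16
 * over the rows, covering the top and bottom 8 rows in two iterations
 * (input += 128 advances 8 rows of 16 coefficients) and keeping the
 * intermediate results in l[] and r[]. The second loop transposes those
 * intermediates back and runs the same 1-D transform over the columns, one
 * 8-column half per iteration (dest += 8), finishing with
 * ROUND_POWER_OF_TWO(x, 6) (add 1 << 5, then shift right by 6) before
 * RECON_AND_STORE adds the residual to the prediction.
 */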
    1290             : 
    1291           0 : void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
    1292             :                               int stride) {
    1293             :   __m128i dc_value;
    1294           0 :   const __m128i zero = _mm_setzero_si128();
    1295             :   int a, i;
    1296             : 
    1297           0 :   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
    1298           0 :   a = (int)dct_const_round_shift(a * cospi_16_64);
    1299           0 :   a = ROUND_POWER_OF_TWO(a, 6);
    1300             : 
    1301           0 :   dc_value = _mm_set1_epi16(a);
    1302             : 
    1303           0 :   for (i = 0; i < 16; ++i) {
    1304           0 :     RECON_AND_STORE(dest + 0, dc_value);
    1305           0 :     RECON_AND_STORE(dest + 8, dc_value);
    1306           0 :     dest += stride;
    1307             :   }
    1308           0 : }
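
/* Editor's note: scalar equivalent of the DC-only path above, as a sketch
 * assuming cospi_16_64 == 11585 and DCT_CONST_BITS == 14 (see
 * vpx_dsp/txfm_common.h). The DC coefficient is scaled by cospi_16_64 once
 * for the row pass and once for the column pass, rounded with
 * ROUND_POWER_OF_TWO(a, 6), and added to every predicted pixel. */
static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int stride) {
  int a = (input[0] * 11585 + (1 << 13)) >> 14;  /* dct_const_round_shift */
  int r, c;
  a = (a * 11585 + (1 << 13)) >> 14;             /* second pass scaling */
  a = (a + (1 << 5)) >> 6;                       /* ROUND_POWER_OF_TWO(a, 6) */
  for (r = 0; r < 16; ++r, dest += stride) {
    for (c = 0; c < 16; ++c) {
      const int v = dest[c] + a;
      dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* saturate */
    }
  }
}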
    1309             : 
    1310           0 : static void iadst16_8col(__m128i *in) {
    1311             :   // perform 16x16 1-D ADST for 8 columns
    1312             :   __m128i s[16], x[16], u[32], v[32];
    1313           0 :   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
    1314           0 :   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    1315           0 :   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
    1316           0 :   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    1317           0 :   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
    1318           0 :   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    1319           0 :   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
    1320           0 :   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    1321           0 :   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
    1322           0 :   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    1323           0 :   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    1324           0 :   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    1325           0 :   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
    1326           0 :   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    1327           0 :   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
    1328           0 :   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    1329           0 :   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
    1330           0 :   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    1331           0 :   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
    1332           0 :   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    1333           0 :   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
    1334           0 :   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
    1335           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
    1336           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    1337           0 :   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
    1338           0 :   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
    1339           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
    1340           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1341           0 :   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1342           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1343           0 :   const __m128i kZero = _mm_set1_epi16(0);
    1344             : 
    1345           0 :   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
    1346           0 :   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
    1347           0 :   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
    1348           0 :   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
    1349           0 :   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
    1350           0 :   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
    1351           0 :   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
    1352           0 :   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
    1353           0 :   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
    1354           0 :   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
    1355           0 :   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
    1356           0 :   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
    1357           0 :   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
    1358           0 :   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
    1359           0 :   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
    1360           0 :   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
    1361             : 
    1362           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
    1363           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
    1364           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
    1365           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
    1366           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
    1367           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
    1368           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
    1369           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
    1370           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
    1371           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
    1372           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
    1373           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
    1374           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
    1375           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
    1376           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
    1377           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
    1378           0 :   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
    1379           0 :   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
    1380           0 :   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
    1381           0 :   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
    1382           0 :   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
    1383           0 :   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
    1384           0 :   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
    1385           0 :   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
    1386           0 :   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
    1387           0 :   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
    1388           0 :   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
    1389           0 :   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
    1390           0 :   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
    1391           0 :   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
    1392           0 :   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
    1393           0 :   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
    1394             : 
    1395           0 :   u[0] = _mm_add_epi32(v[0], v[16]);
    1396           0 :   u[1] = _mm_add_epi32(v[1], v[17]);
    1397           0 :   u[2] = _mm_add_epi32(v[2], v[18]);
    1398           0 :   u[3] = _mm_add_epi32(v[3], v[19]);
    1399           0 :   u[4] = _mm_add_epi32(v[4], v[20]);
    1400           0 :   u[5] = _mm_add_epi32(v[5], v[21]);
    1401           0 :   u[6] = _mm_add_epi32(v[6], v[22]);
    1402           0 :   u[7] = _mm_add_epi32(v[7], v[23]);
    1403           0 :   u[8] = _mm_add_epi32(v[8], v[24]);
    1404           0 :   u[9] = _mm_add_epi32(v[9], v[25]);
    1405           0 :   u[10] = _mm_add_epi32(v[10], v[26]);
    1406           0 :   u[11] = _mm_add_epi32(v[11], v[27]);
    1407           0 :   u[12] = _mm_add_epi32(v[12], v[28]);
    1408           0 :   u[13] = _mm_add_epi32(v[13], v[29]);
    1409           0 :   u[14] = _mm_add_epi32(v[14], v[30]);
    1410           0 :   u[15] = _mm_add_epi32(v[15], v[31]);
    1411           0 :   u[16] = _mm_sub_epi32(v[0], v[16]);
    1412           0 :   u[17] = _mm_sub_epi32(v[1], v[17]);
    1413           0 :   u[18] = _mm_sub_epi32(v[2], v[18]);
    1414           0 :   u[19] = _mm_sub_epi32(v[3], v[19]);
    1415           0 :   u[20] = _mm_sub_epi32(v[4], v[20]);
    1416           0 :   u[21] = _mm_sub_epi32(v[5], v[21]);
    1417           0 :   u[22] = _mm_sub_epi32(v[6], v[22]);
    1418           0 :   u[23] = _mm_sub_epi32(v[7], v[23]);
    1419           0 :   u[24] = _mm_sub_epi32(v[8], v[24]);
    1420           0 :   u[25] = _mm_sub_epi32(v[9], v[25]);
    1421           0 :   u[26] = _mm_sub_epi32(v[10], v[26]);
    1422           0 :   u[27] = _mm_sub_epi32(v[11], v[27]);
    1423           0 :   u[28] = _mm_sub_epi32(v[12], v[28]);
    1424           0 :   u[29] = _mm_sub_epi32(v[13], v[29]);
    1425           0 :   u[30] = _mm_sub_epi32(v[14], v[30]);
    1426           0 :   u[31] = _mm_sub_epi32(v[15], v[31]);
    1427             : 
    1428           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1429           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1430           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1431           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1432           0 :   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1433           0 :   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1434           0 :   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1435           0 :   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1436           0 :   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    1437           0 :   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    1438           0 :   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    1439           0 :   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    1440           0 :   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    1441           0 :   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    1442           0 :   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    1443           0 :   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    1444           0 :   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
    1445           0 :   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
    1446           0 :   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
    1447           0 :   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
    1448           0 :   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
    1449           0 :   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
    1450           0 :   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
    1451           0 :   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
    1452           0 :   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
    1453           0 :   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
    1454           0 :   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
    1455           0 :   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
    1456           0 :   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
    1457           0 :   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
    1458           0 :   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
    1459           0 :   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
    1460             : 
    1461           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
    1462           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
    1463           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
    1464           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
    1465           0 :   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
    1466           0 :   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
    1467           0 :   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
    1468           0 :   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
    1469           0 :   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    1470           0 :   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    1471           0 :   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    1472           0 :   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    1473           0 :   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    1474           0 :   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    1475           0 :   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    1476           0 :   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    1477           0 :   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
    1478           0 :   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
    1479           0 :   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
    1480           0 :   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
    1481           0 :   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
    1482           0 :   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
    1483           0 :   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
    1484           0 :   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
    1485           0 :   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
    1486           0 :   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
    1487           0 :   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
    1488           0 :   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
    1489           0 :   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
    1490           0 :   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
    1491           0 :   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
    1492           0 :   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
    1493             : 
    1494           0 :   s[0] = _mm_packs_epi32(u[0], u[1]);
    1495           0 :   s[1] = _mm_packs_epi32(u[2], u[3]);
    1496           0 :   s[2] = _mm_packs_epi32(u[4], u[5]);
    1497           0 :   s[3] = _mm_packs_epi32(u[6], u[7]);
    1498           0 :   s[4] = _mm_packs_epi32(u[8], u[9]);
    1499           0 :   s[5] = _mm_packs_epi32(u[10], u[11]);
    1500           0 :   s[6] = _mm_packs_epi32(u[12], u[13]);
    1501           0 :   s[7] = _mm_packs_epi32(u[14], u[15]);
    1502           0 :   s[8] = _mm_packs_epi32(u[16], u[17]);
    1503           0 :   s[9] = _mm_packs_epi32(u[18], u[19]);
    1504           0 :   s[10] = _mm_packs_epi32(u[20], u[21]);
    1505           0 :   s[11] = _mm_packs_epi32(u[22], u[23]);
    1506           0 :   s[12] = _mm_packs_epi32(u[24], u[25]);
    1507           0 :   s[13] = _mm_packs_epi32(u[26], u[27]);
    1508           0 :   s[14] = _mm_packs_epi32(u[28], u[29]);
    1509           0 :   s[15] = _mm_packs_epi32(u[30], u[31]);
    1510             : 
    1511             :   // stage 2
    1512           0 :   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
    1513           0 :   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
    1514           0 :   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
    1515           0 :   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
    1516           0 :   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
    1517           0 :   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
    1518           0 :   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    1519           0 :   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
    1520             : 
    1521           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
    1522           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
    1523           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
    1524           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
    1525           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
    1526           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
    1527           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
    1528           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
    1529           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
    1530           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
    1531           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
    1532           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
    1533           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
    1534           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
    1535           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
    1536           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
    1537             : 
    1538           0 :   u[0] = _mm_add_epi32(v[0], v[8]);
    1539           0 :   u[1] = _mm_add_epi32(v[1], v[9]);
    1540           0 :   u[2] = _mm_add_epi32(v[2], v[10]);
    1541           0 :   u[3] = _mm_add_epi32(v[3], v[11]);
    1542           0 :   u[4] = _mm_add_epi32(v[4], v[12]);
    1543           0 :   u[5] = _mm_add_epi32(v[5], v[13]);
    1544           0 :   u[6] = _mm_add_epi32(v[6], v[14]);
    1545           0 :   u[7] = _mm_add_epi32(v[7], v[15]);
    1546           0 :   u[8] = _mm_sub_epi32(v[0], v[8]);
    1547           0 :   u[9] = _mm_sub_epi32(v[1], v[9]);
    1548           0 :   u[10] = _mm_sub_epi32(v[2], v[10]);
    1549           0 :   u[11] = _mm_sub_epi32(v[3], v[11]);
    1550           0 :   u[12] = _mm_sub_epi32(v[4], v[12]);
    1551           0 :   u[13] = _mm_sub_epi32(v[5], v[13]);
    1552           0 :   u[14] = _mm_sub_epi32(v[6], v[14]);
    1553           0 :   u[15] = _mm_sub_epi32(v[7], v[15]);
    1554             : 
    1555           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1556           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1557           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1558           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1559           0 :   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1560           0 :   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1561           0 :   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1562           0 :   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1563           0 :   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    1564           0 :   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    1565           0 :   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    1566           0 :   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    1567           0 :   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    1568           0 :   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    1569           0 :   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    1570           0 :   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    1571             : 
    1572           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
    1573           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
    1574           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
    1575           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
    1576           0 :   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
    1577           0 :   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
    1578           0 :   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
    1579           0 :   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
    1580           0 :   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    1581           0 :   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    1582           0 :   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    1583           0 :   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    1584           0 :   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    1585           0 :   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    1586           0 :   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    1587           0 :   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    1588             : 
    1589           0 :   x[0] = _mm_add_epi16(s[0], s[4]);
    1590           0 :   x[1] = _mm_add_epi16(s[1], s[5]);
    1591           0 :   x[2] = _mm_add_epi16(s[2], s[6]);
    1592           0 :   x[3] = _mm_add_epi16(s[3], s[7]);
    1593           0 :   x[4] = _mm_sub_epi16(s[0], s[4]);
    1594           0 :   x[5] = _mm_sub_epi16(s[1], s[5]);
    1595           0 :   x[6] = _mm_sub_epi16(s[2], s[6]);
    1596           0 :   x[7] = _mm_sub_epi16(s[3], s[7]);
    1597           0 :   x[8] = _mm_packs_epi32(u[0], u[1]);
    1598           0 :   x[9] = _mm_packs_epi32(u[2], u[3]);
    1599           0 :   x[10] = _mm_packs_epi32(u[4], u[5]);
    1600           0 :   x[11] = _mm_packs_epi32(u[6], u[7]);
    1601           0 :   x[12] = _mm_packs_epi32(u[8], u[9]);
    1602           0 :   x[13] = _mm_packs_epi32(u[10], u[11]);
    1603           0 :   x[14] = _mm_packs_epi32(u[12], u[13]);
    1604           0 :   x[15] = _mm_packs_epi32(u[14], u[15]);
    1605             : 
    1606             :   // stage 3
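                     :   // Keep x[0..3] and x[8..11] for the add/sub combinations below; rotate
                     :   // (x[4],x[5]), (x[6],x[7]), (x[12],x[13]) and (x[14],x[15]) by
                     :   // +/-(cospi_8_64, cospi_24_64).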
    1607           0 :   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
    1608           0 :   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
    1609           0 :   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
    1610           0 :   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
    1611           0 :   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
    1612           0 :   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
    1613           0 :   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
    1614           0 :   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
    1615             : 
    1616           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
    1617           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
    1618           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
    1619           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
    1620           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
    1621           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
    1622           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
    1623           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
    1624           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
    1625           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
    1626           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
    1627           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
    1628           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
    1629           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
    1630           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
    1631           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
    1632             : 
    1633           0 :   u[0] = _mm_add_epi32(v[0], v[4]);
    1634           0 :   u[1] = _mm_add_epi32(v[1], v[5]);
    1635           0 :   u[2] = _mm_add_epi32(v[2], v[6]);
    1636           0 :   u[3] = _mm_add_epi32(v[3], v[7]);
    1637           0 :   u[4] = _mm_sub_epi32(v[0], v[4]);
    1638           0 :   u[5] = _mm_sub_epi32(v[1], v[5]);
    1639           0 :   u[6] = _mm_sub_epi32(v[2], v[6]);
    1640           0 :   u[7] = _mm_sub_epi32(v[3], v[7]);
    1641           0 :   u[8] = _mm_add_epi32(v[8], v[12]);
    1642           0 :   u[9] = _mm_add_epi32(v[9], v[13]);
    1643           0 :   u[10] = _mm_add_epi32(v[10], v[14]);
    1644           0 :   u[11] = _mm_add_epi32(v[11], v[15]);
    1645           0 :   u[12] = _mm_sub_epi32(v[8], v[12]);
    1646           0 :   u[13] = _mm_sub_epi32(v[9], v[13]);
    1647           0 :   u[14] = _mm_sub_epi32(v[10], v[14]);
    1648           0 :   u[15] = _mm_sub_epi32(v[11], v[15]);
    1649             : 
    1650           0 :   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1651           0 :   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1652           0 :   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1653           0 :   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1654           0 :   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1655           0 :   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1656           0 :   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1657           0 :   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1658           0 :   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    1659           0 :   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    1660           0 :   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    1661           0 :   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    1662           0 :   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    1663           0 :   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    1664           0 :   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    1665           0 :   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    1666             : 
    1667           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1668           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1669           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1670           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1671           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1672           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1673           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1674           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1675           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1676           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1677           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1678           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1679           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1680           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1681           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1682           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1683             : 
    1684           0 :   s[0] = _mm_add_epi16(x[0], x[2]);
    1685           0 :   s[1] = _mm_add_epi16(x[1], x[3]);
    1686           0 :   s[2] = _mm_sub_epi16(x[0], x[2]);
    1687           0 :   s[3] = _mm_sub_epi16(x[1], x[3]);
    1688           0 :   s[4] = _mm_packs_epi32(v[0], v[1]);
    1689           0 :   s[5] = _mm_packs_epi32(v[2], v[3]);
    1690           0 :   s[6] = _mm_packs_epi32(v[4], v[5]);
    1691           0 :   s[7] = _mm_packs_epi32(v[6], v[7]);
    1692           0 :   s[8] = _mm_add_epi16(x[8], x[10]);
    1693           0 :   s[9] = _mm_add_epi16(x[9], x[11]);
    1694           0 :   s[10] = _mm_sub_epi16(x[8], x[10]);
    1695           0 :   s[11] = _mm_sub_epi16(x[9], x[11]);
    1696           0 :   s[12] = _mm_packs_epi32(v[8], v[9]);
    1697           0 :   s[13] = _mm_packs_epi32(v[10], v[11]);
    1698           0 :   s[14] = _mm_packs_epi32(v[12], v[13]);
    1699           0 :   s[15] = _mm_packs_epi32(v[14], v[15]);
    1700             : 
    1701             :   // stage 4
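                     :   // Last rotation stage: (s[2],s[3]), (s[6],s[7]), (s[10],s[11]) and
                     :   // (s[14],s[15]) are rotated by +/-cospi_16_64 before the outputs are
                     :   // written out below.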
    1702           0 :   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
    1703           0 :   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
    1704           0 :   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
    1705           0 :   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
    1706           0 :   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
    1707           0 :   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
    1708           0 :   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    1709           0 :   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
    1710             : 
    1711           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
    1712           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
    1713           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
    1714           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
    1715           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    1716           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
    1717           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    1718           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    1719           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
    1720           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
    1721           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
    1722           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
    1723           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
    1724           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
    1725           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
    1726           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
    1727             : 
    1728           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1729           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1730           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1731           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1732           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1733           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1734           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1735           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1736           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    1737           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    1738           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    1739           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    1740           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    1741           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    1742           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    1743           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    1744             : 
    1745           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1746           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1747           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1748           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1749           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1750           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1751           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1752           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1753           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1754           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1755           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1756           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1757           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1758           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1759           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1760           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1761             : 
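                     :   // Write the ADST outputs in permuted order; in[1], in[3], in[13] and
                     :   // in[15] are negated by subtracting from kZero.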
    1762           0 :   in[0] = s[0];
    1763           0 :   in[1] = _mm_sub_epi16(kZero, s[8]);
    1764           0 :   in[2] = s[12];
    1765           0 :   in[3] = _mm_sub_epi16(kZero, s[4]);
    1766           0 :   in[4] = _mm_packs_epi32(v[4], v[5]);
    1767           0 :   in[5] = _mm_packs_epi32(v[12], v[13]);
    1768           0 :   in[6] = _mm_packs_epi32(v[8], v[9]);
    1769           0 :   in[7] = _mm_packs_epi32(v[0], v[1]);
    1770           0 :   in[8] = _mm_packs_epi32(v[2], v[3]);
    1771           0 :   in[9] = _mm_packs_epi32(v[10], v[11]);
    1772           0 :   in[10] = _mm_packs_epi32(v[14], v[15]);
    1773           0 :   in[11] = _mm_packs_epi32(v[6], v[7]);
    1774           0 :   in[12] = s[5];
    1775           0 :   in[13] = _mm_sub_epi16(kZero, s[13]);
    1776           0 :   in[14] = s[9];
    1777           0 :   in[15] = _mm_sub_epi16(kZero, s[1]);
    1778           0 : }
    1779             : 
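                     : /* Every rotation in these transforms follows the same fixed-point
                     :  * pattern: two 16-bit values x0 and x1 are interleaved with
                     :  * _mm_unpacklo/hi_epi16, multiplied against a pair_set_epi16(c0, c1)
                     :  * constant with _mm_madd_epi16 (x0 * c0 + x1 * c1 per 32-bit lane), then
                     :  * rounded with DCT_CONST_ROUNDING, shifted right by DCT_CONST_BITS and
                     :  * repacked with saturation by _mm_packs_epi32.  A rough scalar sketch,
                     :  * using a hypothetical helper name for illustration only:
                     :  *
                     :  *   static int16_t rotate_pair(int16_t x0, int16_t x1, int c0, int c1) {
                     :  *     return (int16_t)dct_const_round_shift(x0 * c0 + x1 * c1);
                     :  *   }
                     :  *
                     :  * Note the plain cast above does not saturate, while _mm_packs_epi32
                     :  * does.
                     :  */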
    1780           0 : static void idct16_8col(__m128i *in) {
    1781           0 :   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    1782           0 :   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
    1783           0 :   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
    1784           0 :   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
    1785           0 :   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
    1786           0 :   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
    1787           0 :   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    1788           0 :   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
    1789           0 :   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    1790           0 :   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
    1791           0 :   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    1792           0 :   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
    1793           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
    1794           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1795           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    1796           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
    1797           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    1798           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
    1799           0 :   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    1800           0 :   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1801           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1802             :   __m128i v[16], u[16], s[16], t[16];
    1803             : 
    1804             :   // stage 1
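                     :   // Stage 1 only gathers the inputs in bit-reversed order:
                     :   // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.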
    1805           0 :   s[0] = in[0];
    1806           0 :   s[1] = in[8];
    1807           0 :   s[2] = in[4];
    1808           0 :   s[3] = in[12];
    1809           0 :   s[4] = in[2];
    1810           0 :   s[5] = in[10];
    1811           0 :   s[6] = in[6];
    1812           0 :   s[7] = in[14];
    1813           0 :   s[8] = in[1];
    1814           0 :   s[9] = in[9];
    1815           0 :   s[10] = in[5];
    1816           0 :   s[11] = in[13];
    1817           0 :   s[12] = in[3];
    1818           0 :   s[13] = in[11];
    1819           0 :   s[14] = in[7];
    1820           0 :   s[15] = in[15];
    1821             : 
    1822             :   // stage 2
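                     :   // Rotate the odd-indexed half: (s8,s15), (s9,s14), (s10,s13) and
                     :   // (s11,s12) by (cospi_30_64, cospi_2_64), (cospi_14_64, cospi_18_64),
                     :   // (cospi_22_64, cospi_10_64) and (cospi_6_64, cospi_26_64) respectively.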
    1823           0 :   u[0] = _mm_unpacklo_epi16(s[8], s[15]);
    1824           0 :   u[1] = _mm_unpackhi_epi16(s[8], s[15]);
    1825           0 :   u[2] = _mm_unpacklo_epi16(s[9], s[14]);
    1826           0 :   u[3] = _mm_unpackhi_epi16(s[9], s[14]);
    1827           0 :   u[4] = _mm_unpacklo_epi16(s[10], s[13]);
    1828           0 :   u[5] = _mm_unpackhi_epi16(s[10], s[13]);
    1829           0 :   u[6] = _mm_unpacklo_epi16(s[11], s[12]);
    1830           0 :   u[7] = _mm_unpackhi_epi16(s[11], s[12]);
    1831             : 
    1832           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
    1833           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
    1834           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
    1835           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
    1836           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
    1837           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
    1838           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
    1839           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
    1840           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
    1841           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
    1842           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
    1843           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
    1844           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
    1845           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
    1846           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
    1847           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
    1848             : 
    1849           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1850           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1851           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1852           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1853           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1854           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1855           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1856           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1857           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    1858           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    1859           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    1860           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    1861           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    1862           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    1863           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    1864           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    1865             : 
    1866           0 :   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1867           0 :   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1868           0 :   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1869           0 :   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1870           0 :   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1871           0 :   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1872           0 :   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1873           0 :   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1874           0 :   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1875           0 :   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1876           0 :   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1877           0 :   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1878           0 :   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1879           0 :   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1880           0 :   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1881           0 :   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1882             : 
    1883           0 :   s[8] = _mm_packs_epi32(u[0], u[1]);
    1884           0 :   s[15] = _mm_packs_epi32(u[2], u[3]);
    1885           0 :   s[9] = _mm_packs_epi32(u[4], u[5]);
    1886           0 :   s[14] = _mm_packs_epi32(u[6], u[7]);
    1887           0 :   s[10] = _mm_packs_epi32(u[8], u[9]);
    1888           0 :   s[13] = _mm_packs_epi32(u[10], u[11]);
    1889           0 :   s[11] = _mm_packs_epi32(u[12], u[13]);
    1890           0 :   s[12] = _mm_packs_epi32(u[14], u[15]);
    1891             : 
    1892             :   // stage 3
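                     :   // s[0..3] pass through; (s4,s7) is rotated by (cospi_28_64, cospi_4_64),
                     :   // (s5,s6) by (cospi_12_64, cospi_20_64), and the odd half s[8..15] is
                     :   // combined with add/sub butterflies.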
    1893           0 :   t[0] = s[0];
    1894           0 :   t[1] = s[1];
    1895           0 :   t[2] = s[2];
    1896           0 :   t[3] = s[3];
    1897           0 :   u[0] = _mm_unpacklo_epi16(s[4], s[7]);
    1898           0 :   u[1] = _mm_unpackhi_epi16(s[4], s[7]);
    1899           0 :   u[2] = _mm_unpacklo_epi16(s[5], s[6]);
    1900           0 :   u[3] = _mm_unpackhi_epi16(s[5], s[6]);
    1901             : 
    1902           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
    1903           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
    1904           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
    1905           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
    1906           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
    1907           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
    1908           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
    1909           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
    1910             : 
    1911           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1912           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1913           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1914           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1915           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1916           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1917           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1918           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1919             : 
    1920           0 :   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1921           0 :   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1922           0 :   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1923           0 :   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1924           0 :   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1925           0 :   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1926           0 :   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1927           0 :   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1928             : 
    1929           0 :   t[4] = _mm_packs_epi32(u[0], u[1]);
    1930           0 :   t[7] = _mm_packs_epi32(u[2], u[3]);
    1931           0 :   t[5] = _mm_packs_epi32(u[4], u[5]);
    1932           0 :   t[6] = _mm_packs_epi32(u[6], u[7]);
    1933           0 :   t[8] = _mm_add_epi16(s[8], s[9]);
    1934           0 :   t[9] = _mm_sub_epi16(s[8], s[9]);
    1935           0 :   t[10] = _mm_sub_epi16(s[11], s[10]);
    1936           0 :   t[11] = _mm_add_epi16(s[10], s[11]);
    1937           0 :   t[12] = _mm_add_epi16(s[12], s[13]);
    1938           0 :   t[13] = _mm_sub_epi16(s[12], s[13]);
    1939           0 :   t[14] = _mm_sub_epi16(s[15], s[14]);
    1940           0 :   t[15] = _mm_add_epi16(s[14], s[15]);
    1941             : 
    1942             :   // stage 4
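                     :   // Even half: (t0,t1) is rotated by cospi_16_64, (t2,t3) by
                     :   // (cospi_24_64, cospi_8_64) and t4..t7 are combined with add/sub
                     :   // butterflies.  Odd half: (t9,t14) and (t10,t13) are rotated by
                     :   // +/-(cospi_8_64, cospi_24_64) while t8, t11, t12 and t15 pass through.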
    1943           0 :   u[0] = _mm_unpacklo_epi16(t[0], t[1]);
    1944           0 :   u[1] = _mm_unpackhi_epi16(t[0], t[1]);
    1945           0 :   u[2] = _mm_unpacklo_epi16(t[2], t[3]);
    1946           0 :   u[3] = _mm_unpackhi_epi16(t[2], t[3]);
    1947           0 :   u[4] = _mm_unpacklo_epi16(t[9], t[14]);
    1948           0 :   u[5] = _mm_unpackhi_epi16(t[9], t[14]);
    1949           0 :   u[6] = _mm_unpacklo_epi16(t[10], t[13]);
    1950           0 :   u[7] = _mm_unpackhi_epi16(t[10], t[13]);
    1951             : 
    1952           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    1953           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    1954           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
    1955           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
    1956           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
    1957           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
    1958           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
    1959           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
    1960           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
    1961           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
    1962           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
    1963           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
    1964           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
    1965           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
    1966           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
    1967           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
    1968             : 
    1969           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1970           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1971           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1972           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1973           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1974           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1975           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1976           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1977           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    1978           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    1979           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    1980           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    1981           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    1982           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    1983           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    1984           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    1985             : 
    1986           0 :   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1987           0 :   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1988           0 :   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1989           0 :   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1990           0 :   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1991           0 :   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1992           0 :   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1993           0 :   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1994           0 :   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1995           0 :   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1996           0 :   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1997           0 :   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1998           0 :   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1999           0 :   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    2000           0 :   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    2001           0 :   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    2002             : 
    2003           0 :   s[0] = _mm_packs_epi32(u[0], u[1]);
    2004           0 :   s[1] = _mm_packs_epi32(u[2], u[3]);
    2005           0 :   s[2] = _mm_packs_epi32(u[4], u[5]);
    2006           0 :   s[3] = _mm_packs_epi32(u[6], u[7]);
    2007           0 :   s[4] = _mm_add_epi16(t[4], t[5]);
    2008           0 :   s[5] = _mm_sub_epi16(t[4], t[5]);
    2009           0 :   s[6] = _mm_sub_epi16(t[7], t[6]);
    2010           0 :   s[7] = _mm_add_epi16(t[6], t[7]);
    2011           0 :   s[8] = t[8];
    2012           0 :   s[15] = t[15];
    2013           0 :   s[9] = _mm_packs_epi32(u[8], u[9]);
    2014           0 :   s[14] = _mm_packs_epi32(u[10], u[11]);
    2015           0 :   s[10] = _mm_packs_epi32(u[12], u[13]);
    2016           0 :   s[13] = _mm_packs_epi32(u[14], u[15]);
    2017           0 :   s[11] = t[11];
    2018           0 :   s[12] = t[12];
    2019             : 
    2020             :   // stage 5
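                     :   // Finish the even part with butterflies on s[0..3], rotate (s5,s6) by
                     :   // cospi_16_64 to form t5/t6, and combine the odd half s[8..15] with
                     :   // add/sub butterflies; t4 and t7 pass through.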
    2021           0 :   t[0] = _mm_add_epi16(s[0], s[3]);
    2022           0 :   t[1] = _mm_add_epi16(s[1], s[2]);
    2023           0 :   t[2] = _mm_sub_epi16(s[1], s[2]);
    2024           0 :   t[3] = _mm_sub_epi16(s[0], s[3]);
    2025           0 :   t[4] = s[4];
    2026           0 :   t[7] = s[7];
    2027             : 
    2028           0 :   u[0] = _mm_unpacklo_epi16(s[5], s[6]);
    2029           0 :   u[1] = _mm_unpackhi_epi16(s[5], s[6]);
    2030           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    2031           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    2032           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    2033           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    2034           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    2035           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    2036           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    2037           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    2038           0 :   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    2039           0 :   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    2040           0 :   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    2041           0 :   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    2042           0 :   t[5] = _mm_packs_epi32(u[0], u[1]);
    2043           0 :   t[6] = _mm_packs_epi32(u[2], u[3]);
    2044             : 
    2045           0 :   t[8] = _mm_add_epi16(s[8], s[11]);
    2046           0 :   t[9] = _mm_add_epi16(s[9], s[10]);
    2047           0 :   t[10] = _mm_sub_epi16(s[9], s[10]);
    2048           0 :   t[11] = _mm_sub_epi16(s[8], s[11]);
    2049           0 :   t[12] = _mm_sub_epi16(s[15], s[12]);
    2050           0 :   t[13] = _mm_sub_epi16(s[14], s[13]);
    2051           0 :   t[14] = _mm_add_epi16(s[13], s[14]);
    2052           0 :   t[15] = _mm_add_epi16(s[12], s[15]);
    2053             : 
    2054             :   // stage 6
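                     :   // Combine the even half t[0..7] with add/sub butterflies and rotate
                     :   // (t10,t13) and (t11,t12) by cospi_16_64; t8, t9, t14 and t15 pass
                     :   // through.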
    2055           0 :   s[0] = _mm_add_epi16(t[0], t[7]);
    2056           0 :   s[1] = _mm_add_epi16(t[1], t[6]);
    2057           0 :   s[2] = _mm_add_epi16(t[2], t[5]);
    2058           0 :   s[3] = _mm_add_epi16(t[3], t[4]);
    2059           0 :   s[4] = _mm_sub_epi16(t[3], t[4]);
    2060           0 :   s[5] = _mm_sub_epi16(t[2], t[5]);
    2061           0 :   s[6] = _mm_sub_epi16(t[1], t[6]);
    2062           0 :   s[7] = _mm_sub_epi16(t[0], t[7]);
    2063           0 :   s[8] = t[8];
    2064           0 :   s[9] = t[9];
    2065             : 
    2066           0 :   u[0] = _mm_unpacklo_epi16(t[10], t[13]);
    2067           0 :   u[1] = _mm_unpackhi_epi16(t[10], t[13]);
    2068           0 :   u[2] = _mm_unpacklo_epi16(t[11], t[12]);
    2069           0 :   u[3] = _mm_unpackhi_epi16(t[11], t[12]);
    2070             : 
    2071           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    2072           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    2073           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    2074           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    2075           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    2076           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    2077           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    2078           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
    2079             : 
    2080           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    2081           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    2082           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    2083           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    2084           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    2085           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    2086           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    2087           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    2088             : 
    2089           0 :   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    2090           0 :   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    2091           0 :   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    2092           0 :   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    2093           0 :   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    2094           0 :   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    2095           0 :   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    2096           0 :   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    2097             : 
    2098           0 :   s[10] = _mm_packs_epi32(u[0], u[1]);
    2099           0 :   s[13] = _mm_packs_epi32(u[2], u[3]);
    2100           0 :   s[11] = _mm_packs_epi32(u[4], u[5]);
    2101           0 :   s[12] = _mm_packs_epi32(u[6], u[7]);
    2102           0 :   s[14] = t[14];
    2103           0 :   s[15] = t[15];
    2104             : 
    2105             :   // stage 7
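                     :   // Final butterflies: in[i] = s[i] + s[15 - i] for i = 0..7 and
                     :   // in[i] = s[15 - i] - s[i] for i = 8..15.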
    2106           0 :   in[0] = _mm_add_epi16(s[0], s[15]);
    2107           0 :   in[1] = _mm_add_epi16(s[1], s[14]);
    2108           0 :   in[2] = _mm_add_epi16(s[2], s[13]);
    2109           0 :   in[3] = _mm_add_epi16(s[3], s[12]);
    2110           0 :   in[4] = _mm_add_epi16(s[4], s[11]);
    2111           0 :   in[5] = _mm_add_epi16(s[5], s[10]);
    2112           0 :   in[6] = _mm_add_epi16(s[6], s[9]);
    2113           0 :   in[7] = _mm_add_epi16(s[7], s[8]);
    2114           0 :   in[8] = _mm_sub_epi16(s[7], s[8]);
    2115           0 :   in[9] = _mm_sub_epi16(s[6], s[9]);
    2116           0 :   in[10] = _mm_sub_epi16(s[5], s[10]);
    2117           0 :   in[11] = _mm_sub_epi16(s[4], s[11]);
    2118           0 :   in[12] = _mm_sub_epi16(s[3], s[12]);
    2119           0 :   in[13] = _mm_sub_epi16(s[2], s[13]);
    2120           0 :   in[14] = _mm_sub_epi16(s[1], s[14]);
    2121           0 :   in[15] = _mm_sub_epi16(s[0], s[15]);
    2122           0 : }
    2123             : 
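                     : /* idct16_sse2() and iadst16_sse2() operate on a 16x16 block held as two
                     :  * arrays of sixteen 8-lane registers (the left and right 8-column
                     :  * halves): the block is transposed with array_transpose_16x16() and the
                     :  * 1-D 16-point transform is then applied to each half.  A caller
                     :  * presumably runs one such pass per dimension to form the 2-D inverse.
                     :  */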
    2124           0 : void idct16_sse2(__m128i *in0, __m128i *in1) {
    2125           0 :   array_transpose_16x16(in0, in1);
    2126           0 :   idct16_8col(in0);
    2127           0 :   idct16_8col(in1);
    2128           0 : }
    2129             : 
    2130           0 : void iadst16_sse2(__m128i *in0, __m128i *in1) {
    2131           0 :   array_transpose_16x16(in0, in1);
    2132           0 :   iadst16_8col(in0);
    2133           0 :   iadst16_8col(in1);
    2134           0 : }
    2135             : 
    2136           0 : void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
    2137             :                                int stride) {
    2138           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    2139           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    2140           0 :   const __m128i zero = _mm_setzero_si128();
    2141             : 
    2142           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    2143           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    2144           0 :   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    2145           0 :   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    2146             : 
    2147           0 :   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    2148           0 :   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    2149             : 
    2150           0 :   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    2151           0 :   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    2152           0 :   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    2153           0 :   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    2154           0 :   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    2155           0 :   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    2156             : 
    2157           0 :   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    2158             :   __m128i in[16], l[16];
    2159             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
    2160             :       stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
    2161             :       stp1_12_0;
    2162             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
    2163             :       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
    2164             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    2165             :   int i;
    2166             :   // First 1-D inverse DCT
    2167             :   // Load input data.
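                     :   // Only the low-frequency corner of the 16x16 coefficient block can be
                     :   // nonzero for this shortened variant, so just the first eight
                     :   // coefficients of each of the first four rows are loaded.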
    2168           0 :   in[0] = load_input_data(input);
    2169           0 :   in[1] = load_input_data(input + 8 * 2);
    2170           0 :   in[2] = load_input_data(input + 8 * 4);
    2171           0 :   in[3] = load_input_data(input + 8 * 6);
    2172             : 
    2173           0 :   TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
    2174             : 
    2175             :   // Stage2
    2176             :   {
    2177           0 :     const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    2178           0 :     const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
    2179             : 
    2180           0 :     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    2181           0 :     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    2182           0 :     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    2183           0 :     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
    2184             : 
    2185           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
    2186           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
    2187           0 :     tmp5 = _mm_add_epi32(tmp5, rounding);
    2188           0 :     tmp7 = _mm_add_epi32(tmp7, rounding);
    2189             : 
    2190           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    2191           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    2192           0 :     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    2193           0 :     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
    2194             : 
    2195           0 :     stp2_8 = _mm_packs_epi32(tmp0, tmp2);
    2196           0 :     stp2_11 = _mm_packs_epi32(tmp5, tmp7);
    2197             :   }
    2198             : 
    2199             :   // Stage3
    2200             :   {
    2201           0 :     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
    2202             : 
    2203           0 :     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    2204           0 :     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
    2205             : 
    2206           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
    2207           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
    2208           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    2209           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    2210             : 
    2211           0 :     stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    2212           0 :     stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
    2213             : 
    2214           0 :     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    2215             :   }
    2216             : 
    2217             :   // Stage4
    2218             :   {
    2219           0 :     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    2220           0 :     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    2221           0 :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
    2222             : 
    2223           0 :     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    2224           0 :     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    2225           0 :     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    2226           0 :     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    2227           0 :     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    2228           0 :     tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
    2229             : 
    2230           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
    2231           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
    2232           0 :     tmp1 = _mm_add_epi32(tmp1, rounding);
    2233           0 :     tmp3 = _mm_add_epi32(tmp3, rounding);
    2234           0 :     tmp5 = _mm_add_epi32(tmp5, rounding);
    2235           0 :     tmp7 = _mm_add_epi32(tmp7, rounding);
    2236             : 
    2237           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    2238           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    2239           0 :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    2240           0 :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    2241           0 :     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    2242           0 :     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
    2243             : 
    2244           0 :     stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    2245           0 :     stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    2246           0 :     stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    2247           0 :     stp2_10 = _mm_packs_epi32(tmp5, tmp7);
    2248             : 
    2249           0 :     stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
    2250             :   }
    2251             : 
    2252             :   // Stage5 and Stage6
    2253             :   {
    2254           0 :     tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    2255           0 :     tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    2256           0 :     tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    2257           0 :     tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
    2258             : 
    2259           0 :     stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    2260           0 :     stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    2261           0 :     stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    2262           0 :     stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
    2263             : 
    2264           0 :     stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    2265           0 :     stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    2266           0 :     stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    2267           0 :     stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
    2268             :   }
    2269             : 
    2270             :   // Stage6
    2271             :   {
    2272           0 :     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    2273           0 :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    2274           0 :     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
    2275             : 
    2276           0 :     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    2277           0 :     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    2278           0 :     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    2279           0 :     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    2280           0 :     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    2281           0 :     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
    2282             : 
    2283           0 :     tmp1 = _mm_add_epi32(tmp1, rounding);
    2284           0 :     tmp3 = _mm_add_epi32(tmp3, rounding);
    2285           0 :     tmp0 = _mm_add_epi32(tmp0, rounding);
    2286           0 :     tmp2 = _mm_add_epi32(tmp2, rounding);
    2287           0 :     tmp4 = _mm_add_epi32(tmp4, rounding);
    2288           0 :     tmp6 = _mm_add_epi32(tmp6, rounding);
    2289             : 
    2290           0 :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    2291           0 :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    2292           0 :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    2293           0 :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    2294           0 :     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    2295           0 :     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    2296             : 
    2297           0 :     stp1_6 = _mm_packs_epi32(tmp3, tmp1);
    2298             : 
    2299           0 :     stp2_10 = _mm_packs_epi32(tmp0, zero);
    2300           0 :     stp2_13 = _mm_packs_epi32(tmp2, zero);
    2301           0 :     stp2_11 = _mm_packs_epi32(tmp4, zero);
    2302           0 :     stp2_12 = _mm_packs_epi32(tmp6, zero);
    2303             : 
    2304           0 :     tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    2305           0 :     tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    2306           0 :     tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    2307           0 :     tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
    2308             : 
    2309           0 :     stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    2310           0 :     stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    2311           0 :     stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    2312           0 :     stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    2313           0 :     stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    2314           0 :     stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    2315           0 :     stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    2316           0 :     stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
    2317             :   }
    2318             : 
    2319             :   // Stage7. Left 8x16 only.
    2320           0 :   l[0] = _mm_add_epi16(stp2_0, stp1_15);
    2321           0 :   l[1] = _mm_add_epi16(stp2_1, stp1_14);
    2322           0 :   l[2] = _mm_add_epi16(stp2_2, stp2_13);
    2323           0 :   l[3] = _mm_add_epi16(stp2_3, stp2_12);
    2324           0 :   l[4] = _mm_add_epi16(stp2_4, stp2_11);
    2325           0 :   l[5] = _mm_add_epi16(stp2_5, stp2_10);
    2326           0 :   l[6] = _mm_add_epi16(stp2_6, stp1_9);
    2327           0 :   l[7] = _mm_add_epi16(stp2_7, stp1_8);
    2328           0 :   l[8] = _mm_sub_epi16(stp2_7, stp1_8);
    2329           0 :   l[9] = _mm_sub_epi16(stp2_6, stp1_9);
    2330           0 :   l[10] = _mm_sub_epi16(stp2_5, stp2_10);
    2331           0 :   l[11] = _mm_sub_epi16(stp2_4, stp2_11);
    2332           0 :   l[12] = _mm_sub_epi16(stp2_3, stp2_12);
    2333           0 :   l[13] = _mm_sub_epi16(stp2_2, stp2_13);
    2334           0 :   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
    2335           0 :   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
    2336             : 
    2337             :   // Second 1-D inverse transform, performed per 8x16 block
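                     :   // Each of the two iterations transposes data from l + 8 * i into in[],
                     :   // runs the shortened IDCT16_10 transform, then rounds, shifts and
                     :   // reconstructs sixteen rows of eight pixels, advancing dest by eight.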
    2338           0 :   for (i = 0; i < 2; i++) {
    2339             :     int j;
    2340           0 :     array_transpose_4X8(l + 8 * i, in);
    2341             : 
    2342           0 :     IDCT16_10
    2343             : 
    2344             :     // Stage7
    2345           0 :     in[0] = _mm_add_epi16(stp2_0, stp1_15);
    2346           0 :     in[1] = _mm_add_epi16(stp2_1, stp1_14);
    2347           0 :     in[2] = _mm_add_epi16(stp2_2, stp2_13);
    2348           0 :     in[3] = _mm_add_epi16(stp2_3, stp2_12);
    2349           0 :     in[4] = _mm_add_epi16(stp2_4, stp2_11);
    2350           0 :     in[5] = _mm_add_epi16(stp2_5, stp2_10);
    2351           0 :     in[6] = _mm_add_epi16(stp2_6, stp1_9);
    2352           0 :     in[7] = _mm_add_epi16(stp2_7, stp1_8);
    2353           0 :     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    2354           0 :     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    2355           0 :     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    2356           0 :     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    2357           0 :     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    2358           0 :     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    2359           0 :     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    2360           0 :     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
    2361             : 
    2362           0 :     for (j = 0; j < 16; ++j) {
    2363             :       // Final rounding and shift
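                     :       // Adding 1 << 5 (with 16-bit saturation) and shifting right by 6
                     :       // rounds each residual to nearest, i.e. ROUND_POWER_OF_TWO(x, 6),
                     :       // before RECON_AND_STORE adds it to the prediction.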
    2364           0 :       in[j] = _mm_adds_epi16(in[j], final_rounding);
    2365           0 :       in[j] = _mm_srai_epi16(in[j], 6);
    2366           0 :       RECON_AND_STORE(dest + j * stride, in[j]);
    2367             :     }
    2368             : 
    2369           0 :     dest += 8;
    2370             :   }
    2371           0 : }
    2372             : 
    2373             : #define LOAD_DQCOEFF(reg, input)  \
    2374             :   {                               \
    2375             :     reg = load_input_data(input); \
    2376             :     input += 8;                   \
    2377             :   }
    2378             : 
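                     : /* IDCT32_34 evaluates one pass of the 32-point IDCT for the reduced case
                     :  * (presumably at most 34 nonzero coefficients) in which only the first
                     :  * eight input vectors in[0]..in[7] can be nonzero: the partner of every
                     :  * unpack is the zero register, standing in for coefficient rows that are
                     :  * known to be zero.
                     :  */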
    2379             : #define IDCT32_34                                                              \
    2380             :   /* Stage1 */                                                                 \
    2381             :   {                                                                            \
    2382             :     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
    2383             :     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
    2384             :                                                                                \
    2385             :     const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
    2386             :     const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
    2387             :                                                                                \
    2388             :     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
    2389             :     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
    2390             :                                                                                \
    2391             :     const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    2392             :     const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
    2393             :                                                                                \
    2394             :     MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
    2395             :                              stp1_31);                                         \
    2396             :     MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
    2397             :                              stp1_28);                                         \
    2398             :     MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
    2399             :                              stp1_27);                                         \
    2400             :     MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
    2401             :                              stp1_24);                                         \
    2402             :   }                                                                            \
    2403             :                                                                                \
    2404             :   /* Stage2 */                                                                 \
    2405             :   {                                                                            \
    2406             :     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
    2407             :     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
    2408             :                                                                                \
    2409             :     const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
    2410             :     const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
    2411             :                                                                                \
    2412             :     MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
    2413             :                              stp2_15);                                         \
    2414             :     MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
    2415             :                              stp2_12);                                         \
    2416             :                                                                                \
    2417             :     stp2_16 = stp1_16;                                                         \
    2418             :     stp2_19 = stp1_19;                                                         \
    2419             :                                                                                \
    2420             :     stp2_20 = stp1_20;                                                         \
    2421             :     stp2_23 = stp1_23;                                                         \
    2422             :                                                                                \
    2423             :     stp2_24 = stp1_24;                                                         \
    2424             :     stp2_27 = stp1_27;                                                         \
    2425             :                                                                                \
    2426             :     stp2_28 = stp1_28;                                                         \
    2427             :     stp2_31 = stp1_31;                                                         \
    2428             :   }                                                                            \
    2429             :                                                                                \
    2430             :   /* Stage3 */                                                                 \
    2431             :   {                                                                            \
    2432             :     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
    2433             :     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
    2434             :                                                                                \
    2435             :     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
    2436             :     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
    2437             :     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
    2438             :     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
    2439             :                                                                                \
    2440             :     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
    2441             :     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
    2442             :     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
    2443             :     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24);             \
    2444             :                                                                                \
    2445             :     MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
    2446             :                              stp1_7);                                          \
    2447             :                                                                                \
    2448             :     stp1_8 = stp2_8;                                                           \
    2449             :     stp1_11 = stp2_11;                                                         \
    2450             :     stp1_12 = stp2_12;                                                         \
    2451             :     stp1_15 = stp2_15;                                                         \
    2452             :                                                                                \
    2453             :     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
    2454             :                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
    2455             :                            stp1_29)                                            \
    2456             :     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
    2457             :                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
    2458             :                            stp1_25)                                            \
    2459             :                                                                                \
    2460             :     stp1_16 = stp2_16;                                                         \
    2461             :     stp1_31 = stp2_31;                                                         \
    2462             :     stp1_19 = stp2_19;                                                         \
    2463             :     stp1_20 = stp2_20;                                                         \
    2464             :     stp1_23 = stp2_23;                                                         \
    2465             :     stp1_24 = stp2_24;                                                         \
    2466             :     stp1_27 = stp2_27;                                                         \
    2467             :     stp1_28 = stp2_28;                                                         \
    2468             :   }                                                                            \
    2469             :                                                                                \
    2470             :   /* Stage4 */                                                                 \
    2471             :   {                                                                            \
    2472             :     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
    2473             :     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
    2474             :                                                                                \
    2475             :     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
    2476             :     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
    2477             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
    2478             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
    2479             :                                                                                \
    2480             :     MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
    2481             :                              stp2_1);                                          \
    2482             :                                                                                \
    2483             :     stp2_4 = stp1_4;                                                           \
    2484             :     stp2_5 = stp1_4;                                                           \
    2485             :     stp2_6 = stp1_7;                                                           \
    2486             :     stp2_7 = stp1_7;                                                           \
    2487             :                                                                                \
    2488             :     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
    2489             :                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
    2490             :                            stp2_13)                                            \
    2491             :                                                                                \
    2492             :     stp2_8 = stp1_8;                                                           \
    2493             :     stp2_15 = stp1_15;                                                         \
    2494             :     stp2_11 = stp1_11;                                                         \
    2495             :     stp2_12 = stp1_12;                                                         \
    2496             :                                                                                \
    2497             :     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
    2498             :     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
    2499             :     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
    2500             :     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
    2501             :     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
    2502             :     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
    2503             :     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
    2504             :     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
    2505             :                                                                                \
    2506             :     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
    2507             :     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
    2508             :     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
    2509             :     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
    2510             :     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
    2511             :     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
    2512             :     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
    2513             :     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
    2514             :   }                                                                            \
    2515             :                                                                                \
    2516             :   /* Stage5 */                                                                 \
    2517             :   {                                                                            \
    2518             :     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    2519             :     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
    2520             :     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
    2521             :     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
    2522             :                                                                                \
    2523             :     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
    2524             :     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
    2525             :     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
    2526             :     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
    2527             :                                                                                \
    2528             :     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
    2529             :     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
    2530             :                                                                                \
    2531             :     stp1_0 = stp2_0;                                                           \
    2532             :     stp1_1 = stp2_1;                                                           \
    2533             :     stp1_2 = stp2_1;                                                           \
    2534             :     stp1_3 = stp2_0;                                                           \
    2535             :                                                                                \
    2536             :     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    2537             :     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    2538             :     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    2539             :     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
    2540             :                                                                                \
    2541             :     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    2542             :     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    2543             :     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    2544             :     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
    2545             :                                                                                \
    2546             :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    2547             :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    2548             :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    2549             :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
    2550             :                                                                                \
    2551             :     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    2552             :     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
    2553             :                                                                                \
    2554             :     stp1_4 = stp2_4;                                                           \
    2555             :     stp1_7 = stp2_7;                                                           \
    2556             :                                                                                \
    2557             :     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
    2558             :     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    2559             :     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    2560             :     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
    2561             :     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
    2562             :     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    2563             :     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    2564             :     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
    2565             :                                                                                \
    2566             :     stp1_16 = stp2_16;                                                         \
    2567             :     stp1_17 = stp2_17;                                                         \
    2568             :                                                                                \
    2569             :     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
    2570             :                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
    2571             :                            stp1_28)                                            \
    2572             :     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
    2573             :                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
    2574             :                            stp1_26)                                            \
    2575             :                                                                                \
    2576             :     stp1_22 = stp2_22;                                                         \
    2577             :     stp1_23 = stp2_23;                                                         \
    2578             :     stp1_24 = stp2_24;                                                         \
    2579             :     stp1_25 = stp2_25;                                                         \
    2580             :     stp1_30 = stp2_30;                                                         \
    2581             :     stp1_31 = stp2_31;                                                         \
    2582             :   }                                                                            \
    2583             :                                                                                \
    2584             :   /* Stage6 */                                                                 \
    2585             :   {                                                                            \
    2586             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    2587             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    2588             :     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    2589             :     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
    2590             :                                                                                \
    2591             :     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
    2592             :     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    2593             :     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    2594             :     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
    2595             :     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
    2596             :     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    2597             :     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    2598             :     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
    2599             :                                                                                \
    2600             :     stp2_8 = stp1_8;                                                           \
    2601             :     stp2_9 = stp1_9;                                                           \
    2602             :     stp2_14 = stp1_14;                                                         \
    2603             :     stp2_15 = stp1_15;                                                         \
    2604             :                                                                                \
    2605             :     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
    2606             :                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
    2607             :                            stp2_12)                                            \
    2608             :                                                                                \
    2609             :     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
    2610             :     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
    2611             :     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
    2612             :     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
    2613             :     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
    2614             :     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
    2615             :     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
    2616             :     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
    2617             :                                                                                \
    2618             :     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
    2619             :     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
    2620             :     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
    2621             :     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
    2622             :     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
    2623             :     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
    2624             :     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
    2625             :     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
    2626             :   }                                                                            \
    2627             :                                                                                \
    2628             :   /* Stage7 */                                                                 \
    2629             :   {                                                                            \
    2630             :     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
    2631             :     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
    2632             :     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
    2633             :     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
    2634             :                                                                                \
    2635             :     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
    2636             :     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
    2637             :     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
    2638             :     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
    2639             :                                                                                \
    2640             :     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
    2641             :     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
    2642             :     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
    2643             :     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
    2644             :     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
    2645             :     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
    2646             :     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
    2647             :     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
    2648             :     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
    2649             :     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
    2650             :     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
    2651             :     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
    2652             :     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
    2653             :     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
    2654             :     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
    2655             :     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
    2656             :                                                                                \
    2657             :     stp1_16 = stp2_16;                                                         \
    2658             :     stp1_17 = stp2_17;                                                         \
    2659             :     stp1_18 = stp2_18;                                                         \
    2660             :     stp1_19 = stp2_19;                                                         \
    2661             :                                                                                \
    2662             :     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
    2663             :                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
    2664             :                            stp1_26)                                            \
    2665             :     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
    2666             :                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
    2667             :                            stp1_24)                                            \
    2668             :                                                                                \
    2669             :     stp1_28 = stp2_28;                                                         \
    2670             :     stp1_29 = stp2_29;                                                         \
    2671             :     stp1_30 = stp2_30;                                                         \
    2672             :     stp1_31 = stp2_31;                                                         \
    2673             :   }
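The reduced pass ending here interleaves coefficients that are known to be zero against the zero register, so each two-input rotation collapses to a single product. For instance, with lo_26_6 / hi_26_6 built from (0, in[6]) and the Stage2 constants stg2_6 and stg2_7, each lane works out to roughly the following (an arithmetic sketch under that assumption, not code from this file):

    /* Per-lane arithmetic when the partner coefficient is zero (sketch):          */
    /*   stp2_11 ~= (-in[6] * cospi_26_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS  */
    /*   stp2_12 ~= ( in[6] * cospi_6_64  + DCT_CONST_ROUNDING) >> DCT_CONST_BITS  */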
    2674             : 
    2675             : #define IDCT32                                                                 \
    2676             :   /* Stage1 */                                                                 \
    2677             :   {                                                                            \
    2678             :     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);                 \
    2679             :     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);                 \
    2680             :     const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);               \
    2681             :     const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);               \
    2682             :                                                                                \
    2683             :     const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);                 \
    2684             :     const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);                 \
    2685             :     const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);                 \
    2686             :     const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);                 \
    2687             :                                                                                \
    2688             :     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);                 \
    2689             :     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);                 \
    2690             :     const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);               \
    2691             :     const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);               \
    2692             :                                                                                \
    2693             :     const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);               \
    2694             :     const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);               \
    2695             :     const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);                 \
    2696             :     const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);                 \
    2697             :                                                                                \
    2698             :     MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
    2699             :                            stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
    2700             :                            stp1_30)                                            \
    2701             :     MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
    2702             :                            stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
    2703             :     MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
    2704             :                            stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
    2705             :                            stp1_21, stp1_26)                                   \
    2706             :     MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
    2707             :                            stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
    2708             :                            stp1_23, stp1_24)                                   \
    2709             :   }                                                                            \
    2710             :                                                                                \
    2711             :   /* Stage2 */                                                                 \
    2712             :   {                                                                            \
    2713             :     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);                 \
    2714             :     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);                 \
    2715             :     const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);               \
    2716             :     const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);               \
    2717             :                                                                                \
    2718             :     const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);               \
    2719             :     const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);               \
    2720             :     const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);                 \
    2721             :     const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);                 \
    2722             :                                                                                \
    2723             :     MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
    2724             :                            stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
    2725             :                            stp2_14)                                            \
    2726             :     MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
    2727             :                            stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
    2728             :                            stp2_12)                                            \
    2729             :                                                                                \
    2730             :     stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
    2731             :     stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
    2732             :     stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
    2733             :     stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
    2734             :                                                                                \
    2735             :     stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
    2736             :     stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
    2737             :     stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
    2738             :     stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
    2739             :                                                                                \
    2740             :     stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
    2741             :     stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
    2742             :     stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
    2743             :     stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
    2744             :                                                                                \
    2745             :     stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
    2746             :     stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
    2747             :     stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
    2748             :     stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
    2749             :   }                                                                            \
    2750             :                                                                                \
    2751             :   /* Stage3 */                                                                 \
    2752             :   {                                                                            \
    2753             :     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);                 \
    2754             :     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);                 \
    2755             :     const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);               \
    2756             :     const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);               \
    2757             :                                                                                \
    2758             :     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
    2759             :     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
    2760             :     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
    2761             :     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
    2762             :                                                                                \
    2763             :     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
    2764             :     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
    2765             :     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
    2766             :     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
    2767             :                                                                                \
    2768             :     MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
    2769             :                            stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
    2770             :                            stp1_6)                                             \
    2771             :                                                                                \
    2772             :     stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
    2773             :     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
    2774             :     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
    2775             :     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
    2776             :     stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
    2777             :     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
    2778             :     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
    2779             :     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
    2780             :                                                                                \
    2781             :     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
    2782             :                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
    2783             :                            stp1_29)                                            \
    2784             :     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
    2785             :                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
    2786             :                            stp1_25)                                            \
    2787             :                                                                                \
    2788             :     stp1_16 = stp2_16;                                                         \
    2789             :     stp1_31 = stp2_31;                                                         \
    2790             :     stp1_19 = stp2_19;                                                         \
    2791             :     stp1_20 = stp2_20;                                                         \
    2792             :     stp1_23 = stp2_23;                                                         \
    2793             :     stp1_24 = stp2_24;                                                         \
    2794             :     stp1_27 = stp2_27;                                                         \
    2795             :     stp1_28 = stp2_28;                                                         \
    2796             :   }                                                                            \
    2797             :                                                                                \
    2798             :   /* Stage4 */                                                                 \
    2799             :   {                                                                            \
    2800             :     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);                 \
    2801             :     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);                 \
    2802             :     const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);                 \
    2803             :     const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);                 \
    2804             :                                                                                \
    2805             :     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    2806             :     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    2807             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    2808             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    2809             :                                                                                \
    2810             :     MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
    2811             :                            stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
    2812             :                                                                                \
    2813             :     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
    2814             :     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
    2815             :     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
    2816             :     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
    2817             :                                                                                \
    2818             :     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
    2819             :                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
    2820             :                            stp2_13)                                            \
    2821             :                                                                                \
    2822             :     stp2_8 = stp1_8;                                                           \
    2823             :     stp2_15 = stp1_15;                                                         \
    2824             :     stp2_11 = stp1_11;                                                         \
    2825             :     stp2_12 = stp1_12;                                                         \
    2826             :                                                                                \
    2827             :     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
    2828             :     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
    2829             :     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
    2830             :     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
    2831             :     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
    2832             :     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
    2833             :     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
    2834             :     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
    2835             :                                                                                \
    2836             :     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
    2837             :     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
    2838             :     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
    2839             :     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
    2840             :     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
    2841             :     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
    2842             :     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
    2843             :     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
    2844             :   }                                                                            \
    2845             :                                                                                \
    2846             :   /* Stage5 */                                                                 \
    2847             :   {                                                                            \
    2848             :     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    2849             :     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
    2850             :     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
    2851             :     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
    2852             :                                                                                \
    2853             :     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
    2854             :     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
    2855             :     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
    2856             :     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
    2857             :                                                                                \
    2858             :     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
    2859             :     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
    2860             :                                                                                \
    2861             :     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
    2862             :     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
    2863             :     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
    2864             :     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
    2865             :                                                                                \
    2866             :     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    2867             :     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    2868             :     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    2869             :     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
    2870             :                                                                                \
    2871             :     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    2872             :     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    2873             :     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    2874             :     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
    2875             :                                                                                \
    2876             :     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    2877             :     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    2878             :     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    2879             :     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
    2880             :                                                                                \
    2881             :     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    2882             :     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
    2883             :                                                                                \
    2884             :     stp1_4 = stp2_4;                                                           \
    2885             :     stp1_7 = stp2_7;                                                           \
    2886             :                                                                                \
    2887             :     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
    2888             :     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    2889             :     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    2890             :     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
    2891             :     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
    2892             :     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    2893             :     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    2894             :     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
    2895             :                                                                                \
    2896             :     stp1_16 = stp2_16;                                                         \
    2897             :     stp1_17 = stp2_17;                                                         \
    2898             :                                                                                \
    2899             :     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
    2900             :                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
    2901             :                            stp1_28)                                            \
    2902             :     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
    2903             :                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
    2904             :                            stp1_26)                                            \
    2905             :                                                                                \
    2906             :     stp1_22 = stp2_22;                                                         \
    2907             :     stp1_23 = stp2_23;                                                         \
    2908             :     stp1_24 = stp2_24;                                                         \
    2909             :     stp1_25 = stp2_25;                                                         \
    2910             :     stp1_30 = stp2_30;                                                         \
    2911             :     stp1_31 = stp2_31;                                                         \
    2912             :   }                                                                            \
    2913             :                                                                                \
    2914             :   /* Stage6 */                                                                 \
    2915             :   {                                                                            \
    2916             :     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    2917             :     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    2918             :     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    2919             :     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
    2920             :                                                                                \
    2921             :     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
    2922             :     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    2923             :     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    2924             :     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
    2925             :     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
    2926             :     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    2927             :     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    2928             :     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
    2929             :                                                                                \
    2930             :     stp2_8 = stp1_8;                                                           \
    2931             :     stp2_9 = stp1_9;                                                           \
    2932             :     stp2_14 = stp1_14;                                                         \
    2933             :     stp2_15 = stp1_15;                                                         \
    2934             :                                                                                \
    2935             :     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
    2936             :                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
    2937             :                            stp2_12)                                            \
    2938             :                                                                                \
    2939             :     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
    2940             :     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
    2941             :     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
    2942             :     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
    2943             :     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
    2944             :     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
    2945             :     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
    2946             :     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
    2947             :                                                                                \
    2948             :     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
    2949             :     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
    2950             :     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
    2951             :     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
    2952             :     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
    2953             :     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
    2954             :     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
    2955             :     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
    2956             :   }                                                                            \
    2957             :                                                                                \
    2958             :   /* Stage7 */                                                                 \
    2959             :   {                                                                            \
    2960             :     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
    2961             :     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
    2962             :     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
    2963             :     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
    2964             :                                                                                \
    2965             :     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
    2966             :     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
    2967             :     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
    2968             :     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
    2969             :                                                                                \
    2970             :     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
    2971             :     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
    2972             :     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
    2973             :     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
    2974             :     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
    2975             :     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
    2976             :     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
    2977             :     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
    2978             :     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
    2979             :     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
    2980             :     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
    2981             :     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
    2982             :     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
    2983             :     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
    2984             :     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
    2985             :     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
    2986             :                                                                                \
    2987             :     stp1_16 = stp2_16;                                                         \
    2988             :     stp1_17 = stp2_17;                                                         \
    2989             :     stp1_18 = stp2_18;                                                         \
    2990             :     stp1_19 = stp2_19;                                                         \
    2991             :                                                                                \
    2992             :     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
    2993             :                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
    2994             :                            stp1_26)                                            \
    2995             :     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
    2996             :                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
    2997             :                            stp1_24)                                            \
    2998             :                                                                                \
    2999             :     stp1_28 = stp2_28;                                                         \
    3000             :     stp1_29 = stp2_29;                                                         \
    3001             :     stp1_30 = stp2_30;                                                         \
    3002             :     stp1_31 = stp2_31;                                                         \
    3003             :   }
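
The stage-7 block above mirrors the scalar reference idct32: outputs 0..15 are butterflies of stp2_0..15 against stp2_8..15, outputs 16..19 and 28..31 pass through, and the 20/27, 21/26, 22/25 and 23/24 pairs are rotated through MULTIPLICATION_AND_ADD with stg6_0 = (-cospi_16_64, cospi_16_64) and stg4_0 = (cospi_16_64, cospi_16_64). A minimal scalar sketch of one such rotation, assuming cospi_16_64 == 11585 and DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h (helper names here are illustrative only, not part of the library):

#include <stdint.h>

/* Rounding shift used with the transform constants (assumes DCT_CONST_BITS == 14). */
static int32_t sketch_dct_round_shift(int32_t x) { return (x + 8192) >> 14; }

/* step1[20]/step1[27] from step2[20]/step2[27]; the 21/26, 22/25 and 23/24
 * pairs follow the same pattern. */
static void sketch_stage7_rotation(int16_t s20, int16_t s27,
                                   int16_t *d20, int16_t *d27) {
  const int32_t cospi_16_64 = 11585; /* round(2^14 * cos(pi/4)) */
  *d20 = (int16_t)sketch_dct_round_shift((-s20 + s27) * cospi_16_64);
  *d27 = (int16_t)sketch_dct_round_shift((s20 + s27) * cospi_16_64);
}
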
    3004             : 
    3005             : // Only the upper-left 8x8 block has non-zero coefficients
    3006           0 : void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
    3007             :                                int stride) {
    3008           0 :   const __m128i zero = _mm_setzero_si128();
    3009           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    3010           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    3011             : 
    3012             :   // idct constants for each stage
    3013           0 :   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    3014           0 :   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
    3015           0 :   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    3016           0 :   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
    3017           0 :   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    3018           0 :   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
    3019           0 :   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    3020           0 :   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
    3021             : 
    3022           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    3023           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    3024           0 :   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    3025           0 :   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    3026             : 
    3027           0 :   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    3028           0 :   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    3029           0 :   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    3030           0 :   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    3031           0 :   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    3032           0 :   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    3033           0 :   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    3034           0 :   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
    3035             : 
    3036           0 :   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    3037           0 :   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    3038           0 :   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    3039           0 :   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    3040           0 :   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    3041             : 
    3042           0 :   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    3043             : 
    3044             :   __m128i in[32], col[32];
    3045             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
    3046             :       stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
    3047             :       stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
    3048             :       stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
    3049             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
    3050             :       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
    3051             :       stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
    3052             :       stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
    3053             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    3054             :   int i;
    3055             : 
    3056             :   // Load input data. Only need to load the top left 8x8 block.
    3057           0 :   in[0] = load_input_data(input);
    3058           0 :   in[1] = load_input_data(input + 32);
    3059           0 :   in[2] = load_input_data(input + 64);
    3060           0 :   in[3] = load_input_data(input + 96);
    3061           0 :   in[4] = load_input_data(input + 128);
    3062           0 :   in[5] = load_input_data(input + 160);
    3063           0 :   in[6] = load_input_data(input + 192);
    3064           0 :   in[7] = load_input_data(input + 224);
    3065             : 
    3066           0 :   array_transpose_8x8(in, in);
    3067           0 :   IDCT32_34
    3068             : 
    3069             :   // 1_D: Store 32 intermediate results for each 8x32 block.
    3070           0 :   col[0] = _mm_add_epi16(stp1_0, stp1_31);
    3071           0 :   col[1] = _mm_add_epi16(stp1_1, stp1_30);
    3072           0 :   col[2] = _mm_add_epi16(stp1_2, stp1_29);
    3073           0 :   col[3] = _mm_add_epi16(stp1_3, stp1_28);
    3074           0 :   col[4] = _mm_add_epi16(stp1_4, stp1_27);
    3075           0 :   col[5] = _mm_add_epi16(stp1_5, stp1_26);
    3076           0 :   col[6] = _mm_add_epi16(stp1_6, stp1_25);
    3077           0 :   col[7] = _mm_add_epi16(stp1_7, stp1_24);
    3078           0 :   col[8] = _mm_add_epi16(stp1_8, stp1_23);
    3079           0 :   col[9] = _mm_add_epi16(stp1_9, stp1_22);
    3080           0 :   col[10] = _mm_add_epi16(stp1_10, stp1_21);
    3081           0 :   col[11] = _mm_add_epi16(stp1_11, stp1_20);
    3082           0 :   col[12] = _mm_add_epi16(stp1_12, stp1_19);
    3083           0 :   col[13] = _mm_add_epi16(stp1_13, stp1_18);
    3084           0 :   col[14] = _mm_add_epi16(stp1_14, stp1_17);
    3085           0 :   col[15] = _mm_add_epi16(stp1_15, stp1_16);
    3086           0 :   col[16] = _mm_sub_epi16(stp1_15, stp1_16);
    3087           0 :   col[17] = _mm_sub_epi16(stp1_14, stp1_17);
    3088           0 :   col[18] = _mm_sub_epi16(stp1_13, stp1_18);
    3089           0 :   col[19] = _mm_sub_epi16(stp1_12, stp1_19);
    3090           0 :   col[20] = _mm_sub_epi16(stp1_11, stp1_20);
    3091           0 :   col[21] = _mm_sub_epi16(stp1_10, stp1_21);
    3092           0 :   col[22] = _mm_sub_epi16(stp1_9, stp1_22);
    3093           0 :   col[23] = _mm_sub_epi16(stp1_8, stp1_23);
    3094           0 :   col[24] = _mm_sub_epi16(stp1_7, stp1_24);
    3095           0 :   col[25] = _mm_sub_epi16(stp1_6, stp1_25);
    3096           0 :   col[26] = _mm_sub_epi16(stp1_5, stp1_26);
    3097           0 :   col[27] = _mm_sub_epi16(stp1_4, stp1_27);
    3098           0 :   col[28] = _mm_sub_epi16(stp1_3, stp1_28);
    3099           0 :   col[29] = _mm_sub_epi16(stp1_2, stp1_29);
    3100           0 :   col[30] = _mm_sub_epi16(stp1_1, stp1_30);
    3101           0 :   col[31] = _mm_sub_epi16(stp1_0, stp1_31);
    3102           0 :   for (i = 0; i < 4; i++) {
    3103             :     int j;
    3104             :     // Transpose 32x8 block to 8x32 block
    3105           0 :     array_transpose_8x8(col + i * 8, in);
    3106           0 :     IDCT32_34
    3107             : 
    3108             :     // 2_D: Calculate the results and store them to destination.
    3109           0 :     in[0] = _mm_add_epi16(stp1_0, stp1_31);
    3110           0 :     in[1] = _mm_add_epi16(stp1_1, stp1_30);
    3111           0 :     in[2] = _mm_add_epi16(stp1_2, stp1_29);
    3112           0 :     in[3] = _mm_add_epi16(stp1_3, stp1_28);
    3113           0 :     in[4] = _mm_add_epi16(stp1_4, stp1_27);
    3114           0 :     in[5] = _mm_add_epi16(stp1_5, stp1_26);
    3115           0 :     in[6] = _mm_add_epi16(stp1_6, stp1_25);
    3116           0 :     in[7] = _mm_add_epi16(stp1_7, stp1_24);
    3117           0 :     in[8] = _mm_add_epi16(stp1_8, stp1_23);
    3118           0 :     in[9] = _mm_add_epi16(stp1_9, stp1_22);
    3119           0 :     in[10] = _mm_add_epi16(stp1_10, stp1_21);
    3120           0 :     in[11] = _mm_add_epi16(stp1_11, stp1_20);
    3121           0 :     in[12] = _mm_add_epi16(stp1_12, stp1_19);
    3122           0 :     in[13] = _mm_add_epi16(stp1_13, stp1_18);
    3123           0 :     in[14] = _mm_add_epi16(stp1_14, stp1_17);
    3124           0 :     in[15] = _mm_add_epi16(stp1_15, stp1_16);
    3125           0 :     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    3126           0 :     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    3127           0 :     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    3128           0 :     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    3129           0 :     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    3130           0 :     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    3131           0 :     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    3132           0 :     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    3133           0 :     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    3134           0 :     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    3135           0 :     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    3136           0 :     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    3137           0 :     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    3138           0 :     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    3139           0 :     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    3140           0 :     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
    3141             : 
    3142           0 :     for (j = 0; j < 32; ++j) {
    3143             :       // Final rounding and shift
    3144           0 :       in[j] = _mm_adds_epi16(in[j], final_rounding);
    3145           0 :       in[j] = _mm_srai_epi16(in[j], 6);
    3146           0 :       RECON_AND_STORE(dest + j * stride, in[j]);
    3147             :     }
    3148             : 
    3149           0 :     dest += 8;
    3150             :   }
    3151           0 : }
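
The per-pixel epilogue in the j-loop above (saturating add of final_rounding, arithmetic shift by 6, then RECON_AND_STORE, presumably defined in the included inv_txfm_sse2.h) amounts to rounding the residual and adding it to the prediction with an unsigned 8-bit clamp. A scalar sketch of one lane under that reading of the macro (the helper name is illustrative):

#include <stdint.h>

/* One lane of the final reconstruction: round by 1 << 5, shift by 6, add the
 * prediction and clamp to [0, 255] (the SSE2 code gets the clamp for free
 * from _mm_packus_epi16). */
static uint8_t sketch_recon_pixel(uint8_t pred, int16_t residual) {
  const int v = pred + ((residual + 32) >> 6);
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
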
    3152             : 
    3153           0 : void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
    3154             :                                  int stride) {
    3155           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    3156           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    3157           0 :   const __m128i zero = _mm_setzero_si128();
    3158             : 
    3159             :   // idct constants for each stage
    3160           0 :   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    3161           0 :   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
    3162           0 :   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    3163           0 :   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
    3164           0 :   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    3165           0 :   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
    3166           0 :   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    3167           0 :   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
    3168           0 :   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    3169           0 :   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
    3170           0 :   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    3171           0 :   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    3172           0 :   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    3173           0 :   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
    3174           0 :   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    3175           0 :   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
    3176             : 
    3177           0 :   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    3178           0 :   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    3179           0 :   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
    3180           0 :   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
    3181           0 :   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
    3182           0 :   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
    3183           0 :   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    3184           0 :   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    3185             : 
    3186           0 :   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    3187           0 :   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    3188           0 :   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    3189           0 :   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
    3190           0 :   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    3191           0 :   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    3192           0 :   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    3193           0 :   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    3194           0 :   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    3195           0 :   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
    3196             : 
    3197           0 :   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    3198           0 :   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    3199           0 :   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    3200           0 :   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
    3201           0 :   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    3202           0 :   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    3203           0 :   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    3204             : 
    3205           0 :   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    3206             : 
    3207             :   __m128i in[32], col[128], zero_idx[16];
    3208             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
    3209             :       stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
    3210             :       stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
    3211             :       stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
    3212             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
    3213             :       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
    3214             :       stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
    3215             :       stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
    3216             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    3217             :   int i, j, i32;
    3218             : 
    3219           0 :   for (i = 0; i < 4; i++) {
    3220           0 :     i32 = (i << 5);
    3221             :     // First 1-D idct
    3222             :     // Load input data.
    3223           0 :     LOAD_DQCOEFF(in[0], input);
    3224           0 :     LOAD_DQCOEFF(in[8], input);
    3225           0 :     LOAD_DQCOEFF(in[16], input);
    3226           0 :     LOAD_DQCOEFF(in[24], input);
    3227           0 :     LOAD_DQCOEFF(in[1], input);
    3228           0 :     LOAD_DQCOEFF(in[9], input);
    3229           0 :     LOAD_DQCOEFF(in[17], input);
    3230           0 :     LOAD_DQCOEFF(in[25], input);
    3231           0 :     LOAD_DQCOEFF(in[2], input);
    3232           0 :     LOAD_DQCOEFF(in[10], input);
    3233           0 :     LOAD_DQCOEFF(in[18], input);
    3234           0 :     LOAD_DQCOEFF(in[26], input);
    3235           0 :     LOAD_DQCOEFF(in[3], input);
    3236           0 :     LOAD_DQCOEFF(in[11], input);
    3237           0 :     LOAD_DQCOEFF(in[19], input);
    3238           0 :     LOAD_DQCOEFF(in[27], input);
    3239             : 
    3240           0 :     LOAD_DQCOEFF(in[4], input);
    3241           0 :     LOAD_DQCOEFF(in[12], input);
    3242           0 :     LOAD_DQCOEFF(in[20], input);
    3243           0 :     LOAD_DQCOEFF(in[28], input);
    3244           0 :     LOAD_DQCOEFF(in[5], input);
    3245           0 :     LOAD_DQCOEFF(in[13], input);
    3246           0 :     LOAD_DQCOEFF(in[21], input);
    3247           0 :     LOAD_DQCOEFF(in[29], input);
    3248           0 :     LOAD_DQCOEFF(in[6], input);
    3249           0 :     LOAD_DQCOEFF(in[14], input);
    3250           0 :     LOAD_DQCOEFF(in[22], input);
    3251           0 :     LOAD_DQCOEFF(in[30], input);
    3252           0 :     LOAD_DQCOEFF(in[7], input);
    3253           0 :     LOAD_DQCOEFF(in[15], input);
    3254           0 :     LOAD_DQCOEFF(in[23], input);
    3255           0 :     LOAD_DQCOEFF(in[31], input);
    3256             : 
    3257             :     // checking if all entries are zero
    3258           0 :     zero_idx[0] = _mm_or_si128(in[0], in[1]);
    3259           0 :     zero_idx[1] = _mm_or_si128(in[2], in[3]);
    3260           0 :     zero_idx[2] = _mm_or_si128(in[4], in[5]);
    3261           0 :     zero_idx[3] = _mm_or_si128(in[6], in[7]);
    3262           0 :     zero_idx[4] = _mm_or_si128(in[8], in[9]);
    3263           0 :     zero_idx[5] = _mm_or_si128(in[10], in[11]);
    3264           0 :     zero_idx[6] = _mm_or_si128(in[12], in[13]);
    3265           0 :     zero_idx[7] = _mm_or_si128(in[14], in[15]);
    3266           0 :     zero_idx[8] = _mm_or_si128(in[16], in[17]);
    3267           0 :     zero_idx[9] = _mm_or_si128(in[18], in[19]);
    3268           0 :     zero_idx[10] = _mm_or_si128(in[20], in[21]);
    3269           0 :     zero_idx[11] = _mm_or_si128(in[22], in[23]);
    3270           0 :     zero_idx[12] = _mm_or_si128(in[24], in[25]);
    3271           0 :     zero_idx[13] = _mm_or_si128(in[26], in[27]);
    3272           0 :     zero_idx[14] = _mm_or_si128(in[28], in[29]);
    3273           0 :     zero_idx[15] = _mm_or_si128(in[30], in[31]);
    3274             : 
    3275           0 :     zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    3276           0 :     zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    3277           0 :     zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    3278           0 :     zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    3279           0 :     zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    3280           0 :     zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    3281           0 :     zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    3282           0 :     zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
    3283             : 
    3284           0 :     zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    3285           0 :     zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    3286           0 :     zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    3287           0 :     zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    3288           0 :     zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    3289           0 :     zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    3290           0 :     zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    3291             : 
    3292           0 :     if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
    3293           0 :       col[i32 + 0] = _mm_setzero_si128();
    3294           0 :       col[i32 + 1] = _mm_setzero_si128();
    3295           0 :       col[i32 + 2] = _mm_setzero_si128();
    3296           0 :       col[i32 + 3] = _mm_setzero_si128();
    3297           0 :       col[i32 + 4] = _mm_setzero_si128();
    3298           0 :       col[i32 + 5] = _mm_setzero_si128();
    3299           0 :       col[i32 + 6] = _mm_setzero_si128();
    3300           0 :       col[i32 + 7] = _mm_setzero_si128();
    3301           0 :       col[i32 + 8] = _mm_setzero_si128();
    3302           0 :       col[i32 + 9] = _mm_setzero_si128();
    3303           0 :       col[i32 + 10] = _mm_setzero_si128();
    3304           0 :       col[i32 + 11] = _mm_setzero_si128();
    3305           0 :       col[i32 + 12] = _mm_setzero_si128();
    3306           0 :       col[i32 + 13] = _mm_setzero_si128();
    3307           0 :       col[i32 + 14] = _mm_setzero_si128();
    3308           0 :       col[i32 + 15] = _mm_setzero_si128();
    3309           0 :       col[i32 + 16] = _mm_setzero_si128();
    3310           0 :       col[i32 + 17] = _mm_setzero_si128();
    3311           0 :       col[i32 + 18] = _mm_setzero_si128();
    3312           0 :       col[i32 + 19] = _mm_setzero_si128();
    3313           0 :       col[i32 + 20] = _mm_setzero_si128();
    3314           0 :       col[i32 + 21] = _mm_setzero_si128();
    3315           0 :       col[i32 + 22] = _mm_setzero_si128();
    3316           0 :       col[i32 + 23] = _mm_setzero_si128();
    3317           0 :       col[i32 + 24] = _mm_setzero_si128();
    3318           0 :       col[i32 + 25] = _mm_setzero_si128();
    3319           0 :       col[i32 + 26] = _mm_setzero_si128();
    3320           0 :       col[i32 + 27] = _mm_setzero_si128();
    3321           0 :       col[i32 + 28] = _mm_setzero_si128();
    3322           0 :       col[i32 + 29] = _mm_setzero_si128();
    3323           0 :       col[i32 + 30] = _mm_setzero_si128();
    3324           0 :       col[i32 + 31] = _mm_setzero_si128();
    3325           0 :       continue;
    3326             :     }
    3327             : 
    3328             :     // Transpose 32x8 block to 8x32 block
    3329           0 :     array_transpose_8x8(in, in);
    3330           0 :     array_transpose_8x8(in + 8, in + 8);
    3331           0 :     array_transpose_8x8(in + 16, in + 16);
    3332           0 :     array_transpose_8x8(in + 24, in + 24);
    3333             : 
    3334           0 :     IDCT32
    3335             : 
    3336             :     // 1_D: Store 32 intermediate results for each 8x32 block.
    3337           0 :     col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    3338           0 :     col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    3339           0 :     col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    3340           0 :     col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    3341           0 :     col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    3342           0 :     col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    3343           0 :     col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    3344           0 :     col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    3345           0 :     col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    3346           0 :     col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    3347           0 :     col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    3348           0 :     col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    3349           0 :     col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    3350           0 :     col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    3351           0 :     col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    3352           0 :     col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    3353           0 :     col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    3354           0 :     col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    3355           0 :     col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    3356           0 :     col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    3357           0 :     col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    3358           0 :     col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    3359           0 :     col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    3360           0 :     col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    3361           0 :     col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    3362           0 :     col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    3363           0 :     col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    3364           0 :     col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    3365           0 :     col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    3366           0 :     col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    3367           0 :     col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    3368           0 :     col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    3369             :   }
    3370           0 :   for (i = 0; i < 4; i++) {
    3371             :     // Second 1-D idct
    3372           0 :     j = i << 3;
    3373             : 
    3374             :     // Transpose 32x8 block to 8x32 block
    3375           0 :     array_transpose_8x8(col + j, in);
    3376           0 :     array_transpose_8x8(col + j + 32, in + 8);
    3377           0 :     array_transpose_8x8(col + j + 64, in + 16);
    3378           0 :     array_transpose_8x8(col + j + 96, in + 24);
    3379             : 
    3380           0 :     IDCT32
    3381             : 
    3382             :     // 2_D: Calculate the results and store them to destination.
    3383           0 :     in[0] = _mm_add_epi16(stp1_0, stp1_31);
    3384           0 :     in[1] = _mm_add_epi16(stp1_1, stp1_30);
    3385           0 :     in[2] = _mm_add_epi16(stp1_2, stp1_29);
    3386           0 :     in[3] = _mm_add_epi16(stp1_3, stp1_28);
    3387           0 :     in[4] = _mm_add_epi16(stp1_4, stp1_27);
    3388           0 :     in[5] = _mm_add_epi16(stp1_5, stp1_26);
    3389           0 :     in[6] = _mm_add_epi16(stp1_6, stp1_25);
    3390           0 :     in[7] = _mm_add_epi16(stp1_7, stp1_24);
    3391           0 :     in[8] = _mm_add_epi16(stp1_8, stp1_23);
    3392           0 :     in[9] = _mm_add_epi16(stp1_9, stp1_22);
    3393           0 :     in[10] = _mm_add_epi16(stp1_10, stp1_21);
    3394           0 :     in[11] = _mm_add_epi16(stp1_11, stp1_20);
    3395           0 :     in[12] = _mm_add_epi16(stp1_12, stp1_19);
    3396           0 :     in[13] = _mm_add_epi16(stp1_13, stp1_18);
    3397           0 :     in[14] = _mm_add_epi16(stp1_14, stp1_17);
    3398           0 :     in[15] = _mm_add_epi16(stp1_15, stp1_16);
    3399           0 :     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    3400           0 :     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    3401           0 :     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    3402           0 :     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    3403           0 :     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    3404           0 :     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    3405           0 :     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    3406           0 :     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    3407           0 :     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    3408           0 :     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    3409           0 :     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    3410           0 :     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    3411           0 :     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    3412           0 :     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    3413           0 :     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    3414           0 :     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
    3415             : 
    3416           0 :     for (j = 0; j < 32; ++j) {
    3417             :       // Final rounding and shift
    3418           0 :       in[j] = _mm_adds_epi16(in[j], final_rounding);
    3419           0 :       in[j] = _mm_srai_epi16(in[j], 6);
    3420           0 :       RECON_AND_STORE(dest + j * stride, in[j]);
    3421             :     }
    3422             : 
    3423           0 :     dest += 8;
    3424             :   }
    3425           0 : }
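
The all-zero early-out near the top of the first pass above OR-reduces the 32 input vectors in an unrolled tree (zero_idx[]) and tests the final register with a compare-and-movemask; when it fires, the corresponding 8x32 slice of col[] is zeroed and the 1-D transform is skipped. A compact standalone sketch of the same test, written as a plain loop instead of the unrolled tree:

#include <emmintrin.h>

/* Returns 1 when all 32 loaded coefficient vectors are zero, i.e. the first
 * 1-D idct over this slice can be skipped. */
static int sketch_slice_is_all_zero(const __m128i in[32]) {
  __m128i acc = in[0];
  int k;
  for (k = 1; k < 32; ++k) acc = _mm_or_si128(acc, in[k]);
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, _mm_setzero_si128())) == 0xFFFF;
}
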
    3426             : 
    3427           0 : void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
    3428             :                               int stride) {
    3429             :   __m128i dc_value;
    3430           0 :   const __m128i zero = _mm_setzero_si128();
    3431             :   int a, j;
    3432             : 
    3433           0 :   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
    3434           0 :   a = (int)dct_const_round_shift(a * cospi_16_64);
    3435           0 :   a = ROUND_POWER_OF_TWO(a, 6);
    3436             : 
    3437           0 :   dc_value = _mm_set1_epi16(a);
    3438             : 
    3439           0 :   for (j = 0; j < 32; ++j) {
    3440           0 :     RECON_AND_STORE(dest + 0 + j * stride, dc_value);
    3441           0 :     RECON_AND_STORE(dest + 8 + j * stride, dc_value);
    3442           0 :     RECON_AND_STORE(dest + 16 + j * stride, dc_value);
    3443           0 :     RECON_AND_STORE(dest + 24 + j * stride, dc_value);
    3444             :   }
    3445           0 : }
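
The DC-only path above scales input[0] by cospi_16_64 once per 1-D pass and then applies ROUND_POWER_OF_TWO(., 6), so a single value is splatted and added to every pixel of the 32x32 block. A worked sketch of that arithmetic, assuming cospi_16_64 == 11585 and DCT_CONST_BITS == 14 from vpx_dsp/txfm_common.h (the numbers in the comments are just an example input):

#include <stdio.h>

static int sketch_dct_round_shift(long long x) { return (int)((x + 8192) >> 14); }

int main(void) {
  const int dc = 64;                            /* example dequantized DC coeff */
  int a = sketch_dct_round_shift(dc * 11585LL); /* first 1-D pass  -> 45 */
  a = sketch_dct_round_shift(a * 11585LL);      /* second 1-D pass -> 32 */
  a = (a + 32) >> 6;                            /* ROUND_POWER_OF_TWO(a, 6) -> 1 */
  printf("dc_value = %d\n", a);                 /* added to every output pixel */
  return 0;
}
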
    3446             : 
    3447             : #if CONFIG_VP9_HIGHBITDEPTH
    3448             : static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
    3449             :   __m128i ubounded, retval;
    3450             :   const __m128i zero = _mm_set1_epi16(0);
    3451             :   const __m128i one = _mm_set1_epi16(1);
    3452             :   const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
    3453             :   ubounded = _mm_cmpgt_epi16(value, max);
    3454             :   retval = _mm_andnot_si128(ubounded, value);
    3455             :   ubounded = _mm_and_si128(ubounded, max);
    3456             :   retval = _mm_or_si128(retval, ubounded);
    3457             :   retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
    3458             :   return retval;
    3459             : }
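
clamp_high_sse2() above builds a per-lane clamp to [0, (1 << bd) - 1] from compare/andnot/or: lanes above the maximum are replaced by the maximum, and the final AND with a greater-than-zero mask zeroes any non-positive lane. A scalar equivalent for one lane, written as a sketch rather than library code:

#include <stdint.h>

static int16_t sketch_clamp_high(int16_t value, int bd) {
  const int16_t max = (int16_t)((1 << bd) - 1);
  if (value > max) return max;
  if (value < 0) return 0; /* the vector code ANDs with cmpgt(value, 0) */
  return value;
}
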
    3460             : 
    3461             : void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
    3462             :                                     int stride, int bd) {
    3463             :   tran_low_t out[4 * 4];
    3464             :   tran_low_t *outptr = out;
    3465             :   int i, j;
    3466             :   __m128i inptr[4];
    3467             :   __m128i sign_bits[2];
    3468             :   __m128i temp_mm, min_input, max_input;
    3469             :   int test;
    3470             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    3471             :   int optimised_cols = 0;
    3472             :   const __m128i zero = _mm_set1_epi16(0);
    3473             :   const __m128i eight = _mm_set1_epi16(8);
    3474             :   const __m128i max = _mm_set1_epi16(12043);
    3475             :   const __m128i min = _mm_set1_epi16(-12043);
    3476             :   // Load input into __m128i
    3477             :   inptr[0] = _mm_loadu_si128((const __m128i *)input);
    3478             :   inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
    3479             :   inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
    3480             :   inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
    3481             : 
    3482             :   // Pack to 16 bits
    3483             :   inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
    3484             :   inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
    3485             : 
    3486             :   max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3487             :   min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3488             :   max_input = _mm_cmpgt_epi16(max_input, max);
    3489             :   min_input = _mm_cmplt_epi16(min_input, min);
    3490             :   temp_mm = _mm_or_si128(max_input, min_input);
    3491             :   test = _mm_movemask_epi8(temp_mm);
    3492             : 
    3493             :   if (!test) {
    3494             :     // Do the row transform
    3495             :     idct4_sse2(inptr);
    3496             : 
    3497             :     // Check the min & max values
    3498             :     max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3499             :     min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3500             :     max_input = _mm_cmpgt_epi16(max_input, max);
    3501             :     min_input = _mm_cmplt_epi16(min_input, min);
    3502             :     temp_mm = _mm_or_si128(max_input, min_input);
    3503             :     test = _mm_movemask_epi8(temp_mm);
    3504             : 
    3505             :     if (test) {
    3506             :       transpose_4x4(inptr);
    3507             :       sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
    3508             :       sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
    3509             :       inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
    3510             :       inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
    3511             :       inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
    3512             :       inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
    3513             :       _mm_storeu_si128((__m128i *)outptr, inptr[0]);
    3514             :       _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
    3515             :       _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
    3516             :       _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
    3517             :     } else {
    3518             :       // Set to use the optimised transform for the column
    3519             :       optimised_cols = 1;
    3520             :     }
    3521             :   } else {
    3522             :     // Run the un-optimised row transform
    3523             :     for (i = 0; i < 4; ++i) {
    3524             :       vpx_highbd_idct4_c(input, outptr, bd);
    3525             :       input += 4;
    3526             :       outptr += 4;
    3527             :     }
    3528             :   }
    3529             : 
    3530             :   if (optimised_cols) {
    3531             :     idct4_sse2(inptr);
    3532             : 
    3533             :     // Final round and shift
    3534             :     inptr[0] = _mm_add_epi16(inptr[0], eight);
    3535             :     inptr[1] = _mm_add_epi16(inptr[1], eight);
    3536             : 
    3537             :     inptr[0] = _mm_srai_epi16(inptr[0], 4);
    3538             :     inptr[1] = _mm_srai_epi16(inptr[1], 4);
    3539             : 
    3540             :     // Reconstruction and Store
    3541             :     {
    3542             :       __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
    3543             :       __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
    3544             :       d0 = _mm_unpacklo_epi64(
    3545             :           d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
    3546             :       d2 = _mm_unpacklo_epi64(
    3547             :           d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
    3548             :       d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
    3549             :       d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
    3550             :       // store row 0
    3551             :       _mm_storel_epi64((__m128i *)dest, d0);
    3552             :       // store row 1
    3553             :       d0 = _mm_srli_si128(d0, 8);
    3554             :       _mm_storel_epi64((__m128i *)(dest + stride), d0);
    3555             :       // store row 2
    3556             :       _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
    3557             :       // store row 3
    3558             :       d2 = _mm_srli_si128(d2, 8);
    3559             :       _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
    3560             :     }
    3561             :   } else {
    3562             :     // Run the un-optimised column transform
    3563             :     tran_low_t temp_in[4], temp_out[4];
    3564             :     // Columns
    3565             :     for (i = 0; i < 4; ++i) {
    3566             :       for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    3567             :       vpx_highbd_idct4_c(temp_in, temp_out, bd);
    3568             :       for (j = 0; j < 4; ++j) {
    3569             :         dest[j * stride + i] = highbd_clip_pixel_add(
    3570             :             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    3571             :       }
    3572             :     }
    3573             :   }
    3574             : }
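
vpx_highbd_idct4x4_16_add_sse2() packs the 32-bit coefficients to 16 bits and only takes the fast SSE2 transform when every value lies inside ±12043, apparently a bound chosen so that no intermediate of the 16-bit idct4 can overflow int16; when the check fails it falls back to the 32-bit vpx_highbd_idct4_c() path. A scalar sketch of that gate (the bound argument and helper name are illustrative):

#include <stdint.h>

/* Returns 1 when all n packed coefficients fit inside [-bound, bound] and the
 * 16-bit SSE2 transform is therefore considered safe to use. */
static int sketch_fits_fast_path(const int16_t *coeffs, int n, int16_t bound) {
  int k;
  for (k = 0; k < n; ++k) {
    if (coeffs[k] > bound || coeffs[k] < -bound) return 0;
  }
  return 1;
}
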
    3575             : 
    3576             : void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
    3577             :                                     int stride, int bd) {
    3578             :   tran_low_t out[8 * 8];
    3579             :   tran_low_t *outptr = out;
    3580             :   int i, j, test;
    3581             :   __m128i inptr[8];
    3582             :   __m128i min_input, max_input, temp1, temp2, sign_bits;
    3583             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    3584             :   const __m128i zero = _mm_set1_epi16(0);
    3585             :   const __m128i sixteen = _mm_set1_epi16(16);
    3586             :   const __m128i max = _mm_set1_epi16(6201);
    3587             :   const __m128i min = _mm_set1_epi16(-6201);
    3588             :   int optimised_cols = 0;
    3589             : 
    3590             :   // Load input into __m128i & pack to 16 bits
    3591             :   for (i = 0; i < 8; i++) {
    3592             :     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    3593             :     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    3594             :     inptr[i] = _mm_packs_epi32(temp1, temp2);
    3595             :   }
    3596             : 
    3597             :   // Find the min & max for the row transform
    3598             :   max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3599             :   min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3600             :   for (i = 2; i < 8; i++) {
    3601             :     max_input = _mm_max_epi16(max_input, inptr[i]);
    3602             :     min_input = _mm_min_epi16(min_input, inptr[i]);
    3603             :   }
    3604             :   max_input = _mm_cmpgt_epi16(max_input, max);
    3605             :   min_input = _mm_cmplt_epi16(min_input, min);
    3606             :   temp1 = _mm_or_si128(max_input, min_input);
    3607             :   test = _mm_movemask_epi8(temp1);
    3608             : 
    3609             :   if (!test) {
    3610             :     // Do the row transform
    3611             :     idct8_sse2(inptr);
    3612             : 
    3613             :     // Find the min & max for the column transform
    3614             :     max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3615             :     min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3616             :     for (i = 2; i < 8; i++) {
    3617             :       max_input = _mm_max_epi16(max_input, inptr[i]);
    3618             :       min_input = _mm_min_epi16(min_input, inptr[i]);
    3619             :     }
    3620             :     max_input = _mm_cmpgt_epi16(max_input, max);
    3621             :     min_input = _mm_cmplt_epi16(min_input, min);
    3622             :     temp1 = _mm_or_si128(max_input, min_input);
    3623             :     test = _mm_movemask_epi8(temp1);
    3624             : 
    3625             :     if (test) {
    3626             :       array_transpose_8x8(inptr, inptr);
    3627             :       for (i = 0; i < 8; i++) {
    3628             :         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
    3629             :         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
    3630             :         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
    3631             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
    3632             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
    3633             :       }
    3634             :     } else {
    3635             :       // Set to use the optimised transform for the column
    3636             :       optimised_cols = 1;
    3637             :     }
    3638             :   } else {
    3639             :     // Run the un-optimised row transform
    3640             :     for (i = 0; i < 8; ++i) {
    3641             :       vpx_highbd_idct8_c(input, outptr, bd);
    3642             :       input += 8;
    3643             :       outptr += 8;
    3644             :     }
    3645             :   }
    3646             : 
    3647             :   if (optimised_cols) {
    3648             :     idct8_sse2(inptr);
    3649             : 
    3650             :     // Final round & shift, then reconstruct and store
    3651             :     {
    3652             :       __m128i d[8];
    3653             :       for (i = 0; i < 8; i++) {
    3654             :         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
    3655             :         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
    3656             :         inptr[i] = _mm_srai_epi16(inptr[i], 5);
    3657             :         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
    3658             :         // Store
    3659             :         _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
    3660             :       }
    3661             :     }
    3662             :   } else {
    3663             :     // Run the un-optimised column transform
    3664             :     tran_low_t temp_in[8], temp_out[8];
    3665             :     for (i = 0; i < 8; ++i) {
    3666             :       for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    3667             :       vpx_highbd_idct8_c(temp_in, temp_out, bd);
    3668             :       for (j = 0; j < 8; ++j) {
    3669             :         dest[j * stride + i] = highbd_clip_pixel_add(
    3670             :             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    3671             :       }
    3672             :     }
    3673             :   }
    3674             : }
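
When the post-row-transform range check fails in the highbd functions above, the 16-bit row result is handed back to the 32-bit C column transform, so each register is widened to tran_low_t by unpacking it against a mask of its own sign bits. A standalone sketch of that widening step (helper name is illustrative):

#include <emmintrin.h>

/* Widen eight int16 lanes to two vectors of four int32 lanes: interleaving a
 * value with its sign mask (0x0000 or 0xFFFF per lane) is a sign extension. */
static void sketch_widen_epi16_to_epi32(__m128i v, __m128i *lo, __m128i *hi) {
  const __m128i sign = _mm_cmplt_epi16(v, _mm_setzero_si128());
  *lo = _mm_unpacklo_epi16(v, sign); /* lanes 0..3 */
  *hi = _mm_unpackhi_epi16(v, sign); /* lanes 4..7 */
}
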
    3675             : 
    3676             : void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
    3677             :                                     int stride, int bd) {
    3678             :   tran_low_t out[8 * 8] = { 0 };
    3679             :   tran_low_t *outptr = out;
    3680             :   int i, j, test;
    3681             :   __m128i inptr[8];
    3682             :   __m128i min_input, max_input, temp1, temp2, sign_bits;
    3683             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    3684             :   const __m128i zero = _mm_set1_epi16(0);
    3685             :   const __m128i sixteen = _mm_set1_epi16(16);
    3686             :   const __m128i max = _mm_set1_epi16(6201);
    3687             :   const __m128i min = _mm_set1_epi16(-6201);
    3688             :   int optimised_cols = 0;
    3689             : 
    3690             :   // Load input into __m128i & pack to 16 bits
    3691             :   for (i = 0; i < 8; i++) {
    3692             :     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    3693             :     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    3694             :     inptr[i] = _mm_packs_epi32(temp1, temp2);
    3695             :   }
    3696             : 
    3697             :   // Find the min & max for the row transform
    3698             :   // Only the first 4 rows have non-zero coeffs
    3699             :   max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3700             :   min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3701             :   for (i = 2; i < 4; i++) {
    3702             :     max_input = _mm_max_epi16(max_input, inptr[i]);
    3703             :     min_input = _mm_min_epi16(min_input, inptr[i]);
    3704             :   }
    3705             :   max_input = _mm_cmpgt_epi16(max_input, max);
    3706             :   min_input = _mm_cmplt_epi16(min_input, min);
    3707             :   temp1 = _mm_or_si128(max_input, min_input);
    3708             :   test = _mm_movemask_epi8(temp1);
    3709             : 
    3710             :   if (!test) {
    3711             :     // Do the row transform
    3712             :     idct8_sse2(inptr);
    3713             : 
    3714             :     // Find the min & max for the column transform
    3715             :     // N.B. Only first 4 cols contain non-zero coeffs
    3716             :     max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3717             :     min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3718             :     for (i = 2; i < 8; i++) {
    3719             :       max_input = _mm_max_epi16(max_input, inptr[i]);
    3720             :       min_input = _mm_min_epi16(min_input, inptr[i]);
    3721             :     }
    3722             :     max_input = _mm_cmpgt_epi16(max_input, max);
    3723             :     min_input = _mm_cmplt_epi16(min_input, min);
    3724             :     temp1 = _mm_or_si128(max_input, min_input);
    3725             :     test = _mm_movemask_epi8(temp1);
    3726             : 
    3727             :     if (test) {
    3728             :       // Use the fact that only the first 4 rows contain non-zero coeffs
    3729             :       array_transpose_4X8(inptr, inptr);
    3730             :       for (i = 0; i < 4; i++) {
    3731             :         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
    3732             :         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
    3733             :         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
    3734             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
    3735             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
    3736             :       }
    3737             :     } else {
    3738             :       // Set to use the optimised transform for the column
    3739             :       optimised_cols = 1;
    3740             :     }
    3741             :   } else {
    3742             :     // Run the un-optimised row transform
    3743             :     for (i = 0; i < 4; ++i) {
    3744             :       vpx_highbd_idct8_c(input, outptr, bd);
    3745             :       input += 8;
    3746             :       outptr += 8;
    3747             :     }
    3748             :   }
    3749             : 
    3750             :   if (optimised_cols) {
    3751             :     idct8_sse2(inptr);
    3752             : 
    3753             :     // Final round & shift, then reconstruct and store
    3754             :     {
    3755             :       __m128i d[8];
    3756             :       for (i = 0; i < 8; i++) {
    3757             :         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
    3758             :         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
    3759             :         inptr[i] = _mm_srai_epi16(inptr[i], 5);
    3760             :         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
    3761             :         // Store
    3762             :         _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
    3763             :       }
    3764             :     }
    3765             :   } else {
    3766             :     // Run the un-optimised column transform
    3767             :     tran_low_t temp_in[8], temp_out[8];
    3768             :     for (i = 0; i < 8; ++i) {
    3769             :       for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    3770             :       vpx_highbd_idct8_c(temp_in, temp_out, bd);
    3771             :       for (j = 0; j < 8; ++j) {
    3772             :         dest[j * stride + i] = highbd_clip_pixel_add(
    3773             :             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    3774             :       }
    3775             :     }
    3776             :   }
    3777             : }
    3778             : 
    3779             : void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
    3780             :                                        int stride, int bd) {
    3781             :   tran_low_t out[16 * 16];
    3782             :   tran_low_t *outptr = out;
    3783             :   int i, j, test;
    3784             :   __m128i inptr[32];
    3785             :   __m128i min_input, max_input, temp1, temp2, sign_bits;
    3786             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    3787             :   const __m128i zero = _mm_set1_epi16(0);
    3788             :   const __m128i rounding = _mm_set1_epi16(32);
    3789             :   const __m128i max = _mm_set1_epi16(3155);
    3790             :   const __m128i min = _mm_set1_epi16(-3155);
    3791             :   int optimised_cols = 0;
    3792             : 
    3793             :   // Load input into __m128i & pack to 16 bits
    3794             :   for (i = 0; i < 16; i++) {
    3795             :     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    3796             :     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    3797             :     inptr[i] = _mm_packs_epi32(temp1, temp2);
    3798             :     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    3799             :     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    3800             :     inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
    3801             :   }
    3802             : 
    3803             :   // Find the min & max for the row transform
    3804             :   max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3805             :   min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3806             :   for (i = 2; i < 32; i++) {
    3807             :     max_input = _mm_max_epi16(max_input, inptr[i]);
    3808             :     min_input = _mm_min_epi16(min_input, inptr[i]);
    3809             :   }
    3810             :   max_input = _mm_cmpgt_epi16(max_input, max);
    3811             :   min_input = _mm_cmplt_epi16(min_input, min);
    3812             :   temp1 = _mm_or_si128(max_input, min_input);
    3813             :   test = _mm_movemask_epi8(temp1);
    3814             : 
    3815             :   if (!test) {
    3816             :     // Do the row transform
    3817             :     idct16_sse2(inptr, inptr + 16);
    3818             : 
    3819             :     // Find the min & max for the column transform
    3820             :     max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3821             :     min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3822             :     for (i = 2; i < 32; i++) {
    3823             :       max_input = _mm_max_epi16(max_input, inptr[i]);
    3824             :       min_input = _mm_min_epi16(min_input, inptr[i]);
    3825             :     }
    3826             :     max_input = _mm_cmpgt_epi16(max_input, max);
    3827             :     min_input = _mm_cmplt_epi16(min_input, min);
    3828             :     temp1 = _mm_or_si128(max_input, min_input);
    3829             :     test = _mm_movemask_epi8(temp1);
    3830             : 
    3831             :     if (test) {
    3832             :       array_transpose_16x16(inptr, inptr + 16);
    3833             :       for (i = 0; i < 16; i++) {
    3834             :         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
    3835             :         temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
    3836             :         temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
    3837             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
    3838             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
    3839             :         sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
    3840             :         temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
    3841             :         temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
    3842             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
    3843             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
    3844             :       }
    3845             :     } else {
    3846             :       // Set to use the optimised transform for the column
    3847             :       optimised_cols = 1;
    3848             :     }
    3849             :   } else {
    3850             :     // Run the un-optimised row transform
    3851             :     for (i = 0; i < 16; ++i) {
    3852             :       vpx_highbd_idct16_c(input, outptr, bd);
    3853             :       input += 16;
    3854             :       outptr += 16;
    3855             :     }
    3856             :   }
    3857             : 
    3858             :   if (optimised_cols) {
    3859             :     idct16_sse2(inptr, inptr + 16);
    3860             : 
    3861             :     // Final round & shift, then reconstruct and store
    3862             :     {
    3863             :       __m128i d[2];
    3864             :       for (i = 0; i < 16; i++) {
    3865             :         inptr[i] = _mm_add_epi16(inptr[i], rounding);
    3866             :         inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
    3867             :         d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
    3868             :         d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
    3869             :         inptr[i] = _mm_srai_epi16(inptr[i], 6);
    3870             :         inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
    3871             :         d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
    3872             :         d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
    3873             :         // Store
    3874             :         _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
    3875             :         _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
    3876             :       }
    3877             :     }
    3878             :   } else {
    3879             :     // Run the un-optimised column transform
    3880             :     tran_low_t temp_in[16], temp_out[16];
    3881             :     for (i = 0; i < 16; ++i) {
    3882             :       for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    3883             :       vpx_highbd_idct16_c(temp_in, temp_out, bd);
    3884             :       for (j = 0; j < 16; ++j) {
    3885             :         dest[j * stride + i] = highbd_clip_pixel_add(
    3886             :             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    3887             :       }
    3888             :     }
    3889             :   }
    3890             : }
    3891             : 
    3892             : void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
    3893             :                                       int stride, int bd) {
    3894             :   tran_low_t out[16 * 16] = { 0 };
    3895             :   tran_low_t *outptr = out;
    3896             :   int i, j, test;
    3897             :   __m128i inptr[32];
    3898             :   __m128i min_input, max_input, temp1, temp2, sign_bits;
    3899             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    3900             :   const __m128i zero = _mm_set1_epi16(0);
    3901             :   const __m128i rounding = _mm_set1_epi16(32);
    3902             :   const __m128i max = _mm_set1_epi16(3155);
    3903             :   const __m128i min = _mm_set1_epi16(-3155);
    3904             :   int optimised_cols = 0;
    3905             : 
    3906             :   // Load input into __m128i & pack to 16 bits
    3907             :   for (i = 0; i < 16; i++) {
    3908             :     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    3909             :     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    3910             :     inptr[i] = _mm_packs_epi32(temp1, temp2);
    3911             :     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    3912             :     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    3913             :     inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
    3914             :   }
    3915             : 
    3916             :   // Find the min & max for the row transform
    3917             :   // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
    3918             :   // we only need to consider the first 4 rows here.
    3919             :   max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3920             :   min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3921             :   for (i = 2; i < 4; i++) {
    3922             :     max_input = _mm_max_epi16(max_input, inptr[i]);
    3923             :     min_input = _mm_min_epi16(min_input, inptr[i]);
    3924             :   }
    3925             :   max_input = _mm_cmpgt_epi16(max_input, max);
    3926             :   min_input = _mm_cmplt_epi16(min_input, min);
    3927             :   temp1 = _mm_or_si128(max_input, min_input);
    3928             :   test = _mm_movemask_epi8(temp1);
    3929             : 
    3930             :   if (!test) {
    3931             :     // Do the row transform (N.B. This transposes inptr)
    3932             :     idct16_sse2(inptr, inptr + 16);
    3933             : 
    3934             :     // Find the min & max for the column transform
    3935             :     // N.B. Only first 4 cols contain non-zero coeffs
    3936             :     max_input = _mm_max_epi16(inptr[0], inptr[1]);
    3937             :     min_input = _mm_min_epi16(inptr[0], inptr[1]);
    3938             :     for (i = 2; i < 16; i++) {
    3939             :       max_input = _mm_max_epi16(max_input, inptr[i]);
    3940             :       min_input = _mm_min_epi16(min_input, inptr[i]);
    3941             :     }
    3942             :     max_input = _mm_cmpgt_epi16(max_input, max);
    3943             :     min_input = _mm_cmplt_epi16(min_input, min);
    3944             :     temp1 = _mm_or_si128(max_input, min_input);
    3945             :     test = _mm_movemask_epi8(temp1);
    3946             : 
    3947             :     if (test) {
    3948             :       // Use fact only first 4 rows contain non-zero coeffs
    3949             :       array_transpose_8x8(inptr, inptr);
    3950             :       array_transpose_8x8(inptr + 8, inptr + 16);
    3951             :       for (i = 0; i < 4; i++) {
    3952             :         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
    3953             :         temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
    3954             :         temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
    3955             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
    3956             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
    3957             :         sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
    3958             :         temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
    3959             :         temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
    3960             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
    3961             :         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
    3962             :       }
    3963             :     } else {
    3964             :       // Set a flag to use the optimised column transform
    3965             :       optimised_cols = 1;
    3966             :     }
    3967             :   } else {
    3968             :     // Run the un-optimised row transform
    3969             :     for (i = 0; i < 4; ++i) {
    3970             :       vpx_highbd_idct16_c(input, outptr, bd);
    3971             :       input += 16;
    3972             :       outptr += 16;
    3973             :     }
    3974             :   }
    3975             : 
    3976             :   if (optimised_cols) {
    3977             :     idct16_sse2(inptr, inptr + 16);
    3978             : 
    3979             :     // Final round & shift, then reconstruction and store
    3980             :     {
    3981             :       __m128i d[2];
    3982             :       for (i = 0; i < 16; i++) {
    3983             :         inptr[i] = _mm_add_epi16(inptr[i], rounding);
    3984             :         inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
    3985             :         d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
    3986             :         d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
    3987             :         inptr[i] = _mm_srai_epi16(inptr[i], 6);
    3988             :         inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
    3989             :         d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
    3990             :         d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
    3991             :         // Store
    3992             :         _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
    3993             :         _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
    3994             :       }
    3995             :     }
    3996             :   } else {
    3997             :     // Run the un-optimised column transform
    3998             :     tran_low_t temp_in[16], temp_out[16];
    3999             :     for (i = 0; i < 16; ++i) {
    4000             :       for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    4001             :       vpx_highbd_idct16_c(temp_in, temp_out, bd);
    4002             :       for (j = 0; j < 16; ++j) {
    4003             :         dest[j * stride + i] = highbd_clip_pixel_add(
    4004             :             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    4005             :       }
    4006             :     }
    4007             :   }
    4008             : }
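
Both 16x16 functions gate the optimised path on the same idea: if every coefficient that can be non-zero lies within a precomputed bound (here ±3155 for the eob ≤ 10 variant, presumably derived from a worst-case analysis of the 16-bit intermediates of idct16_sse2), the packed 16-bit transform is used; otherwise the code falls back to the 32-bit vpx_highbd_idct16_c transform. The sketch below expresses that decision in scalar form; coeffs_fit_16bit_path and its rows/cols parameters are illustrative names, not library API.

    /* Scalar sketch of the overflow gate that selects between the packed
     * 16-bit SSE2 transform and the 32-bit C fallback.  rows/cols describe
     * the region that may hold non-zero coefficients (4x4 for the eob <= 10
     * case); bound is the per-coefficient limit (3155 above). */
    static int coeffs_fit_16bit_path(const tran_low_t *input, int rows,
                                     int cols, int bound) {
      int r, c;
      for (r = 0; r < rows; ++r) {
        for (c = 0; c < cols; ++c) {
          const tran_low_t v = input[r * 16 + c];  /* 16x16 coefficient block */
          if (v > bound || v < -bound) return 0;   /* out of range: use C path */
        }
      }
      return 1;  /* safe to run the 16-bit path */
    }

In the SSE2 code the same test runs eight coefficients at a time: _mm_cmpgt_epi16 and _mm_cmplt_epi16 build lane masks against the bounds and a single _mm_movemask_epi8 collapses them into the scalar test flag.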
    4009             : 
    4010             : void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
    4011             :                                      int stride, int bd) {
    4012             :   __m128i dc_value, d;
    4013             :   const __m128i zero = _mm_setzero_si128();
    4014             :   const __m128i one = _mm_set1_epi16(1);
    4015             :   const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
    4016             :   int a, i, j;
    4017             :   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    4018             :   tran_low_t out;
    4019             : 
    4020             :   out = dct_const_round_shift(input[0] * cospi_16_64);
    4021             :   out = dct_const_round_shift(out * cospi_16_64);
    4022             :   a = ROUND_POWER_OF_TWO(out, 6);
    4023             : 
    4024             :   d = _mm_set1_epi32(a);
    4025             :   dc_value = _mm_packs_epi32(d, d);
    4026             :   for (i = 0; i < 32; ++i) {
    4027             :     for (j = 0; j < 4; ++j) {
    4028             :       d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
    4029             :       d = _mm_adds_epi16(d, dc_value);
    4030             :       d = _mm_max_epi16(d, zero);
    4031             :       d = _mm_min_epi16(d, max);
    4032             :       _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
    4033             :     }
    4034             :     dest += stride;
    4035             :   }
    4036             : }
    4037             : #endif  // CONFIG_VP9_HIGHBITDEPTH
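
The 32x32 DC-only function above exploits the fact that with a single non-zero (DC) coefficient the 2-D inverse transform degenerates to one constant: input[0] is scaled by cospi_16_64 twice through dct_const_round_shift, rounded and shifted right by 6, and the resulting value a is added to every pixel of the 32x32 block with clamping, eight pixels per store. The scalar sketch below shows the same computation; it assumes cospi_16_64 = 11585 and the library's 14-bit DCT_CONST_BITS rounding, and the function name is hypothetical.

    #include <stdint.h>

    typedef int32_t tran_low_t;  /* matches the high-bit-depth vpx_dsp typedef */

    /* Scalar sketch of the DC-only path of vpx_highbd_idct32x32_1_add_sse2:
     * compute the single residual value and add it, clamped, to each pixel. */
    static void highbd_idct32x32_dc_add_sketch(const tran_low_t *input,
                                               uint16_t *dest, int stride,
                                               int bd) {
      const int max = (1 << bd) - 1;
      int64_t out;
      int a, v, i, j;
      out = ((int64_t)input[0] * 11585 + (1 << 13)) >> 14;  /* dct_const_round_shift */
      out = (out * 11585 + (1 << 13)) >> 14;                /* applied a second time */
      a = (int)((out + 32) >> 6);                           /* ROUND_POWER_OF_TWO(out, 6) */
      for (i = 0; i < 32; ++i) {
        for (j = 0; j < 32; ++j) {
          v = dest[j] + a;
          if (v < 0) v = 0;        /* clamp to [0, (1 << bd) - 1] */
          if (v > max) v = max;
          dest[j] = (uint16_t)v;
        }
        dest += stride;
      }
    }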

Generated by: LCOV version 1.13