LCOV - code coverage report
Current view: top level - media/libvpx/libvpx/vp9/encoder/x86 - vp9_dct_intrin_sse2.c (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18
Legend: Lines: hit | not hit
                    Hit    Total   Coverage
Lines:                0     1605      0.0 %
Functions:            0       23      0.0 %

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include <assert.h>
      12             : #include <emmintrin.h>  // SSE2
      13             : 
      14             : #include "./vp9_rtcd.h"
      15             : #include "./vpx_dsp_rtcd.h"
      16             : #include "vpx_dsp/txfm_common.h"
      17             : #include "vpx_dsp/x86/fwd_txfm_sse2.h"
      18             : #include "vpx_dsp/x86/txfm_common_sse2.h"
      19             : #include "vpx_ports/mem.h"
      20             : 
      21           0 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
      22             :                                    int stride) {
      23           0 :   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
      24           0 :   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
      25             :   __m128i mask;
      26             : 
      27           0 :   in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      28           0 :   in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      29           0 :   in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      30           0 :   in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      31             : 
      32           0 :   in[0] = _mm_slli_epi16(in[0], 4);
      33           0 :   in[1] = _mm_slli_epi16(in[1], 4);
      34           0 :   in[2] = _mm_slli_epi16(in[2], 4);
      35           0 :   in[3] = _mm_slli_epi16(in[3], 4);
      36             : 
      37           0 :   mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
      38           0 :   in[0] = _mm_add_epi16(in[0], mask);
      39           0 :   in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
      40           0 : }
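A minimal scalar sketch of the conditioning above, for reference (the function name is illustrative and not part of libvpx): every sample is scaled by 16, and the k__nonzero_bias_a/b cmpeq trick adds 1 to the very first coefficient only when it is nonzero.

static void load_buffer_4x4_scalar_sketch(const int16_t *input,
                                          int16_t out[4 * 4], int stride) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      out[r * 4 + c] = (int16_t)(input[r * stride + c] * 16);
  if (out[0]) out[0] += 1;  // same net effect as the bias/mask sequence above
}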
      41             : 
      42           0 : static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
      43           0 :   const __m128i kOne = _mm_set1_epi16(1);
      44           0 :   __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
      45           0 :   __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
      46           0 :   __m128i out01 = _mm_add_epi16(in01, kOne);
      47           0 :   __m128i out23 = _mm_add_epi16(in23, kOne);
      48           0 :   out01 = _mm_srai_epi16(out01, 2);
      49           0 :   out23 = _mm_srai_epi16(out23, 2);
      50           0 :   store_output(&out01, (output + 0 * 8));
      51           0 :   store_output(&out23, (output + 1 * 8));
      52           0 : }
      53             : 
      54           0 : static INLINE void transpose_4x4(__m128i *res) {
      55             :   // Combine and transpose
      56             :   // 00 01 02 03 20 21 22 23
      57             :   // 10 11 12 13 30 31 32 33
      58           0 :   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
      59           0 :   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
      60             : 
      61             :   // 00 10 01 11 02 12 03 13
      62             :   // 20 30 21 31 22 32 23 33
      63           0 :   res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
      64           0 :   res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
      65             : 
      66             :   // 00 10 20 30 01 11 21 31
      67             :   // 02 12 22 32 03 13 23 33
      68             :   // only use the first 4 16-bit integers
      69           0 :   res[1] = _mm_unpackhi_epi64(res[0], res[0]);
      70           0 :   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
      71           0 : }
      72             : 
      73           0 : static void fdct4_sse2(__m128i *in) {
      74           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
      75           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
      76           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
      77           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
      78           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
      79             : 
      80             :   __m128i u[4], v[4];
      81           0 :   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
      82           0 :   u[1] = _mm_unpacklo_epi16(in[3], in[2]);
      83             : 
      84           0 :   v[0] = _mm_add_epi16(u[0], u[1]);
      85           0 :   v[1] = _mm_sub_epi16(u[0], u[1]);
      86             : 
      87           0 :   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
      88           0 :   u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
      89           0 :   u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
      90           0 :   u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
      91             : 
      92           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
      93           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
      94           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
      95           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
      96           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
      97           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
      98           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
      99           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
     100             : 
     101           0 :   in[0] = _mm_packs_epi32(u[0], u[1]);
     102           0 :   in[1] = _mm_packs_epi32(u[2], u[3]);
     103           0 :   transpose_4x4(in);
     104           0 : }
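Unpacking the madd pairings above into scalar form gives the following sketch of the 4-point butterfly for one column (illustrative only; cospi_*, DCT_CONST_ROUNDING and DCT_CONST_BITS come from vpx_dsp/txfm_common.h, which is already included):

static void fdct4_scalar_sketch(const int16_t in[4], int16_t out[4]) {
  const int a = in[0] + in[3], b = in[1] + in[2];  // even/odd halves of v[0]
  const int c = in[0] - in[3], d = in[1] - in[2];  // even/odd halves of v[1]
  out[0] = (int16_t)(((a + b) * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[2] = (int16_t)(((a - b) * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[1] = (int16_t)((c * cospi_8_64 + d * cospi_24_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[3] = (int16_t)((c * cospi_24_64 - d * cospi_8_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}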
     105             : 
     106           0 : static void fadst4_sse2(__m128i *in) {
     107           0 :   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
     108           0 :   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
     109           0 :   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
     110           0 :   const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
     111           0 :   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
     112           0 :   const __m128i kZero = _mm_set1_epi16(0);
     113           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     114             :   __m128i u[8], v[8];
     115           0 :   __m128i in7 = _mm_add_epi16(in[0], in[1]);
     116             : 
     117           0 :   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
     118           0 :   u[1] = _mm_unpacklo_epi16(in[2], in[3]);
     119           0 :   u[2] = _mm_unpacklo_epi16(in7, kZero);
     120           0 :   u[3] = _mm_unpacklo_epi16(in[2], kZero);
     121           0 :   u[4] = _mm_unpacklo_epi16(in[3], kZero);
     122             : 
     123           0 :   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
     124           0 :   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
     125           0 :   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
     126           0 :   v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
     127           0 :   v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
     128           0 :   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
     129           0 :   v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
     130             : 
     131           0 :   u[0] = _mm_add_epi32(v[0], v[1]);
     132           0 :   u[1] = _mm_sub_epi32(v[2], v[6]);
     133           0 :   u[2] = _mm_add_epi32(v[3], v[4]);
     134           0 :   u[3] = _mm_sub_epi32(u[2], u[0]);
     135           0 :   u[4] = _mm_slli_epi32(v[5], 2);
     136           0 :   u[5] = _mm_sub_epi32(u[4], v[5]);
     137           0 :   u[6] = _mm_add_epi32(u[3], u[5]);
     138             : 
     139           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
     140           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
     141           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
     142           0 :   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
     143             : 
     144           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
     145           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
     146           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
     147           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
     148             : 
     149           0 :   in[0] = _mm_packs_epi32(u[0], u[2]);
     150           0 :   in[1] = _mm_packs_epi32(u[1], u[3]);
     151           0 :   transpose_4x4(in);
     152           0 : }
     153             : 
     154           0 : void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
     155             :                      int tx_type) {
     156             :   __m128i in[4];
     157             : 
     158           0 :   switch (tx_type) {
     159           0 :     case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
     160             :     case ADST_DCT:
     161           0 :       load_buffer_4x4(input, in, stride);
     162           0 :       fadst4_sse2(in);
     163           0 :       fdct4_sse2(in);
     164           0 :       write_buffer_4x4(output, in);
     165           0 :       break;
     166             :     case DCT_ADST:
     167           0 :       load_buffer_4x4(input, in, stride);
     168           0 :       fdct4_sse2(in);
     169           0 :       fadst4_sse2(in);
     170           0 :       write_buffer_4x4(output, in);
     171           0 :       break;
     172             :     case ADST_ADST:
     173           0 :       load_buffer_4x4(input, in, stride);
     174           0 :       fadst4_sse2(in);
     175           0 :       fadst4_sse2(in);
     176           0 :       write_buffer_4x4(output, in);
     177           0 :       break;
     178           0 :     default: assert(0); break;
     179             :   }
     180           0 : }
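A hypothetical call site, only to show the expected buffer shapes (names are illustrative; DECLARE_ALIGNED comes from vpx_ports/mem.h, already included, and the output buffer is assumed to need 16-byte alignment because write_buffer_4x4 stores through store_output):

static void fht4x4_usage_sketch(const int16_t *diff, int diff_stride) {
  DECLARE_ALIGNED(16, tran_low_t, coeff[4 * 4]);
  // Forward 4x4 hybrid transform, ADST in both directions in this example.
  vp9_fht4x4_sse2(diff, coeff, diff_stride, ADST_ADST);
  // coeff[] now holds the 4x4 block of transform coefficients.
}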
     181             : 
     182           0 : void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
     183             :                             int16_t *coeff_ptr, intptr_t n_coeffs,
     184             :                             int skip_block, const int16_t *zbin_ptr,
     185             :                             const int16_t *round_ptr, const int16_t *quant_ptr,
     186             :                             const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
     187             :                             int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     188             :                             uint16_t *eob_ptr, const int16_t *scan_ptr,
     189             :                             const int16_t *iscan_ptr) {
     190             :   __m128i zero;
     191             :   int pass;
     192             :   // Constants
      193             :   //    In one case the eight 16-bit lanes all hold the same value. In every
      194             :   //    other case a pair of values must be repeated four times, which is
      195             :   //    done by constructing the 32-bit constant corresponding to that pair.
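  //    (pair_set_epi16(a, b), from vpx_dsp/x86/txfm_common_sse2.h, places a in
  //    the even 16-bit lanes and b in the odd ones, so _mm_madd_epi16 yields
  //    even * a + odd * b in each 32-bit lane.)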
     196           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
     197           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     198           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
     199           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     200           0 :   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
     201           0 :   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
     202           0 :   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
     203           0 :   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     204           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     205             :   // Load input
     206           0 :   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
     207           0 :   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
     208           0 :   __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
     209           0 :   __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
     210           0 :   __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
     211           0 :   __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
     212           0 :   __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
     213           0 :   __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
     214             :   __m128i *in[8];
     215           0 :   int index = 0;
     216             : 
     217             :   (void)scan_ptr;
     218             :   (void)zbin_ptr;
     219             :   (void)quant_shift_ptr;
     220             :   (void)coeff_ptr;
     221             : 
     222             :   // Pre-condition input (shift by two)
     223           0 :   in0 = _mm_slli_epi16(in0, 2);
     224           0 :   in1 = _mm_slli_epi16(in1, 2);
     225           0 :   in2 = _mm_slli_epi16(in2, 2);
     226           0 :   in3 = _mm_slli_epi16(in3, 2);
     227           0 :   in4 = _mm_slli_epi16(in4, 2);
     228           0 :   in5 = _mm_slli_epi16(in5, 2);
     229           0 :   in6 = _mm_slli_epi16(in6, 2);
     230           0 :   in7 = _mm_slli_epi16(in7, 2);
     231             : 
     232           0 :   in[0] = &in0;
     233           0 :   in[1] = &in1;
     234           0 :   in[2] = &in2;
     235           0 :   in[3] = &in3;
     236           0 :   in[4] = &in4;
     237           0 :   in[5] = &in5;
     238           0 :   in[6] = &in6;
     239           0 :   in[7] = &in7;
     240             : 
     241             :   // We do two passes, first the columns, then the rows. The results of the
     242             :   // first pass are transposed so that the same column code can be reused. The
     243             :   // results of the second pass are also transposed so that the rows (processed
     244             :   // as columns) are put back in row positions.
     245           0 :   for (pass = 0; pass < 2; pass++) {
     246             :     // To store results of each pass before the transpose.
     247             :     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
     248             :     // Add/subtract
     249           0 :     const __m128i q0 = _mm_add_epi16(in0, in7);
     250           0 :     const __m128i q1 = _mm_add_epi16(in1, in6);
     251           0 :     const __m128i q2 = _mm_add_epi16(in2, in5);
     252           0 :     const __m128i q3 = _mm_add_epi16(in3, in4);
     253           0 :     const __m128i q4 = _mm_sub_epi16(in3, in4);
     254           0 :     const __m128i q5 = _mm_sub_epi16(in2, in5);
     255           0 :     const __m128i q6 = _mm_sub_epi16(in1, in6);
     256           0 :     const __m128i q7 = _mm_sub_epi16(in0, in7);
     257             :     // Work on first four results
     258             :     {
     259             :       // Add/subtract
     260           0 :       const __m128i r0 = _mm_add_epi16(q0, q3);
     261           0 :       const __m128i r1 = _mm_add_epi16(q1, q2);
     262           0 :       const __m128i r2 = _mm_sub_epi16(q1, q2);
     263           0 :       const __m128i r3 = _mm_sub_epi16(q0, q3);
     264             :       // Interleave to do the multiply by constants which gets us into 32bits
     265           0 :       const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
     266           0 :       const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
     267           0 :       const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
     268           0 :       const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
     269           0 :       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
     270           0 :       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
     271           0 :       const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
     272           0 :       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
     273           0 :       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
     274           0 :       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
     275           0 :       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
     276           0 :       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
     277             :       // dct_const_round_shift
     278           0 :       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     279           0 :       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     280           0 :       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     281           0 :       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     282           0 :       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     283           0 :       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     284           0 :       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     285           0 :       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
     286           0 :       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     287           0 :       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     288           0 :       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     289           0 :       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     290           0 :       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     291           0 :       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     292           0 :       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     293           0 :       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     294             :       // Combine
     295           0 :       res0 = _mm_packs_epi32(w0, w1);
     296           0 :       res4 = _mm_packs_epi32(w2, w3);
     297           0 :       res2 = _mm_packs_epi32(w4, w5);
     298           0 :       res6 = _mm_packs_epi32(w6, w7);
     299             :     }
     300             :     // Work on next four results
     301             :     {
     302             :       // Interleave to do the multiply by constants which gets us into 32bits
     303           0 :       const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
     304           0 :       const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
     305           0 :       const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
     306           0 :       const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
     307           0 :       const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
     308           0 :       const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
     309             :       // dct_const_round_shift
     310           0 :       const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
     311           0 :       const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
     312           0 :       const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
     313           0 :       const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
     314           0 :       const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
     315           0 :       const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
     316           0 :       const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
     317           0 :       const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
     318             :       // Combine
     319           0 :       const __m128i r0 = _mm_packs_epi32(s0, s1);
     320           0 :       const __m128i r1 = _mm_packs_epi32(s2, s3);
     321             :       // Add/subtract
     322           0 :       const __m128i x0 = _mm_add_epi16(q4, r0);
     323           0 :       const __m128i x1 = _mm_sub_epi16(q4, r0);
     324           0 :       const __m128i x2 = _mm_sub_epi16(q7, r1);
     325           0 :       const __m128i x3 = _mm_add_epi16(q7, r1);
     326             :       // Interleave to do the multiply by constants which gets us into 32bits
     327           0 :       const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
     328           0 :       const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
     329           0 :       const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
     330           0 :       const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
     331           0 :       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
     332           0 :       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
     333           0 :       const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
     334           0 :       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
     335           0 :       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
     336           0 :       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
     337           0 :       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
     338           0 :       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
     339             :       // dct_const_round_shift
     340           0 :       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     341           0 :       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     342           0 :       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     343           0 :       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     344           0 :       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     345           0 :       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     346           0 :       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     347           0 :       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
     348           0 :       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     349           0 :       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     350           0 :       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     351           0 :       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     352           0 :       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     353           0 :       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     354           0 :       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     355           0 :       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     356             :       // Combine
     357           0 :       res1 = _mm_packs_epi32(w0, w1);
     358           0 :       res7 = _mm_packs_epi32(w2, w3);
     359           0 :       res5 = _mm_packs_epi32(w4, w5);
     360           0 :       res3 = _mm_packs_epi32(w6, w7);
     361             :     }
     362             :     // Transpose the 8x8.
     363             :     {
     364             :       // 00 01 02 03 04 05 06 07
     365             :       // 10 11 12 13 14 15 16 17
     366             :       // 20 21 22 23 24 25 26 27
     367             :       // 30 31 32 33 34 35 36 37
     368             :       // 40 41 42 43 44 45 46 47
     369             :       // 50 51 52 53 54 55 56 57
     370             :       // 60 61 62 63 64 65 66 67
     371             :       // 70 71 72 73 74 75 76 77
     372           0 :       const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
     373           0 :       const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
     374           0 :       const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
     375           0 :       const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
     376           0 :       const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
     377           0 :       const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
     378           0 :       const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
     379           0 :       const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
     380             :       // 00 10 01 11 02 12 03 13
     381             :       // 20 30 21 31 22 32 23 33
     382             :       // 04 14 05 15 06 16 07 17
     383             :       // 24 34 25 35 26 36 27 37
     384             :       // 40 50 41 51 42 52 43 53
     385             :       // 60 70 61 71 62 72 63 73
      386             :       // 44 54 45 55 46 56 47 57
     387             :       // 64 74 65 75 66 76 67 77
     388           0 :       const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
     389           0 :       const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
     390           0 :       const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
     391           0 :       const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
     392           0 :       const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
     393           0 :       const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
     394           0 :       const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
     395           0 :       const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
     396             :       // 00 10 20 30 01 11 21 31
     397             :       // 40 50 60 70 41 51 61 71
     398             :       // 02 12 22 32 03 13 23 33
     399             :       // 42 52 62 72 43 53 63 73
      400             :       // 04 14 24 34 05 15 25 35
      401             :       // 44 54 64 74 45 55 65 75
     402             :       // 06 16 26 36 07 17 27 37
     403             :       // 46 56 66 76 47 57 67 77
     404           0 :       in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
     405           0 :       in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
     406           0 :       in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
     407           0 :       in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
     408           0 :       in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
     409           0 :       in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
     410           0 :       in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
     411           0 :       in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
     412             :       // 00 10 20 30 40 50 60 70
     413             :       // 01 11 21 31 41 51 61 71
     414             :       // 02 12 22 32 42 52 62 72
     415             :       // 03 13 23 33 43 53 63 73
     416             :       // 04 14 24 34 44 54 64 74
     417             :       // 05 15 25 35 45 55 65 75
     418             :       // 06 16 26 36 46 56 66 76
     419             :       // 07 17 27 37 47 57 67 77
     420             :     }
     421             :   }
     422             :   // Post-condition output and store it
     423             :   {
     424             :     // Post-condition (division by two)
      425             :     //    division by two of a 16-bit signed number using shifts
     426             :     //    n / 2 = (n - (n >> 15)) >> 1
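    //    e.g. n = -3: n >> 15 = -1, so (-3 - (-1)) >> 1 = -2 >> 1 = -1,
    //    matching C integer division (-3 / 2 == -1), whereas a plain
    //    arithmetic shift would give -3 >> 1 == -2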
     427           0 :     const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
     428           0 :     const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
     429           0 :     const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
     430           0 :     const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
     431           0 :     const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
     432           0 :     const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
     433           0 :     const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
     434           0 :     const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
     435           0 :     in0 = _mm_sub_epi16(in0, sign_in0);
     436           0 :     in1 = _mm_sub_epi16(in1, sign_in1);
     437           0 :     in2 = _mm_sub_epi16(in2, sign_in2);
     438           0 :     in3 = _mm_sub_epi16(in3, sign_in3);
     439           0 :     in4 = _mm_sub_epi16(in4, sign_in4);
     440           0 :     in5 = _mm_sub_epi16(in5, sign_in5);
     441           0 :     in6 = _mm_sub_epi16(in6, sign_in6);
     442           0 :     in7 = _mm_sub_epi16(in7, sign_in7);
     443           0 :     in0 = _mm_srai_epi16(in0, 1);
     444           0 :     in1 = _mm_srai_epi16(in1, 1);
     445           0 :     in2 = _mm_srai_epi16(in2, 1);
     446           0 :     in3 = _mm_srai_epi16(in3, 1);
     447           0 :     in4 = _mm_srai_epi16(in4, 1);
     448           0 :     in5 = _mm_srai_epi16(in5, 1);
     449           0 :     in6 = _mm_srai_epi16(in6, 1);
     450           0 :     in7 = _mm_srai_epi16(in7, 1);
     451             :   }
     452             : 
     453           0 :   iscan_ptr += n_coeffs;
     454           0 :   qcoeff_ptr += n_coeffs;
     455           0 :   dqcoeff_ptr += n_coeffs;
     456           0 :   n_coeffs = -n_coeffs;
     457           0 :   zero = _mm_setzero_si128();
     458             : 
     459           0 :   if (!skip_block) {
     460             :     __m128i eob;
     461             :     __m128i round, quant, dequant;
     462             :     {
     463             :       __m128i coeff0, coeff1;
     464             : 
     465             :       // Setup global values
     466             :       {
     467           0 :         round = _mm_load_si128((const __m128i *)round_ptr);
     468           0 :         quant = _mm_load_si128((const __m128i *)quant_ptr);
     469           0 :         dequant = _mm_load_si128((const __m128i *)dequant_ptr);
     470             :       }
     471             : 
     472             :       {
     473             :         __m128i coeff0_sign, coeff1_sign;
     474             :         __m128i qcoeff0, qcoeff1;
     475             :         __m128i qtmp0, qtmp1;
     476             :         // Do DC and first 15 AC
     477           0 :         coeff0 = *in[0];
     478           0 :         coeff1 = *in[1];
     479             : 
     480             :         // Poor man's sign extract
     481           0 :         coeff0_sign = _mm_srai_epi16(coeff0, 15);
     482           0 :         coeff1_sign = _mm_srai_epi16(coeff1, 15);
     483           0 :         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     484           0 :         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     485           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     486           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
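        // (x ^ sign) - sign == |x| for 16-bit x, e.g. x = -5: sign = -1,
        // (-5 ^ -1) - (-1) = 4 + 1 = 5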
     487             : 
     488           0 :         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     489           0 :         round = _mm_unpackhi_epi64(round, round);
     490           0 :         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     491           0 :         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     492           0 :         quant = _mm_unpackhi_epi64(quant, quant);
     493           0 :         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     494             : 
     495             :         // Reinsert signs
     496           0 :         qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
     497           0 :         qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
     498           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     499           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     500             : 
     501           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
     502           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
     503             : 
     504           0 :         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     505           0 :         dequant = _mm_unpackhi_epi64(dequant, dequant);
     506           0 :         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     507             : 
     508           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
     509           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
     510             :       }
     511             : 
     512             :       {
     513             :         // Scan for eob
     514             :         __m128i zero_coeff0, zero_coeff1;
     515             :         __m128i nzero_coeff0, nzero_coeff1;
     516             :         __m128i iscan0, iscan1;
     517             :         __m128i eob1;
     518           0 :         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
     519           0 :         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
     520           0 :         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
     521           0 :         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
     522           0 :         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
     523           0 :         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
     524             :         // Add one to convert from indices to counts
     525           0 :         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
     526           0 :         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
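        // nzero_coeff* is all-ones (-1) for nonzero coefficients, so this
        // subtraction adds 1 to exactly those scan indices; the AND below then
        // clears the positions whose coefficient is zero before taking the max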
     527           0 :         eob = _mm_and_si128(iscan0, nzero_coeff0);
     528           0 :         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
     529           0 :         eob = _mm_max_epi16(eob, eob1);
     530             :       }
     531           0 :       n_coeffs += 8 * 2;
     532             :     }
     533             : 
     534             :     // AC only loop
     535           0 :     index = 2;
     536           0 :     while (n_coeffs < 0) {
     537             :       __m128i coeff0, coeff1;
     538             :       {
     539             :         __m128i coeff0_sign, coeff1_sign;
     540             :         __m128i qcoeff0, qcoeff1;
     541             :         __m128i qtmp0, qtmp1;
     542             : 
     543           0 :         assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
     544           0 :         coeff0 = *in[index];
     545           0 :         coeff1 = *in[index + 1];
     546             : 
     547             :         // Poor man's sign extract
     548           0 :         coeff0_sign = _mm_srai_epi16(coeff0, 15);
     549           0 :         coeff1_sign = _mm_srai_epi16(coeff1, 15);
     550           0 :         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     551           0 :         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     552           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     553           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     554             : 
     555           0 :         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     556           0 :         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     557           0 :         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     558           0 :         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     559             : 
     560             :         // Reinsert signs
     561           0 :         qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
     562           0 :         qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
     563           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     564           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     565             : 
     566           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
     567           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
     568             : 
     569           0 :         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     570           0 :         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     571             : 
     572           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
     573           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
     574             :       }
     575             : 
     576             :       {
     577             :         // Scan for eob
     578             :         __m128i zero_coeff0, zero_coeff1;
     579             :         __m128i nzero_coeff0, nzero_coeff1;
     580             :         __m128i iscan0, iscan1;
     581             :         __m128i eob0, eob1;
     582           0 :         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
     583           0 :         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
     584           0 :         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
     585           0 :         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
     586           0 :         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
     587           0 :         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
     588             :         // Add one to convert from indices to counts
     589           0 :         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
     590           0 :         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
     591           0 :         eob0 = _mm_and_si128(iscan0, nzero_coeff0);
     592           0 :         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
     593           0 :         eob0 = _mm_max_epi16(eob0, eob1);
     594           0 :         eob = _mm_max_epi16(eob, eob0);
     595             :       }
     596           0 :       n_coeffs += 8 * 2;
     597           0 :       index += 2;
     598             :     }
     599             : 
     600             :     // Accumulate EOB
     601             :     {
     602             :       __m128i eob_shuffled;
     603           0 :       eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
     604           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     605           0 :       eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
     606           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     607           0 :       eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
     608           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     609           0 :       *eob_ptr = _mm_extract_epi16(eob, 1);
     610             :     }
     611             :   } else {
     612             :     do {
     613           0 :       _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
     614           0 :       _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
     615           0 :       _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
     616           0 :       _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
     617           0 :       n_coeffs += 8 * 2;
     618           0 :     } while (n_coeffs < 0);
     619           0 :     *eob_ptr = 0;
     620             :   }
     621           0 : }
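Per coefficient, the adds/mulhi/mullo sequence above amounts to the scalar sketch below (illustrative; round, quant and dequant stand for the per-position values loaded from round_ptr, quant_ptr and dequant_ptr, and the SIMD code saturates the 16-bit add where this sketch uses a wider int):

static int16_t quantize_coeff_sketch(int16_t coeff, int16_t round,
                                     int16_t quant, int16_t dequant,
                                     int16_t *dqcoeff) {
  const int abs_coeff = coeff < 0 ? -coeff : coeff;
  const int tmp = ((abs_coeff + round) * quant) >> 16;  // _mm_mulhi_epi16
  const int16_t qcoeff = (int16_t)(coeff < 0 ? -tmp : tmp);
  *dqcoeff = (int16_t)(qcoeff * dequant);               // _mm_mullo_epi16
  return qcoeff;
}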
     622             : 
     623             : // load 8x8 array
     624           0 : static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
     625             :                                    int stride) {
     626           0 :   in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     627           0 :   in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     628           0 :   in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     629           0 :   in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     630           0 :   in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     631           0 :   in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     632           0 :   in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     633           0 :   in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     634             : 
     635           0 :   in[0] = _mm_slli_epi16(in[0], 2);
     636           0 :   in[1] = _mm_slli_epi16(in[1], 2);
     637           0 :   in[2] = _mm_slli_epi16(in[2], 2);
     638           0 :   in[3] = _mm_slli_epi16(in[3], 2);
     639           0 :   in[4] = _mm_slli_epi16(in[4], 2);
     640           0 :   in[5] = _mm_slli_epi16(in[5], 2);
     641           0 :   in[6] = _mm_slli_epi16(in[6], 2);
     642           0 :   in[7] = _mm_slli_epi16(in[7], 2);
     643           0 : }
     644             : 
     645             : // right shift and rounding
     646           0 : static INLINE void right_shift_8x8(__m128i *res, const int bit) {
     647           0 :   __m128i sign0 = _mm_srai_epi16(res[0], 15);
     648           0 :   __m128i sign1 = _mm_srai_epi16(res[1], 15);
     649           0 :   __m128i sign2 = _mm_srai_epi16(res[2], 15);
     650           0 :   __m128i sign3 = _mm_srai_epi16(res[3], 15);
     651           0 :   __m128i sign4 = _mm_srai_epi16(res[4], 15);
     652           0 :   __m128i sign5 = _mm_srai_epi16(res[5], 15);
     653           0 :   __m128i sign6 = _mm_srai_epi16(res[6], 15);
     654           0 :   __m128i sign7 = _mm_srai_epi16(res[7], 15);
     655             : 
     656           0 :   if (bit == 2) {
     657           0 :     const __m128i const_rounding = _mm_set1_epi16(1);
     658           0 :     res[0] = _mm_add_epi16(res[0], const_rounding);
     659           0 :     res[1] = _mm_add_epi16(res[1], const_rounding);
     660           0 :     res[2] = _mm_add_epi16(res[2], const_rounding);
     661           0 :     res[3] = _mm_add_epi16(res[3], const_rounding);
     662           0 :     res[4] = _mm_add_epi16(res[4], const_rounding);
     663           0 :     res[5] = _mm_add_epi16(res[5], const_rounding);
     664           0 :     res[6] = _mm_add_epi16(res[6], const_rounding);
     665           0 :     res[7] = _mm_add_epi16(res[7], const_rounding);
     666             :   }
     667             : 
     668           0 :   res[0] = _mm_sub_epi16(res[0], sign0);
     669           0 :   res[1] = _mm_sub_epi16(res[1], sign1);
     670           0 :   res[2] = _mm_sub_epi16(res[2], sign2);
     671           0 :   res[3] = _mm_sub_epi16(res[3], sign3);
     672           0 :   res[4] = _mm_sub_epi16(res[4], sign4);
     673           0 :   res[5] = _mm_sub_epi16(res[5], sign5);
     674           0 :   res[6] = _mm_sub_epi16(res[6], sign6);
     675           0 :   res[7] = _mm_sub_epi16(res[7], sign7);
     676             : 
     677           0 :   if (bit == 1) {
     678           0 :     res[0] = _mm_srai_epi16(res[0], 1);
     679           0 :     res[1] = _mm_srai_epi16(res[1], 1);
     680           0 :     res[2] = _mm_srai_epi16(res[2], 1);
     681           0 :     res[3] = _mm_srai_epi16(res[3], 1);
     682           0 :     res[4] = _mm_srai_epi16(res[4], 1);
     683           0 :     res[5] = _mm_srai_epi16(res[5], 1);
     684           0 :     res[6] = _mm_srai_epi16(res[6], 1);
     685           0 :     res[7] = _mm_srai_epi16(res[7], 1);
     686             :   } else {
     687           0 :     res[0] = _mm_srai_epi16(res[0], 2);
     688           0 :     res[1] = _mm_srai_epi16(res[1], 2);
     689           0 :     res[2] = _mm_srai_epi16(res[2], 2);
     690           0 :     res[3] = _mm_srai_epi16(res[3], 2);
     691           0 :     res[4] = _mm_srai_epi16(res[4], 2);
     692           0 :     res[5] = _mm_srai_epi16(res[5], 2);
     693           0 :     res[6] = _mm_srai_epi16(res[6], 2);
     694           0 :     res[7] = _mm_srai_epi16(res[7], 2);
     695             :   }
     696           0 : }
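In scalar terms the rounding above does the following per value (bit is 1 or 2 in the callers; the function name is illustrative):

static int16_t right_shift_scalar_sketch(int16_t v, int bit) {
  int x = v;
  const int sign = x >> 15;  // 0 for non-negative values, -1 for negative ones
  if (bit == 2) x += 1;      // extra +1 bias applied only in the bit == 2 path
  x -= sign;                 // compensates the arithmetic shift for negatives
  return (int16_t)(x >> bit);
}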
     697             : 
     698             : // write 8x8 array
     699           0 : static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
     700             :                                     int stride) {
     701           0 :   store_output(&res[0], (output + 0 * stride));
     702           0 :   store_output(&res[1], (output + 1 * stride));
     703           0 :   store_output(&res[2], (output + 2 * stride));
     704           0 :   store_output(&res[3], (output + 3 * stride));
     705           0 :   store_output(&res[4], (output + 4 * stride));
     706           0 :   store_output(&res[5], (output + 5 * stride));
     707           0 :   store_output(&res[6], (output + 6 * stride));
     708           0 :   store_output(&res[7], (output + 7 * stride));
     709           0 : }
     710             : 
      711             : // transpose an 8x8 block of 16-bit values (in-place when res aliases in)
     712           0 : static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
     713           0 :   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
     714           0 :   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
     715           0 :   const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
     716           0 :   const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
     717           0 :   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
     718           0 :   const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
     719           0 :   const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
     720           0 :   const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
     721             :   // 00 10 01 11 02 12 03 13
     722             :   // 20 30 21 31 22 32 23 33
     723             :   // 04 14 05 15 06 16 07 17
     724             :   // 24 34 25 35 26 36 27 37
     725             :   // 40 50 41 51 42 52 43 53
     726             :   // 60 70 61 71 62 72 63 73
     727             :   // 44 54 45 55 46 56 47 57
     728             :   // 64 74 65 75 66 76 67 77
     729           0 :   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
     730           0 :   const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
     731           0 :   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
     732           0 :   const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
     733           0 :   const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
     734           0 :   const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
     735           0 :   const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
     736           0 :   const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
     737             :   // 00 10 20 30 01 11 21 31
     738             :   // 40 50 60 70 41 51 61 71
     739             :   // 02 12 22 32 03 13 23 33
     740             :   // 42 52 62 72 43 53 63 73
     741             :   // 04 14 24 34 05 15 25 35
     742             :   // 44 54 64 74 45 55 65 75
     743             :   // 06 16 26 36 07 17 27 37
     744             :   // 46 56 66 76 47 57 67 77
     745           0 :   res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
     746           0 :   res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
     747           0 :   res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
     748           0 :   res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
     749           0 :   res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
     750           0 :   res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
     751           0 :   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
     752           0 :   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
     753             :   // 00 10 20 30 40 50 60 70
     754             :   // 01 11 21 31 41 51 61 71
     755             :   // 02 12 22 32 42 52 62 72
     756             :   // 03 13 23 33 43 53 63 73
     757             :   // 04 14 24 34 44 54 64 74
     758             :   // 05 15 25 35 45 55 65 75
     759             :   // 06 16 26 36 46 56 66 76
     760             :   // 07 17 27 37 47 57 67 77
     761           0 : }
     762             : 
     763           0 : static void fdct8_sse2(__m128i *in) {
     764             :   // constants
     765           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
     766           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     767           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
     768           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     769           0 :   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
     770           0 :   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
     771           0 :   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
     772           0 :   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     773           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     774             :   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
     775             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
     776             :   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
     777             : 
     778             :   // stage 1
     779           0 :   s0 = _mm_add_epi16(in[0], in[7]);
     780           0 :   s1 = _mm_add_epi16(in[1], in[6]);
     781           0 :   s2 = _mm_add_epi16(in[2], in[5]);
     782           0 :   s3 = _mm_add_epi16(in[3], in[4]);
     783           0 :   s4 = _mm_sub_epi16(in[3], in[4]);
     784           0 :   s5 = _mm_sub_epi16(in[2], in[5]);
     785           0 :   s6 = _mm_sub_epi16(in[1], in[6]);
     786           0 :   s7 = _mm_sub_epi16(in[0], in[7]);
     787             : 
     788           0 :   u0 = _mm_add_epi16(s0, s3);
     789           0 :   u1 = _mm_add_epi16(s1, s2);
     790           0 :   u2 = _mm_sub_epi16(s1, s2);
     791           0 :   u3 = _mm_sub_epi16(s0, s3);
     792             :   // interleave and perform butterfly multiplication/addition
     793           0 :   v0 = _mm_unpacklo_epi16(u0, u1);
     794           0 :   v1 = _mm_unpackhi_epi16(u0, u1);
     795           0 :   v2 = _mm_unpacklo_epi16(u2, u3);
     796           0 :   v3 = _mm_unpackhi_epi16(u2, u3);
     797             : 
     798           0 :   u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
     799           0 :   u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
     800           0 :   u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
     801           0 :   u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
     802           0 :   u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
     803           0 :   u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
     804           0 :   u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
     805           0 :   u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
     806             : 
     807             :   // shift and rounding
     808           0 :   v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     809           0 :   v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     810           0 :   v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     811           0 :   v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     812           0 :   v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     813           0 :   v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     814           0 :   v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     815           0 :   v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
     816             : 
     817           0 :   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     818           0 :   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     819           0 :   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     820           0 :   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     821           0 :   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     822           0 :   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     823           0 :   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     824           0 :   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     825             : 
     826           0 :   in[0] = _mm_packs_epi32(u0, u1);
     827           0 :   in[2] = _mm_packs_epi32(u4, u5);
     828           0 :   in[4] = _mm_packs_epi32(u2, u3);
     829           0 :   in[6] = _mm_packs_epi32(u6, u7);
     830             : 
     831             :   // stage 2
     832             :   // interleave and perform butterfly multiplication/addition
     833           0 :   u0 = _mm_unpacklo_epi16(s6, s5);
     834           0 :   u1 = _mm_unpackhi_epi16(s6, s5);
     835           0 :   v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
     836           0 :   v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
     837           0 :   v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
     838           0 :   v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
     839             : 
     840             :   // shift and rounding
     841           0 :   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
     842           0 :   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
     843           0 :   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
     844           0 :   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
     845             : 
     846           0 :   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
     847           0 :   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
     848           0 :   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
     849           0 :   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
     850             : 
     851           0 :   u0 = _mm_packs_epi32(v0, v1);
     852           0 :   u1 = _mm_packs_epi32(v2, v3);
     853             : 
     854             :   // stage 3
     855           0 :   s0 = _mm_add_epi16(s4, u0);
     856           0 :   s1 = _mm_sub_epi16(s4, u0);
     857           0 :   s2 = _mm_sub_epi16(s7, u1);
     858           0 :   s3 = _mm_add_epi16(s7, u1);
     859             : 
     860             :   // stage 4
     861           0 :   u0 = _mm_unpacklo_epi16(s0, s3);
     862           0 :   u1 = _mm_unpackhi_epi16(s0, s3);
     863           0 :   u2 = _mm_unpacklo_epi16(s1, s2);
     864           0 :   u3 = _mm_unpackhi_epi16(s1, s2);
     865             : 
     866           0 :   v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
     867           0 :   v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
     868           0 :   v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
     869           0 :   v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
     870           0 :   v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
     871           0 :   v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
     872           0 :   v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
     873           0 :   v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
     874             : 
     875             :   // shift and rounding
     876           0 :   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
     877           0 :   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
     878           0 :   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
     879           0 :   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
     880           0 :   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
     881           0 :   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
     882           0 :   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
     883           0 :   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
     884             : 
     885           0 :   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
     886           0 :   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
     887           0 :   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
     888           0 :   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
     889           0 :   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
     890           0 :   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
     891           0 :   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
     892           0 :   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
     893             : 
     894           0 :   in[1] = _mm_packs_epi32(v0, v1);
     895           0 :   in[3] = _mm_packs_epi32(v4, v5);
     896           0 :   in[5] = _mm_packs_epi32(v2, v3);
     897           0 :   in[7] = _mm_packs_epi32(v6, v7);
     898             : 
     899             :   // transpose
     900           0 :   array_transpose_8x8(in, in);
     901           0 : }
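
Editorial note (not part of the source file): each interleave / _mm_madd_epi16 /
rounding add / _mm_srai_epi32 / _mm_packs_epi32 run above performs the same
scalar fixed-point rotation on pairs of 16-bit inputs.  A minimal sketch of one
output element follows, assuming the usual libvpx constants (DCT_CONST_BITS of
14, DCT_CONST_ROUNDING of 1 << 13); the helper name is hypothetical.

#include <stdint.h>

static int16_t scalar_round_shift_pair(int16_t a, int16_t b,
                                       int16_t c0, int16_t c1) {
  const int32_t rounding = 1 << (14 - 1);  /* DCT_CONST_ROUNDING                */
  const int32_t sum = a * c0 + b * c1;     /* one lane pair of _mm_madd_epi16   */
  int32_t out = (sum + rounding) >> 14;    /* add + _mm_srai_epi32 (arithmetic) */
  if (out > INT16_MAX) out = INT16_MAX;    /* _mm_packs_epi32 saturates ...     */
  if (out < INT16_MIN) out = INT16_MIN;    /* ... when narrowing to 16 bits     */
  return (int16_t)out;
}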
     902             : 
     903           0 : static void fadst8_sse2(__m128i *in) {
     904             :   // Constants
     905           0 :   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
     906           0 :   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
     907           0 :   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
     908           0 :   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
     909           0 :   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
     910           0 :   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
     911           0 :   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
     912           0 :   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
     913           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
     914           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
     915           0 :   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
     916           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     917           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
     918           0 :   const __m128i k__const_0 = _mm_set1_epi16(0);
     919           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     920             : 
     921             :   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
     922             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
     923             :   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
     924             :   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
     925             :   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
     926             : 
     927             :   // properly aligned for butterfly input
     928           0 :   in0 = in[7];
     929           0 :   in1 = in[0];
     930           0 :   in2 = in[5];
     931           0 :   in3 = in[2];
     932           0 :   in4 = in[3];
     933           0 :   in5 = in[4];
     934           0 :   in6 = in[1];
     935           0 :   in7 = in[6];
     936             : 
     937             :   // column transformation
     938             :   // stage 1
      939             :   // interleave and multiply/add into 32-bit integers
     940           0 :   s0 = _mm_unpacklo_epi16(in0, in1);
     941           0 :   s1 = _mm_unpackhi_epi16(in0, in1);
     942           0 :   s2 = _mm_unpacklo_epi16(in2, in3);
     943           0 :   s3 = _mm_unpackhi_epi16(in2, in3);
     944           0 :   s4 = _mm_unpacklo_epi16(in4, in5);
     945           0 :   s5 = _mm_unpackhi_epi16(in4, in5);
     946           0 :   s6 = _mm_unpacklo_epi16(in6, in7);
     947           0 :   s7 = _mm_unpackhi_epi16(in6, in7);
     948             : 
     949           0 :   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
     950           0 :   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
     951           0 :   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
     952           0 :   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
     953           0 :   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
     954           0 :   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
     955           0 :   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
     956           0 :   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
     957           0 :   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
     958           0 :   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
     959           0 :   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
     960           0 :   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
     961           0 :   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
     962           0 :   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
     963           0 :   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
     964           0 :   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
     965             : 
     966             :   // addition
     967           0 :   w0 = _mm_add_epi32(u0, u8);
     968           0 :   w1 = _mm_add_epi32(u1, u9);
     969           0 :   w2 = _mm_add_epi32(u2, u10);
     970           0 :   w3 = _mm_add_epi32(u3, u11);
     971           0 :   w4 = _mm_add_epi32(u4, u12);
     972           0 :   w5 = _mm_add_epi32(u5, u13);
     973           0 :   w6 = _mm_add_epi32(u6, u14);
     974           0 :   w7 = _mm_add_epi32(u7, u15);
     975           0 :   w8 = _mm_sub_epi32(u0, u8);
     976           0 :   w9 = _mm_sub_epi32(u1, u9);
     977           0 :   w10 = _mm_sub_epi32(u2, u10);
     978           0 :   w11 = _mm_sub_epi32(u3, u11);
     979           0 :   w12 = _mm_sub_epi32(u4, u12);
     980           0 :   w13 = _mm_sub_epi32(u5, u13);
     981           0 :   w14 = _mm_sub_epi32(u6, u14);
     982           0 :   w15 = _mm_sub_epi32(u7, u15);
     983             : 
     984             :   // shift and rounding
     985           0 :   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
     986           0 :   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
     987           0 :   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
     988           0 :   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
     989           0 :   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
     990           0 :   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
     991           0 :   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
     992           0 :   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
     993           0 :   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
     994           0 :   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
     995           0 :   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
     996           0 :   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
     997           0 :   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
     998           0 :   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
     999           0 :   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
    1000           0 :   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
    1001             : 
    1002           0 :   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    1003           0 :   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
    1004           0 :   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    1005           0 :   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
    1006           0 :   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    1007           0 :   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
    1008           0 :   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    1009           0 :   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
    1010           0 :   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
    1011           0 :   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
    1012           0 :   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
    1013           0 :   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
    1014           0 :   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
    1015           0 :   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
    1016           0 :   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
    1017           0 :   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
    1018             : 
    1019             :   // back to 16-bit and pack 8 integers into __m128i
    1020           0 :   in[0] = _mm_packs_epi32(u0, u1);
    1021           0 :   in[1] = _mm_packs_epi32(u2, u3);
    1022           0 :   in[2] = _mm_packs_epi32(u4, u5);
    1023           0 :   in[3] = _mm_packs_epi32(u6, u7);
    1024           0 :   in[4] = _mm_packs_epi32(u8, u9);
    1025           0 :   in[5] = _mm_packs_epi32(u10, u11);
    1026           0 :   in[6] = _mm_packs_epi32(u12, u13);
    1027           0 :   in[7] = _mm_packs_epi32(u14, u15);
    1028             : 
    1029             :   // stage 2
    1030           0 :   s0 = _mm_add_epi16(in[0], in[2]);
    1031           0 :   s1 = _mm_add_epi16(in[1], in[3]);
    1032           0 :   s2 = _mm_sub_epi16(in[0], in[2]);
    1033           0 :   s3 = _mm_sub_epi16(in[1], in[3]);
    1034           0 :   u0 = _mm_unpacklo_epi16(in[4], in[5]);
    1035           0 :   u1 = _mm_unpackhi_epi16(in[4], in[5]);
    1036           0 :   u2 = _mm_unpacklo_epi16(in[6], in[7]);
    1037           0 :   u3 = _mm_unpackhi_epi16(in[6], in[7]);
    1038             : 
    1039           0 :   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
    1040           0 :   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
    1041           0 :   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
    1042           0 :   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
    1043           0 :   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
    1044           0 :   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
    1045           0 :   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
    1046           0 :   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
    1047             : 
    1048           0 :   w0 = _mm_add_epi32(v0, v4);
    1049           0 :   w1 = _mm_add_epi32(v1, v5);
    1050           0 :   w2 = _mm_add_epi32(v2, v6);
    1051           0 :   w3 = _mm_add_epi32(v3, v7);
    1052           0 :   w4 = _mm_sub_epi32(v0, v4);
    1053           0 :   w5 = _mm_sub_epi32(v1, v5);
    1054           0 :   w6 = _mm_sub_epi32(v2, v6);
    1055           0 :   w7 = _mm_sub_epi32(v3, v7);
    1056             : 
    1057           0 :   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
    1058           0 :   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
    1059           0 :   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
    1060           0 :   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
    1061           0 :   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
    1062           0 :   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
    1063           0 :   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
    1064           0 :   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
    1065             : 
    1066           0 :   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    1067           0 :   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
    1068           0 :   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    1069           0 :   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
    1070           0 :   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    1071           0 :   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
    1072           0 :   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    1073           0 :   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
    1074             : 
     1075             :   // back to 16-bit integers
    1076           0 :   s4 = _mm_packs_epi32(u0, u1);
    1077           0 :   s5 = _mm_packs_epi32(u2, u3);
    1078           0 :   s6 = _mm_packs_epi32(u4, u5);
    1079           0 :   s7 = _mm_packs_epi32(u6, u7);
    1080             : 
    1081             :   // stage 3
    1082           0 :   u0 = _mm_unpacklo_epi16(s2, s3);
    1083           0 :   u1 = _mm_unpackhi_epi16(s2, s3);
    1084           0 :   u2 = _mm_unpacklo_epi16(s6, s7);
    1085           0 :   u3 = _mm_unpackhi_epi16(s6, s7);
    1086             : 
    1087           0 :   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
    1088           0 :   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
    1089           0 :   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
    1090           0 :   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
    1091           0 :   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
    1092           0 :   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
    1093           0 :   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
    1094           0 :   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
    1095             : 
    1096           0 :   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
    1097           0 :   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
    1098           0 :   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
    1099           0 :   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
    1100           0 :   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
    1101           0 :   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
    1102           0 :   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
    1103           0 :   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
    1104             : 
    1105           0 :   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
    1106           0 :   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
    1107           0 :   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
    1108           0 :   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
    1109           0 :   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
    1110           0 :   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
    1111           0 :   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
    1112           0 :   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
    1113             : 
    1114           0 :   s2 = _mm_packs_epi32(v0, v1);
    1115           0 :   s3 = _mm_packs_epi32(v2, v3);
    1116           0 :   s6 = _mm_packs_epi32(v4, v5);
    1117           0 :   s7 = _mm_packs_epi32(v6, v7);
    1118             : 
    1119             :   // FIXME(jingning): do subtract using bit inversion?
    1120           0 :   in[0] = s0;
    1121           0 :   in[1] = _mm_sub_epi16(k__const_0, s4);
    1122           0 :   in[2] = s6;
    1123           0 :   in[3] = _mm_sub_epi16(k__const_0, s2);
    1124           0 :   in[4] = s3;
    1125           0 :   in[5] = _mm_sub_epi16(k__const_0, s7);
    1126           0 :   in[6] = s5;
    1127           0 :   in[7] = _mm_sub_epi16(k__const_0, s1);
    1128             : 
    1129             :   // transpose
    1130           0 :   array_transpose_8x8(in, in);
    1131           0 : }
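
Editorial note on the FIXME above (a sketch, not the authors' plan): the sign
flips at the end of fadst8_sse2 are written as _mm_sub_epi16(zero, x).  In
two's complement -x equals ~x + 1, so the same negation can be expressed with
bit inversion, as the FIXME hints.  The helper name below is hypothetical, the
intrinsics used are plain SSE2, and no performance claim is made.

static __m128i negate_epi16_sketch(__m128i x) {
  const __m128i all_ones = _mm_set1_epi16((int16_t)-1);   /* 0xFFFF per lane */
  const __m128i one = _mm_set1_epi16(1);
  return _mm_add_epi16(_mm_xor_si128(x, all_ones), one);  /* ~x + 1 == -x    */
}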
    1132             : 
    1133           0 : void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
    1134             :                      int tx_type) {
    1135             :   __m128i in[8];
    1136             : 
    1137           0 :   switch (tx_type) {
    1138           0 :     case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
    1139             :     case ADST_DCT:
    1140           0 :       load_buffer_8x8(input, in, stride);
    1141           0 :       fadst8_sse2(in);
    1142           0 :       fdct8_sse2(in);
    1143           0 :       right_shift_8x8(in, 1);
    1144           0 :       write_buffer_8x8(output, in, 8);
    1145           0 :       break;
    1146             :     case DCT_ADST:
    1147           0 :       load_buffer_8x8(input, in, stride);
    1148           0 :       fdct8_sse2(in);
    1149           0 :       fadst8_sse2(in);
    1150           0 :       right_shift_8x8(in, 1);
    1151           0 :       write_buffer_8x8(output, in, 8);
    1152           0 :       break;
    1153             :     case ADST_ADST:
    1154           0 :       load_buffer_8x8(input, in, stride);
    1155           0 :       fadst8_sse2(in);
    1156           0 :       fadst8_sse2(in);
    1157           0 :       right_shift_8x8(in, 1);
    1158           0 :       write_buffer_8x8(output, in, 8);
    1159           0 :       break;
    1160           0 :     default: assert(0); break;
    1161             :   }
    1162           0 : }
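
Editorial usage sketch (assumptions: the wrapper name and the dummy residual
data are illustrative only; DECLARE_ALIGNED, tran_low_t and the tx_type values
come from the headers this file already includes).  It shows how the dispatcher
above is driven for a single 8x8 block.

static void example_fht8x8_call(void) {
  DECLARE_ALIGNED(16, int16_t, residual[64]);  /* 8x8 source-minus-prediction */
  DECLARE_ALIGNED(16, tran_low_t, coeff[64]);  /* 8x8 transform coefficients  */
  int i;
  for (i = 0; i < 64; ++i) residual[i] = (int16_t)(i & 7);  /* dummy data */
  /* stride is in int16_t units; 8 means the rows are stored back to back. */
  vp9_fht8x8_sse2(residual, coeff, 8, ADST_ADST);
  /* coeff[] now holds the 8x8 transform coefficients in row-major order. */
}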
    1163             : 
    1164           0 : static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
    1165             :                                      __m128i *in1, int stride) {
    1166             :   // load first 8 columns
    1167           0 :   load_buffer_8x8(input, in0, stride);
    1168           0 :   load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
    1169             : 
    1170           0 :   input += 8;
    1171             :   // load second 8 columns
    1172           0 :   load_buffer_8x8(input, in1, stride);
    1173           0 :   load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
    1174           0 : }
    1175             : 
    1176           0 : static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
    1177             :                                       __m128i *in1, int stride) {
    1178             :   // write first 8 columns
    1179           0 :   write_buffer_8x8(output, in0, stride);
    1180           0 :   write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
    1181             :   // write second 8 columns
    1182           0 :   output += 8;
    1183           0 :   write_buffer_8x8(output, in1, stride);
    1184           0 :   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
    1185           0 : }
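
Editorial note on the buffer split used by load_buffer_16x16() and
write_buffer_16x16() above: in0[] carries the left eight columns of the 16x16
block (one __m128i of eight int16_t lanes per row, rows 0..15) and in1[] the
right eight columns, each half handled as two 8x8 tiles.  A scalar restatement
of the indexing follows, ignoring any pre-scaling done inside load_buffer_8x8();
the function name and the two-dimensional arrays are hypothetical.

static void load_block_16x16_scalar(const int16_t *input, int stride,
                                    int16_t left[16][8], int16_t right[16][8]) {
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 8; ++c) {
      left[r][c] = input[r * stride + c];       /* lane c of in0[r] */
      right[r][c] = input[r * stride + c + 8];  /* lane c of in1[r] */
    }
  }
}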
    1186             : 
    1187           0 : static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
    1188             :   __m128i tbuf[8];
    1189           0 :   array_transpose_8x8(res0, res0);
    1190           0 :   array_transpose_8x8(res1, tbuf);
    1191           0 :   array_transpose_8x8(res0 + 8, res1);
    1192           0 :   array_transpose_8x8(res1 + 8, res1 + 8);
    1193             : 
    1194           0 :   res0[8] = tbuf[0];
    1195           0 :   res0[9] = tbuf[1];
    1196           0 :   res0[10] = tbuf[2];
    1197           0 :   res0[11] = tbuf[3];
    1198           0 :   res0[12] = tbuf[4];
    1199           0 :   res0[13] = tbuf[5];
    1200           0 :   res0[14] = tbuf[6];
    1201           0 :   res0[15] = tbuf[7];
    1202           0 : }
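
Editorial note: array_transpose_16x16() above is the 2x2-block form of a full
16x16 transpose.  Writing A = res0[0..7], B = res1[0..7], C = res0[8..15] and
D = res1[8..15] (the top/bottom halves of the left and right column buffers),
it computes

    [ A  B ]^T   [ A^T  C^T ]
    [ C  D ]   = [ B^T  D^T ]

where X^T is the 8x8 transpose of X.  Only the off-diagonal tiles change
buffers; tbuf holds B^T until C^T has been written into res1, after which
B^T is copied into res0[8..15].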
    1203             : 
    1204           0 : static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
    1205             :   // perform rounding operations
    1206           0 :   right_shift_8x8(res0, 2);
    1207           0 :   right_shift_8x8(res0 + 8, 2);
    1208           0 :   right_shift_8x8(res1, 2);
    1209           0 :   right_shift_8x8(res1 + 8, 2);
    1210           0 : }
    1211             : 
    1212           0 : static void fdct16_8col(__m128i *in) {
    1213             :   // perform 16x16 1-D DCT for 8 columns
    1214             :   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
    1215           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
    1216           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1217           0 :   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1218           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
    1219           0 :   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
    1220           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    1221           0 :   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
    1222           0 :   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    1223           0 :   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
    1224           0 :   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    1225           0 :   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
    1226           0 :   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
    1227           0 :   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
    1228           0 :   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
    1229           0 :   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
    1230           0 :   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
    1231           0 :   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
    1232           0 :   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
    1233           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1234             : 
    1235             :   // stage 1
    1236           0 :   i[0] = _mm_add_epi16(in[0], in[15]);
    1237           0 :   i[1] = _mm_add_epi16(in[1], in[14]);
    1238           0 :   i[2] = _mm_add_epi16(in[2], in[13]);
    1239           0 :   i[3] = _mm_add_epi16(in[3], in[12]);
    1240           0 :   i[4] = _mm_add_epi16(in[4], in[11]);
    1241           0 :   i[5] = _mm_add_epi16(in[5], in[10]);
    1242           0 :   i[6] = _mm_add_epi16(in[6], in[9]);
    1243           0 :   i[7] = _mm_add_epi16(in[7], in[8]);
    1244             : 
    1245           0 :   s[0] = _mm_sub_epi16(in[7], in[8]);
    1246           0 :   s[1] = _mm_sub_epi16(in[6], in[9]);
    1247           0 :   s[2] = _mm_sub_epi16(in[5], in[10]);
    1248           0 :   s[3] = _mm_sub_epi16(in[4], in[11]);
    1249           0 :   s[4] = _mm_sub_epi16(in[3], in[12]);
    1250           0 :   s[5] = _mm_sub_epi16(in[2], in[13]);
    1251           0 :   s[6] = _mm_sub_epi16(in[1], in[14]);
    1252           0 :   s[7] = _mm_sub_epi16(in[0], in[15]);
    1253             : 
    1254           0 :   p[0] = _mm_add_epi16(i[0], i[7]);
    1255           0 :   p[1] = _mm_add_epi16(i[1], i[6]);
    1256           0 :   p[2] = _mm_add_epi16(i[2], i[5]);
    1257           0 :   p[3] = _mm_add_epi16(i[3], i[4]);
    1258           0 :   p[4] = _mm_sub_epi16(i[3], i[4]);
    1259           0 :   p[5] = _mm_sub_epi16(i[2], i[5]);
    1260           0 :   p[6] = _mm_sub_epi16(i[1], i[6]);
    1261           0 :   p[7] = _mm_sub_epi16(i[0], i[7]);
    1262             : 
    1263           0 :   u[0] = _mm_add_epi16(p[0], p[3]);
    1264           0 :   u[1] = _mm_add_epi16(p[1], p[2]);
    1265           0 :   u[2] = _mm_sub_epi16(p[1], p[2]);
    1266           0 :   u[3] = _mm_sub_epi16(p[0], p[3]);
    1267             : 
    1268           0 :   v[0] = _mm_unpacklo_epi16(u[0], u[1]);
    1269           0 :   v[1] = _mm_unpackhi_epi16(u[0], u[1]);
    1270           0 :   v[2] = _mm_unpacklo_epi16(u[2], u[3]);
    1271           0 :   v[3] = _mm_unpackhi_epi16(u[2], u[3]);
    1272             : 
    1273           0 :   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
    1274           0 :   u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
    1275           0 :   u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
    1276           0 :   u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
    1277           0 :   u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
    1278           0 :   u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
    1279           0 :   u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
    1280           0 :   u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
    1281             : 
    1282           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1283           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1284           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1285           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1286           0 :   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1287           0 :   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1288           0 :   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1289           0 :   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1290             : 
    1291           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
    1292           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
    1293           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
    1294           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
    1295           0 :   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
    1296           0 :   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
    1297           0 :   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
    1298           0 :   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
    1299             : 
    1300           0 :   in[0] = _mm_packs_epi32(u[0], u[1]);
    1301           0 :   in[4] = _mm_packs_epi32(u[4], u[5]);
    1302           0 :   in[8] = _mm_packs_epi32(u[2], u[3]);
    1303           0 :   in[12] = _mm_packs_epi32(u[6], u[7]);
    1304             : 
    1305           0 :   u[0] = _mm_unpacklo_epi16(p[5], p[6]);
    1306           0 :   u[1] = _mm_unpackhi_epi16(p[5], p[6]);
    1307           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    1308           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    1309           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    1310           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    1311             : 
    1312           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1313           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1314           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1315           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1316             : 
    1317           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1318           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1319           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1320           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1321             : 
    1322           0 :   u[0] = _mm_packs_epi32(v[0], v[1]);
    1323           0 :   u[1] = _mm_packs_epi32(v[2], v[3]);
    1324             : 
    1325           0 :   t[0] = _mm_add_epi16(p[4], u[0]);
    1326           0 :   t[1] = _mm_sub_epi16(p[4], u[0]);
    1327           0 :   t[2] = _mm_sub_epi16(p[7], u[1]);
    1328           0 :   t[3] = _mm_add_epi16(p[7], u[1]);
    1329             : 
    1330           0 :   u[0] = _mm_unpacklo_epi16(t[0], t[3]);
    1331           0 :   u[1] = _mm_unpackhi_epi16(t[0], t[3]);
    1332           0 :   u[2] = _mm_unpacklo_epi16(t[1], t[2]);
    1333           0 :   u[3] = _mm_unpackhi_epi16(t[1], t[2]);
    1334             : 
    1335           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
    1336           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
    1337           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
    1338           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
    1339           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
    1340           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
    1341           0 :   v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
    1342           0 :   v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
    1343             : 
    1344           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1345           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1346           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1347           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1348           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1349           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1350           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1351           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1352             : 
    1353           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1354           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1355           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1356           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1357           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1358           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1359           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1360           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1361             : 
    1362           0 :   in[2] = _mm_packs_epi32(v[0], v[1]);
    1363           0 :   in[6] = _mm_packs_epi32(v[4], v[5]);
    1364           0 :   in[10] = _mm_packs_epi32(v[2], v[3]);
    1365           0 :   in[14] = _mm_packs_epi32(v[6], v[7]);
    1366             : 
    1367             :   // stage 2
    1368           0 :   u[0] = _mm_unpacklo_epi16(s[2], s[5]);
    1369           0 :   u[1] = _mm_unpackhi_epi16(s[2], s[5]);
    1370           0 :   u[2] = _mm_unpacklo_epi16(s[3], s[4]);
    1371           0 :   u[3] = _mm_unpackhi_epi16(s[3], s[4]);
    1372             : 
    1373           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    1374           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    1375           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    1376           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    1377           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    1378           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
    1379           0 :   v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    1380           0 :   v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    1381             : 
    1382           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1383           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1384           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1385           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1386           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1387           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1388           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1389           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1390             : 
    1391           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1392           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1393           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1394           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1395           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1396           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1397           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1398           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1399             : 
    1400           0 :   t[2] = _mm_packs_epi32(v[0], v[1]);
    1401           0 :   t[3] = _mm_packs_epi32(v[2], v[3]);
    1402           0 :   t[4] = _mm_packs_epi32(v[4], v[5]);
    1403           0 :   t[5] = _mm_packs_epi32(v[6], v[7]);
    1404             : 
    1405             :   // stage 3
    1406           0 :   p[0] = _mm_add_epi16(s[0], t[3]);
    1407           0 :   p[1] = _mm_add_epi16(s[1], t[2]);
    1408           0 :   p[2] = _mm_sub_epi16(s[1], t[2]);
    1409           0 :   p[3] = _mm_sub_epi16(s[0], t[3]);
    1410           0 :   p[4] = _mm_sub_epi16(s[7], t[4]);
    1411           0 :   p[5] = _mm_sub_epi16(s[6], t[5]);
    1412           0 :   p[6] = _mm_add_epi16(s[6], t[5]);
    1413           0 :   p[7] = _mm_add_epi16(s[7], t[4]);
    1414             : 
    1415             :   // stage 4
    1416           0 :   u[0] = _mm_unpacklo_epi16(p[1], p[6]);
    1417           0 :   u[1] = _mm_unpackhi_epi16(p[1], p[6]);
    1418           0 :   u[2] = _mm_unpacklo_epi16(p[2], p[5]);
    1419           0 :   u[3] = _mm_unpackhi_epi16(p[2], p[5]);
    1420             : 
    1421           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
    1422           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
    1423           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
    1424           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
    1425           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
    1426           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
    1427           0 :   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
    1428           0 :   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
    1429             : 
    1430           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1431           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1432           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1433           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1434           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1435           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1436           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1437           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1438             : 
    1439           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1440           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1441           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1442           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1443           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1444           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1445           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1446           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1447             : 
    1448           0 :   t[1] = _mm_packs_epi32(v[0], v[1]);
    1449           0 :   t[2] = _mm_packs_epi32(v[2], v[3]);
    1450           0 :   t[5] = _mm_packs_epi32(v[4], v[5]);
    1451           0 :   t[6] = _mm_packs_epi32(v[6], v[7]);
    1452             : 
    1453             :   // stage 5
    1454           0 :   s[0] = _mm_add_epi16(p[0], t[1]);
    1455           0 :   s[1] = _mm_sub_epi16(p[0], t[1]);
    1456           0 :   s[2] = _mm_add_epi16(p[3], t[2]);
    1457           0 :   s[3] = _mm_sub_epi16(p[3], t[2]);
    1458           0 :   s[4] = _mm_sub_epi16(p[4], t[5]);
    1459           0 :   s[5] = _mm_add_epi16(p[4], t[5]);
    1460           0 :   s[6] = _mm_sub_epi16(p[7], t[6]);
    1461           0 :   s[7] = _mm_add_epi16(p[7], t[6]);
    1462             : 
    1463             :   // stage 6
    1464           0 :   u[0] = _mm_unpacklo_epi16(s[0], s[7]);
    1465           0 :   u[1] = _mm_unpackhi_epi16(s[0], s[7]);
    1466           0 :   u[2] = _mm_unpacklo_epi16(s[1], s[6]);
    1467           0 :   u[3] = _mm_unpackhi_epi16(s[1], s[6]);
    1468           0 :   u[4] = _mm_unpacklo_epi16(s[2], s[5]);
    1469           0 :   u[5] = _mm_unpackhi_epi16(s[2], s[5]);
    1470           0 :   u[6] = _mm_unpacklo_epi16(s[3], s[4]);
    1471           0 :   u[7] = _mm_unpackhi_epi16(s[3], s[4]);
    1472             : 
    1473           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
    1474           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
    1475           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
    1476           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
    1477           0 :   v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
    1478           0 :   v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
    1479           0 :   v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
    1480           0 :   v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
    1481           0 :   v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
    1482           0 :   v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
    1483           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
    1484           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
    1485           0 :   v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
    1486           0 :   v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
    1487           0 :   v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
    1488           0 :   v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
    1489             : 
    1490           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1491           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1492           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1493           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1494           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1495           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1496           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1497           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1498           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    1499           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    1500           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    1501           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    1502           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    1503           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    1504           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    1505           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    1506             : 
    1507           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1508           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1509           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1510           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1511           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1512           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1513           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1514           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1515           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1516           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1517           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1518           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1519           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1520           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1521           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1522           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1523             : 
    1524           0 :   in[1] = _mm_packs_epi32(v[0], v[1]);
    1525           0 :   in[9] = _mm_packs_epi32(v[2], v[3]);
    1526           0 :   in[5] = _mm_packs_epi32(v[4], v[5]);
    1527           0 :   in[13] = _mm_packs_epi32(v[6], v[7]);
    1528           0 :   in[3] = _mm_packs_epi32(v[8], v[9]);
    1529           0 :   in[11] = _mm_packs_epi32(v[10], v[11]);
    1530           0 :   in[7] = _mm_packs_epi32(v[12], v[13]);
    1531           0 :   in[15] = _mm_packs_epi32(v[14], v[15]);
    1532           0 : }
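
Editorial note on stage 1 of fdct16_8col() above: the 16-point DCT is split
into an even half built from mirrored sums (the i[] values) and an odd half
built from mirrored differences (the s[] values); the even half produces the
even-indexed outputs in[0], in[2], ..., in[14] and the odd half the
odd-indexed ones.  A scalar restatement of the split, per column, is sketched
below; the function name is hypothetical.

static void fdct16_stage1_scalar(const int16_t in[16], int16_t even[8],
                                 int16_t odd[8]) {
  int k;
  for (k = 0; k < 8; ++k) {
    even[k] = (int16_t)(in[k] + in[15 - k]);    /* i[k] in the SSE2 code */
    odd[k] = (int16_t)(in[7 - k] - in[8 + k]);  /* s[k] in the SSE2 code */
  }
}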
    1533             : 
    1534           0 : static void fadst16_8col(__m128i *in) {
    1535             :   // perform 16x16 1-D ADST for 8 columns
    1536             :   __m128i s[16], x[16], u[32], v[32];
    1537           0 :   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
    1538           0 :   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    1539           0 :   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
    1540           0 :   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    1541           0 :   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
    1542           0 :   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    1543           0 :   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
    1544           0 :   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    1545           0 :   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
    1546           0 :   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    1547           0 :   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    1548           0 :   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    1549           0 :   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
    1550           0 :   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    1551           0 :   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
    1552           0 :   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    1553           0 :   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
    1554           0 :   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    1555           0 :   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
    1556           0 :   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    1557           0 :   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
    1558           0 :   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
    1559           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
    1560           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    1561           0 :   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
    1562           0 :   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
    1563           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
    1564           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1565           0 :   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1566           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1567           0 :   const __m128i kZero = _mm_set1_epi16(0);
    1568             : 
    1569           0 :   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
    1570           0 :   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
    1571           0 :   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
    1572           0 :   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
    1573           0 :   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
    1574           0 :   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
    1575           0 :   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
    1576           0 :   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
    1577           0 :   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
    1578           0 :   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
    1579           0 :   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
    1580           0 :   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
    1581           0 :   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
    1582           0 :   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
    1583           0 :   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
    1584           0 :   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
    1585             : 
    1586           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
    1587           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
    1588           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
    1589           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
    1590           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
    1591           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
    1592           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
    1593           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
    1594           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
    1595           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
    1596           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
    1597           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
    1598           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
    1599           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
    1600           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
    1601           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
    1602           0 :   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
    1603           0 :   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
    1604           0 :   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
    1605           0 :   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
    1606           0 :   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
    1607           0 :   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
    1608           0 :   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
    1609           0 :   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
    1610           0 :   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
    1611           0 :   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
    1612           0 :   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
    1613           0 :   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
    1614           0 :   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
    1615           0 :   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
    1616           0 :   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
    1617           0 :   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
    1618             : 
    1619           0 :   u[0] = _mm_add_epi32(v[0], v[16]);
    1620           0 :   u[1] = _mm_add_epi32(v[1], v[17]);
    1621           0 :   u[2] = _mm_add_epi32(v[2], v[18]);
    1622           0 :   u[3] = _mm_add_epi32(v[3], v[19]);
    1623           0 :   u[4] = _mm_add_epi32(v[4], v[20]);
    1624           0 :   u[5] = _mm_add_epi32(v[5], v[21]);
    1625           0 :   u[6] = _mm_add_epi32(v[6], v[22]);
    1626           0 :   u[7] = _mm_add_epi32(v[7], v[23]);
    1627           0 :   u[8] = _mm_add_epi32(v[8], v[24]);
    1628           0 :   u[9] = _mm_add_epi32(v[9], v[25]);
    1629           0 :   u[10] = _mm_add_epi32(v[10], v[26]);
    1630           0 :   u[11] = _mm_add_epi32(v[11], v[27]);
    1631           0 :   u[12] = _mm_add_epi32(v[12], v[28]);
    1632           0 :   u[13] = _mm_add_epi32(v[13], v[29]);
    1633           0 :   u[14] = _mm_add_epi32(v[14], v[30]);
    1634           0 :   u[15] = _mm_add_epi32(v[15], v[31]);
    1635           0 :   u[16] = _mm_sub_epi32(v[0], v[16]);
    1636           0 :   u[17] = _mm_sub_epi32(v[1], v[17]);
    1637           0 :   u[18] = _mm_sub_epi32(v[2], v[18]);
    1638           0 :   u[19] = _mm_sub_epi32(v[3], v[19]);
    1639           0 :   u[20] = _mm_sub_epi32(v[4], v[20]);
    1640           0 :   u[21] = _mm_sub_epi32(v[5], v[21]);
    1641           0 :   u[22] = _mm_sub_epi32(v[6], v[22]);
    1642           0 :   u[23] = _mm_sub_epi32(v[7], v[23]);
    1643           0 :   u[24] = _mm_sub_epi32(v[8], v[24]);
    1644           0 :   u[25] = _mm_sub_epi32(v[9], v[25]);
    1645           0 :   u[26] = _mm_sub_epi32(v[10], v[26]);
    1646           0 :   u[27] = _mm_sub_epi32(v[11], v[27]);
    1647           0 :   u[28] = _mm_sub_epi32(v[12], v[28]);
    1648           0 :   u[29] = _mm_sub_epi32(v[13], v[29]);
    1649           0 :   u[30] = _mm_sub_epi32(v[14], v[30]);
    1650           0 :   u[31] = _mm_sub_epi32(v[15], v[31]);
    1651             : 
    1652           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1653           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1654           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1655           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1656           0 :   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1657           0 :   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1658           0 :   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1659           0 :   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1660           0 :   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    1661           0 :   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    1662           0 :   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    1663           0 :   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    1664           0 :   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    1665           0 :   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    1666           0 :   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    1667           0 :   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    1668           0 :   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
    1669           0 :   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
    1670           0 :   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
    1671           0 :   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
    1672           0 :   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
    1673           0 :   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
    1674           0 :   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
    1675           0 :   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
    1676           0 :   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
    1677           0 :   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
    1678           0 :   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
    1679           0 :   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
    1680           0 :   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
    1681           0 :   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
    1682           0 :   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
    1683           0 :   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
    1684             : 
    1685           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
    1686           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
    1687           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
    1688           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
    1689           0 :   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
    1690           0 :   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
    1691           0 :   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
    1692           0 :   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
    1693           0 :   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    1694           0 :   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    1695           0 :   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    1696           0 :   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    1697           0 :   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    1698           0 :   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    1699           0 :   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    1700           0 :   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    1701           0 :   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
    1702           0 :   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
    1703           0 :   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
    1704           0 :   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
    1705           0 :   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
    1706           0 :   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
    1707           0 :   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
    1708           0 :   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
    1709           0 :   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
    1710           0 :   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
    1711           0 :   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
    1712           0 :   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
    1713           0 :   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
    1714           0 :   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
    1715           0 :   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
    1716           0 :   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
    1717             : 
    1718           0 :   s[0] = _mm_packs_epi32(u[0], u[1]);
    1719           0 :   s[1] = _mm_packs_epi32(u[2], u[3]);
    1720           0 :   s[2] = _mm_packs_epi32(u[4], u[5]);
    1721           0 :   s[3] = _mm_packs_epi32(u[6], u[7]);
    1722           0 :   s[4] = _mm_packs_epi32(u[8], u[9]);
    1723           0 :   s[5] = _mm_packs_epi32(u[10], u[11]);
    1724           0 :   s[6] = _mm_packs_epi32(u[12], u[13]);
    1725           0 :   s[7] = _mm_packs_epi32(u[14], u[15]);
    1726           0 :   s[8] = _mm_packs_epi32(u[16], u[17]);
    1727           0 :   s[9] = _mm_packs_epi32(u[18], u[19]);
    1728           0 :   s[10] = _mm_packs_epi32(u[20], u[21]);
    1729           0 :   s[11] = _mm_packs_epi32(u[22], u[23]);
    1730           0 :   s[12] = _mm_packs_epi32(u[24], u[25]);
    1731           0 :   s[13] = _mm_packs_epi32(u[26], u[27]);
    1732           0 :   s[14] = _mm_packs_epi32(u[28], u[29]);
    1733           0 :   s[15] = _mm_packs_epi32(u[30], u[31]);
    1734             : 
    1735             :   // stage 2
    1736           0 :   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
    1737           0 :   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
    1738           0 :   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
    1739           0 :   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
    1740           0 :   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
    1741           0 :   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
    1742           0 :   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    1743           0 :   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
    1744             : 
    1745           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
    1746           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
    1747           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
    1748           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
    1749           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
    1750           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
    1751           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
    1752           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
    1753           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
    1754           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
    1755           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
    1756           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
    1757           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
    1758           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
    1759           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
    1760           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
    1761             : 
    1762           0 :   u[0] = _mm_add_epi32(v[0], v[8]);
    1763           0 :   u[1] = _mm_add_epi32(v[1], v[9]);
    1764           0 :   u[2] = _mm_add_epi32(v[2], v[10]);
    1765           0 :   u[3] = _mm_add_epi32(v[3], v[11]);
    1766           0 :   u[4] = _mm_add_epi32(v[4], v[12]);
    1767           0 :   u[5] = _mm_add_epi32(v[5], v[13]);
    1768           0 :   u[6] = _mm_add_epi32(v[6], v[14]);
    1769           0 :   u[7] = _mm_add_epi32(v[7], v[15]);
    1770           0 :   u[8] = _mm_sub_epi32(v[0], v[8]);
    1771           0 :   u[9] = _mm_sub_epi32(v[1], v[9]);
    1772           0 :   u[10] = _mm_sub_epi32(v[2], v[10]);
    1773           0 :   u[11] = _mm_sub_epi32(v[3], v[11]);
    1774           0 :   u[12] = _mm_sub_epi32(v[4], v[12]);
    1775           0 :   u[13] = _mm_sub_epi32(v[5], v[13]);
    1776           0 :   u[14] = _mm_sub_epi32(v[6], v[14]);
    1777           0 :   u[15] = _mm_sub_epi32(v[7], v[15]);
    1778             : 
    1779           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1780           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1781           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1782           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1783           0 :   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1784           0 :   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1785           0 :   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1786           0 :   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1787           0 :   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    1788           0 :   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    1789           0 :   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    1790           0 :   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    1791           0 :   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    1792           0 :   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    1793           0 :   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    1794           0 :   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    1795             : 
    1796           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
    1797           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
    1798           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
    1799           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
    1800           0 :   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
    1801           0 :   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
    1802           0 :   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
    1803           0 :   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
    1804           0 :   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    1805           0 :   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    1806           0 :   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    1807           0 :   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    1808           0 :   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    1809           0 :   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    1810           0 :   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    1811           0 :   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    1812             : 
    1813           0 :   x[0] = _mm_add_epi16(s[0], s[4]);
    1814           0 :   x[1] = _mm_add_epi16(s[1], s[5]);
    1815           0 :   x[2] = _mm_add_epi16(s[2], s[6]);
    1816           0 :   x[3] = _mm_add_epi16(s[3], s[7]);
    1817           0 :   x[4] = _mm_sub_epi16(s[0], s[4]);
    1818           0 :   x[5] = _mm_sub_epi16(s[1], s[5]);
    1819           0 :   x[6] = _mm_sub_epi16(s[2], s[6]);
    1820           0 :   x[7] = _mm_sub_epi16(s[3], s[7]);
    1821           0 :   x[8] = _mm_packs_epi32(u[0], u[1]);
    1822           0 :   x[9] = _mm_packs_epi32(u[2], u[3]);
    1823           0 :   x[10] = _mm_packs_epi32(u[4], u[5]);
    1824           0 :   x[11] = _mm_packs_epi32(u[6], u[7]);
    1825           0 :   x[12] = _mm_packs_epi32(u[8], u[9]);
    1826           0 :   x[13] = _mm_packs_epi32(u[10], u[11]);
    1827           0 :   x[14] = _mm_packs_epi32(u[12], u[13]);
    1828           0 :   x[15] = _mm_packs_epi32(u[14], u[15]);
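
The recurring pattern in stages 2-4 is a fixed-point rotation: 16-bit inputs are interleaved with _mm_unpacklo/hi_epi16, multiplied against an interleaved constant pair by _mm_madd_epi16 (which sums adjacent 16-bit products into 32-bit lanes), rounded with k__DCT_CONST_ROUNDING, arithmetically shifted by DCT_CONST_BITS, and packed back to 16 bits with saturation. Below is a minimal scalar sketch of one output lane, assuming DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h; the names are illustrative and not part of this file.

#include <stdint.h>

/* Illustrative mirrors of DCT_CONST_BITS / DCT_CONST_ROUNDING. */
#define SKETCH_DCT_CONST_BITS 14
#define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

/* One lane of the madd / round / shift sequence above: (a, b) is an
 * unpacked input pair, (c0, c1) the interleaved constant pair. */
static int16_t sketch_butterfly_lane(int16_t a, int16_t b, int16_t c0,
                                     int16_t c1) {
  const int32_t sum = (int32_t)a * c0 + (int32_t)b * c1;   /* _mm_madd_epi16 */
  const int32_t rounded = sum + SKETCH_DCT_CONST_ROUNDING; /* rounding add   */
  /* _mm_srai_epi32 then _mm_packs_epi32; the pack's saturation is omitted. */
  return (int16_t)(rounded >> SKETCH_DCT_CONST_BITS);
}
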
    1829             : 
    1830             :   // stage 3
    1831           0 :   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
    1832           0 :   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
    1833           0 :   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
    1834           0 :   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
    1835           0 :   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
    1836           0 :   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
    1837           0 :   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
    1838           0 :   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
    1839             : 
    1840           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
    1841           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
    1842           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
    1843           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
    1844           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
    1845           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
    1846           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
    1847           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
    1848           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
    1849           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
    1850           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
    1851           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
    1852           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
    1853           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
    1854           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
    1855           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
    1856             : 
    1857           0 :   u[0] = _mm_add_epi32(v[0], v[4]);
    1858           0 :   u[1] = _mm_add_epi32(v[1], v[5]);
    1859           0 :   u[2] = _mm_add_epi32(v[2], v[6]);
    1860           0 :   u[3] = _mm_add_epi32(v[3], v[7]);
    1861           0 :   u[4] = _mm_sub_epi32(v[0], v[4]);
    1862           0 :   u[5] = _mm_sub_epi32(v[1], v[5]);
    1863           0 :   u[6] = _mm_sub_epi32(v[2], v[6]);
    1864           0 :   u[7] = _mm_sub_epi32(v[3], v[7]);
    1865           0 :   u[8] = _mm_add_epi32(v[8], v[12]);
    1866           0 :   u[9] = _mm_add_epi32(v[9], v[13]);
    1867           0 :   u[10] = _mm_add_epi32(v[10], v[14]);
    1868           0 :   u[11] = _mm_add_epi32(v[11], v[15]);
    1869           0 :   u[12] = _mm_sub_epi32(v[8], v[12]);
    1870           0 :   u[13] = _mm_sub_epi32(v[9], v[13]);
    1871           0 :   u[14] = _mm_sub_epi32(v[10], v[14]);
    1872           0 :   u[15] = _mm_sub_epi32(v[11], v[15]);
    1873             : 
    1874           0 :   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1875           0 :   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1876           0 :   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1877           0 :   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1878           0 :   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1879           0 :   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1880           0 :   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1881           0 :   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1882           0 :   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    1883           0 :   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    1884           0 :   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    1885           0 :   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    1886           0 :   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    1887           0 :   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    1888           0 :   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    1889           0 :   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    1890             : 
    1891           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1892           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1893           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1894           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1895           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1896           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1897           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1898           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1899           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1900           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1901           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1902           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1903           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1904           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1905           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1906           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1907             : 
    1908           0 :   s[0] = _mm_add_epi16(x[0], x[2]);
    1909           0 :   s[1] = _mm_add_epi16(x[1], x[3]);
    1910           0 :   s[2] = _mm_sub_epi16(x[0], x[2]);
    1911           0 :   s[3] = _mm_sub_epi16(x[1], x[3]);
    1912           0 :   s[4] = _mm_packs_epi32(v[0], v[1]);
    1913           0 :   s[5] = _mm_packs_epi32(v[2], v[3]);
    1914           0 :   s[6] = _mm_packs_epi32(v[4], v[5]);
    1915           0 :   s[7] = _mm_packs_epi32(v[6], v[7]);
    1916           0 :   s[8] = _mm_add_epi16(x[8], x[10]);
    1917           0 :   s[9] = _mm_add_epi16(x[9], x[11]);
    1918           0 :   s[10] = _mm_sub_epi16(x[8], x[10]);
    1919           0 :   s[11] = _mm_sub_epi16(x[9], x[11]);
    1920           0 :   s[12] = _mm_packs_epi32(v[8], v[9]);
    1921           0 :   s[13] = _mm_packs_epi32(v[10], v[11]);
    1922           0 :   s[14] = _mm_packs_epi32(v[12], v[13]);
    1923           0 :   s[15] = _mm_packs_epi32(v[14], v[15]);
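
The k__cospi_* operands fed to _mm_madd_epi16 are defined earlier in this file, outside this excerpt; from their use against unpacked inputs they are presumably 128-bit vectors holding a repeated, interleaved pair of 16-bit cosine constants, so one madd applies both coefficients of a 2x2 rotation at once. A minimal sketch of such a pair under that assumption; the helper name is illustrative only.

#include <emmintrin.h>  /* SSE2 */

/* Builds an (a, b, a, b, a, b, a, b) vector, matching the layout the
 * unpacked inputs above are multiplied against. */
static __m128i sketch_pair_epi16(int16_t a, int16_t b) {
  return _mm_setr_epi16(a, b, a, b, a, b, a, b);
}

/* e.g. a k__cospi_p08_p24-style operand would correspond to
 * sketch_pair_epi16(cospi_8_64, cospi_24_64). */
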
    1924             : 
    1925             :   // stage 4
    1926           0 :   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
    1927           0 :   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
    1928           0 :   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
    1929           0 :   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
    1930           0 :   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
    1931           0 :   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
    1932           0 :   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    1933           0 :   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
    1934             : 
    1935           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
    1936           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
    1937           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
    1938           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
    1939           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    1940           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
    1941           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    1942           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    1943           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
    1944           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
    1945           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
    1946           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
    1947           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
    1948           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
    1949           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
    1950           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
    1951             : 
    1952           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1953           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1954           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1955           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1956           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1957           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1958           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1959           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1960           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    1961           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    1962           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    1963           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    1964           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    1965           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    1966           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    1967           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    1968             : 
    1969           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1970           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1971           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1972           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1973           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1974           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1975           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1976           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1977           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1978           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1979           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1980           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1981           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1982           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1983           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1984           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1985             : 
    1986           0 :   in[0] = s[0];
    1987           0 :   in[1] = _mm_sub_epi16(kZero, s[8]);
    1988           0 :   in[2] = s[12];
    1989           0 :   in[3] = _mm_sub_epi16(kZero, s[4]);
    1990           0 :   in[4] = _mm_packs_epi32(v[4], v[5]);
    1991           0 :   in[5] = _mm_packs_epi32(v[12], v[13]);
    1992           0 :   in[6] = _mm_packs_epi32(v[8], v[9]);
    1993           0 :   in[7] = _mm_packs_epi32(v[0], v[1]);
    1994           0 :   in[8] = _mm_packs_epi32(v[2], v[3]);
    1995           0 :   in[9] = _mm_packs_epi32(v[10], v[11]);
    1996           0 :   in[10] = _mm_packs_epi32(v[14], v[15]);
    1997           0 :   in[11] = _mm_packs_epi32(v[6], v[7]);
    1998           0 :   in[12] = s[5];
    1999           0 :   in[13] = _mm_sub_epi16(kZero, s[13]);
    2000           0 :   in[14] = s[9];
    2001           0 :   in[15] = _mm_sub_epi16(kZero, s[1]);
    2002           0 : }
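
SSE2 has no dedicated 16-bit negation intrinsic, which is why the sign flips on in[1], in[3], in[13] and in[15] above are written as a subtraction from the zero vector kZero (defined earlier in this file, outside this excerpt). A standalone equivalent, for illustration only:

#include <emmintrin.h>  /* SSE2 */

/* 0 - x, the same idiom as _mm_sub_epi16(kZero, s[...]) above. */
static __m128i sketch_neg_epi16(__m128i x) {
  return _mm_sub_epi16(_mm_setzero_si128(), x);
}
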
    2003             : 
    2004           0 : static void fdct16_sse2(__m128i *in0, __m128i *in1) {
    2005           0 :   fdct16_8col(in0);
    2006           0 :   fdct16_8col(in1);
    2007           0 :   array_transpose_16x16(in0, in1);
    2008           0 : }
    2009             : 
    2010           0 : static void fadst16_sse2(__m128i *in0, __m128i *in1) {
    2011           0 :   fadst16_8col(in0);
    2012           0 :   fadst16_8col(in1);
    2013           0 :   array_transpose_16x16(in0, in1);
    2014           0 : }
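
fdct16_sse2() and fadst16_sse2() share one composition pattern: run an 8-column kernel on each half of the 16x16 block, then transpose so the following pass again sees rows in registers. A sketch of that pattern as a generic wrapper, assuming this file's helpers are in scope; the typedef and wrapper name are illustrative, not part of the library.

#include <emmintrin.h>  /* SSE2 types; array_transpose_16x16() is the helper
                           used by the wrappers above. */

typedef void (*sketch_16col_fn)(__m128i *io);  /* e.g. fadst16_8col */

static void sketch_transform_16x16(sketch_16col_fn fn, __m128i *in0,
                                   __m128i *in1) {
  fn(in0);                          /* left 8 columns  */
  fn(in1);                          /* right 8 columns */
  array_transpose_16x16(in0, in1);  /* columns become rows for the next pass */
}
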
    2015             : 
    2016           0 : void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
    2017             :                        int tx_type) {
    2018             :   __m128i in0[16], in1[16];
    2019             : 
    2020           0 :   switch (tx_type) {
    2021           0 :     case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
    2022             :     case ADST_DCT:
    2023           0 :       load_buffer_16x16(input, in0, in1, stride);
    2024           0 :       fadst16_sse2(in0, in1);
    2025           0 :       right_shift_16x16(in0, in1);
    2026           0 :       fdct16_sse2(in0, in1);
    2027           0 :       write_buffer_16x16(output, in0, in1, 16);
    2028           0 :       break;
    2029             :     case DCT_ADST:
    2030           0 :       load_buffer_16x16(input, in0, in1, stride);
    2031           0 :       fdct16_sse2(in0, in1);
    2032           0 :       right_shift_16x16(in0, in1);
    2033           0 :       fadst16_sse2(in0, in1);
    2034           0 :       write_buffer_16x16(output, in0, in1, 16);
    2035           0 :       break;
    2036             :     case ADST_ADST:
    2037           0 :       load_buffer_16x16(input, in0, in1, stride);
    2038           0 :       fadst16_sse2(in0, in1);
    2039           0 :       right_shift_16x16(in0, in1);
    2040           0 :       fadst16_sse2(in0, in1);
    2041           0 :       write_buffer_16x16(output, in0, in1, 16);
    2042           0 :       break;
    2043           0 :     default: assert(0); break;
    2044             :   }
    2045           0 : }
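
vp9_fht16x16_sse2() dispatches on tx_type: pure DCT_DCT defers to vpx_fdct16x16_sse2(), while the hybrid cases load the block, run the first 1-D transform, re-scale the intermediate values with right_shift_16x16(), run the second 1-D transform, and write the coefficients. An illustrative caller is sketched below, assuming the TX_TYPE constants (DCT_DCT, ADST_DCT, ...) and tran_low_t from the vp9/vpx headers included at the top of this file; the function itself is not part of the library.

/* Forward-transform one 16x16 residual block with ADST_DCT, one of the
 * hybrid modes handled by the switch above.  residual points at 16x16
 * int16_t samples with the given stride; coeffs must hold 256 entries. */
static void sketch_fht16x16_call(const int16_t *residual, int stride,
                                 tran_low_t *coeffs) {
  vp9_fht16x16_sse2(residual, coeffs, stride, ADST_DCT);
}
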

Generated by: LCOV version 1.13