LCOV - code coverage report
Current view: top level - third_party/aom/av1/encoder/x86 - dct_intrin_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 3003 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 61 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <emmintrin.h>  // SSE2
      14             : 
      15             : #include "./aom_dsp_rtcd.h"
      16             : #include "./av1_rtcd.h"
      17             : #include "aom_dsp/txfm_common.h"
      18             : #include "aom_dsp/x86/fwd_txfm_sse2.h"
      19             : #include "aom_dsp/x86/synonyms.h"
      20             : #include "aom_dsp/x86/txfm_common_sse2.h"
      21             : #include "aom_ports/mem.h"
      22             : 
      23           0 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
      24             :                                    int stride, int flipud, int fliplr) {
      25           0 :   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
      26           0 :   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
      27             :   __m128i mask;
      28             : 
      29           0 :   if (!flipud) {
      30           0 :     in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      31           0 :     in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      32           0 :     in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      33           0 :     in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      34             :   } else {
      35           0 :     in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      36           0 :     in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      37           0 :     in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      38           0 :     in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      39             :   }
      40             : 
      41           0 :   if (fliplr) {
      42           0 :     in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
      43           0 :     in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
      44           0 :     in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
      45           0 :     in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
      46             :   }
      47             : 
      48           0 :   in[0] = _mm_slli_epi16(in[0], 4);
      49           0 :   in[1] = _mm_slli_epi16(in[1], 4);
      50           0 :   in[2] = _mm_slli_epi16(in[2], 4);
      51           0 :   in[3] = _mm_slli_epi16(in[3], 4);
      52             : 
      53           0 :   mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
      54           0 :   in[0] = _mm_add_epi16(in[0], mask);
      55           0 :   in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
      56           0 : }
      57             : 
      58           0 : static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
      59           0 :   const __m128i kOne = _mm_set1_epi16(1);
      60           0 :   __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
      61           0 :   __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
      62           0 :   __m128i out01 = _mm_add_epi16(in01, kOne);
      63           0 :   __m128i out23 = _mm_add_epi16(in23, kOne);
      64           0 :   out01 = _mm_srai_epi16(out01, 2);
      65           0 :   out23 = _mm_srai_epi16(out23, 2);
      66           0 :   store_output(&out01, (output + 0 * 8));
      67           0 :   store_output(&out23, (output + 1 * 8));
      68           0 : }
      69             : 
      70           0 : static INLINE void transpose_4x4(__m128i *res) {
      71             :   // Combine and transpose
      72             :   // 00 01 02 03 20 21 22 23
      73             :   // 10 11 12 13 30 31 32 33
      74           0 :   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
      75           0 :   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
      76             : 
      77             :   // 00 10 01 11 02 12 03 13
      78             :   // 20 30 21 31 22 32 23 33
      79           0 :   res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
      80           0 :   res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
      81             : 
      82             :   // 00 10 20 30 01 11 21 31
      83             :   // 02 12 22 32 03 13 23 33
      84             :   // only use the first 4 16-bit integers
      85           0 :   res[1] = _mm_unpackhi_epi64(res[0], res[0]);
      86           0 :   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
      87           0 : }
      88             : 
      89           0 : static void fdct4_sse2(__m128i *in) {
      90           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
      91           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
      92           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
      93           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
      94           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
      95             : 
      96             :   __m128i u[4], v[4];
      97           0 :   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
      98           0 :   u[1] = _mm_unpacklo_epi16(in[3], in[2]);
      99             : 
     100           0 :   v[0] = _mm_add_epi16(u[0], u[1]);
     101           0 :   v[1] = _mm_sub_epi16(u[0], u[1]);
     102             : 
     103           0 :   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
     104           0 :   u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
     105           0 :   u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
     106           0 :   u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
     107             : 
     108           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
     109           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
     110           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
     111           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
     112           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
     113           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
     114           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
     115           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
     116             : 
     117           0 :   in[0] = _mm_packs_epi32(u[0], u[1]);
     118           0 :   in[1] = _mm_packs_epi32(u[2], u[3]);
     119           0 :   transpose_4x4(in);
     120           0 : }
     121             : 
// Forward 4-point ADST (asymmetric DST) over the four rows in in[0..3]
// (one row per register, low 64 bits).  Uses the sinpi_x_9 constant set via
// 16x16->32-bit multiply-accumulate, applies dct_const_round_shift, and
// writes the results back two rows per register in transposed order so the
// same routine serves both dimensions.
static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  // in7 = in[0] + in[1]; feeds the x1 term (scaled by sinpi_3_9 below).
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  // Interleave with a partner (or zero) so each madd computes either a
  // two-term dot product or a plain widening multiply.
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // sinpi_3_9 * in[3]

  // Combine the partial terms into the four 32-bit outputs x0..x3.
  u[0] = _mm_add_epi32(v[0], v[1]);               // x0
  u[1] = _mm_sub_epi32(v[2], v[6]);               // x1 term minus in[3] part
  u[2] = _mm_add_epi32(v[3], v[4]);               // x2
  u[3] = _mm_sub_epi32(u[2], u[0]);               // x2 - x0 (partial x3)
  u[4] = _mm_slli_epi32(v[5], 2);                 // 4 * s4
  u[5] = _mm_sub_epi32(u[4], v[5]);               // 3 * s4
  u[6] = _mm_add_epi32(u[3], u[5]);               // x3

  // dct_const_round_shift on the four outputs (x0, x1, x2, x3).
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  // Pack: in[0] = rows 0|2, in[1] = rows 1|3, matching transpose_4x4's
  // expected input layout.
  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}
     169             : 
     170             : #if CONFIG_EXT_TX
     171           0 : static void fidtx4_sse2(__m128i *in) {
     172           0 :   const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
     173           0 :   const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
     174           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     175             : 
     176             :   __m128i v0, v1, v2, v3;
     177             :   __m128i u0, u1, u2, u3;
     178             : 
     179           0 :   v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
     180           0 :   v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
     181           0 :   v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
     182           0 :   v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
     183             : 
     184           0 :   u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
     185           0 :   u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
     186           0 :   u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
     187           0 :   u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
     188             : 
     189           0 :   v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     190           0 :   v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     191           0 :   v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     192           0 :   v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     193             : 
     194           0 :   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     195           0 :   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     196           0 :   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     197           0 :   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     198             : 
     199           0 :   in[0] = _mm_packs_epi32(u0, u2);
     200           0 :   in[1] = _mm_packs_epi32(u1, u3);
     201           0 :   transpose_4x4(in);
     202           0 : }
     203             : #endif  // CONFIG_EXT_TX
     204             : 
     205           0 : void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
     206             :                      int tx_type) {
     207             :   __m128i in[4];
     208             : 
     209           0 :   switch (tx_type) {
     210           0 :     case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
     211             :     case ADST_DCT:
     212           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     213           0 :       fadst4_sse2(in);
     214           0 :       fdct4_sse2(in);
     215           0 :       write_buffer_4x4(output, in);
     216           0 :       break;
     217             :     case DCT_ADST:
     218           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     219           0 :       fdct4_sse2(in);
     220           0 :       fadst4_sse2(in);
     221           0 :       write_buffer_4x4(output, in);
     222           0 :       break;
     223             :     case ADST_ADST:
     224           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     225           0 :       fadst4_sse2(in);
     226           0 :       fadst4_sse2(in);
     227           0 :       write_buffer_4x4(output, in);
     228           0 :       break;
     229             : #if CONFIG_EXT_TX
     230             :     case FLIPADST_DCT:
     231           0 :       load_buffer_4x4(input, in, stride, 1, 0);
     232           0 :       fadst4_sse2(in);
     233           0 :       fdct4_sse2(in);
     234           0 :       write_buffer_4x4(output, in);
     235           0 :       break;
     236             :     case DCT_FLIPADST:
     237           0 :       load_buffer_4x4(input, in, stride, 0, 1);
     238           0 :       fdct4_sse2(in);
     239           0 :       fadst4_sse2(in);
     240           0 :       write_buffer_4x4(output, in);
     241           0 :       break;
     242             :     case FLIPADST_FLIPADST:
     243           0 :       load_buffer_4x4(input, in, stride, 1, 1);
     244           0 :       fadst4_sse2(in);
     245           0 :       fadst4_sse2(in);
     246           0 :       write_buffer_4x4(output, in);
     247           0 :       break;
     248             :     case ADST_FLIPADST:
     249           0 :       load_buffer_4x4(input, in, stride, 0, 1);
     250           0 :       fadst4_sse2(in);
     251           0 :       fadst4_sse2(in);
     252           0 :       write_buffer_4x4(output, in);
     253           0 :       break;
     254             :     case FLIPADST_ADST:
     255           0 :       load_buffer_4x4(input, in, stride, 1, 0);
     256           0 :       fadst4_sse2(in);
     257           0 :       fadst4_sse2(in);
     258           0 :       write_buffer_4x4(output, in);
     259           0 :       break;
     260             :     case IDTX:
     261           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     262           0 :       fidtx4_sse2(in);
     263           0 :       fidtx4_sse2(in);
     264           0 :       write_buffer_4x4(output, in);
     265           0 :       break;
     266             :     case V_DCT:
     267           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     268           0 :       fdct4_sse2(in);
     269           0 :       fidtx4_sse2(in);
     270           0 :       write_buffer_4x4(output, in);
     271           0 :       break;
     272             :     case H_DCT:
     273           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     274           0 :       fidtx4_sse2(in);
     275           0 :       fdct4_sse2(in);
     276           0 :       write_buffer_4x4(output, in);
     277           0 :       break;
     278             :     case V_ADST:
     279           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     280           0 :       fadst4_sse2(in);
     281           0 :       fidtx4_sse2(in);
     282           0 :       write_buffer_4x4(output, in);
     283           0 :       break;
     284             :     case H_ADST:
     285           0 :       load_buffer_4x4(input, in, stride, 0, 0);
     286           0 :       fidtx4_sse2(in);
     287           0 :       fadst4_sse2(in);
     288           0 :       write_buffer_4x4(output, in);
     289           0 :       break;
     290             :     case V_FLIPADST:
     291           0 :       load_buffer_4x4(input, in, stride, 1, 0);
     292           0 :       fadst4_sse2(in);
     293           0 :       fidtx4_sse2(in);
     294           0 :       write_buffer_4x4(output, in);
     295           0 :       break;
     296             :     case H_FLIPADST:
     297           0 :       load_buffer_4x4(input, in, stride, 0, 1);
     298           0 :       fidtx4_sse2(in);
     299           0 :       fadst4_sse2(in);
     300           0 :       write_buffer_4x4(output, in);
     301           0 :       break;
     302             : #endif  // CONFIG_EXT_TX
     303           0 :     default: assert(0);
     304             :   }
     305           0 : }
     306             : 
     307           0 : void av1_fdct8x8_quant_sse2(const int16_t *input, int stride,
     308             :                             int16_t *coeff_ptr, intptr_t n_coeffs,
     309             :                             int skip_block, const int16_t *zbin_ptr,
     310             :                             const int16_t *round_ptr, const int16_t *quant_ptr,
     311             :                             const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
     312             :                             int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     313             :                             uint16_t *eob_ptr, const int16_t *scan_ptr,
     314             :                             const int16_t *iscan_ptr) {
     315             :   __m128i zero;
     316             :   int pass;
     317             :   // Constants
     318             :   //    When we use them, in one case, they are all the same. In all others
     319             :   //    it's a pair of them that we need to repeat four times. This is done
     320             :   //    by constructing the 32 bit constant corresponding to that pair.
     321           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
     322           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
     323           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
     324           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     325           0 :   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
     326           0 :   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
     327           0 :   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
     328           0 :   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     329           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
     330             :   // Load input
     331           0 :   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
     332           0 :   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
     333           0 :   __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
     334           0 :   __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
     335           0 :   __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
     336           0 :   __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
     337           0 :   __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
     338           0 :   __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
     339             :   __m128i *in[8];
     340           0 :   int index = 0;
     341             : 
     342             :   (void)scan_ptr;
     343             :   (void)zbin_ptr;
     344             :   (void)quant_shift_ptr;
     345             :   (void)coeff_ptr;
     346             : 
     347             :   // Pre-condition input (shift by two)
     348           0 :   in0 = _mm_slli_epi16(in0, 2);
     349           0 :   in1 = _mm_slli_epi16(in1, 2);
     350           0 :   in2 = _mm_slli_epi16(in2, 2);
     351           0 :   in3 = _mm_slli_epi16(in3, 2);
     352           0 :   in4 = _mm_slli_epi16(in4, 2);
     353           0 :   in5 = _mm_slli_epi16(in5, 2);
     354           0 :   in6 = _mm_slli_epi16(in6, 2);
     355           0 :   in7 = _mm_slli_epi16(in7, 2);
     356             : 
     357           0 :   in[0] = &in0;
     358           0 :   in[1] = &in1;
     359           0 :   in[2] = &in2;
     360           0 :   in[3] = &in3;
     361           0 :   in[4] = &in4;
     362           0 :   in[5] = &in5;
     363           0 :   in[6] = &in6;
     364           0 :   in[7] = &in7;
     365             : 
     366             :   // We do two passes, first the columns, then the rows. The results of the
     367             :   // first pass are transposed so that the same column code can be reused. The
     368             :   // results of the second pass are also transposed so that the rows (processed
     369             :   // as columns) are put back in row positions.
     370           0 :   for (pass = 0; pass < 2; pass++) {
     371             :     // To store results of each pass before the transpose.
     372             :     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
     373             :     // Add/subtract
     374           0 :     const __m128i q0 = _mm_add_epi16(in0, in7);
     375           0 :     const __m128i q1 = _mm_add_epi16(in1, in6);
     376           0 :     const __m128i q2 = _mm_add_epi16(in2, in5);
     377           0 :     const __m128i q3 = _mm_add_epi16(in3, in4);
     378           0 :     const __m128i q4 = _mm_sub_epi16(in3, in4);
     379           0 :     const __m128i q5 = _mm_sub_epi16(in2, in5);
     380           0 :     const __m128i q6 = _mm_sub_epi16(in1, in6);
     381           0 :     const __m128i q7 = _mm_sub_epi16(in0, in7);
     382             :     // Work on first four results
     383             :     {
     384             :       // Add/subtract
     385           0 :       const __m128i r0 = _mm_add_epi16(q0, q3);
     386           0 :       const __m128i r1 = _mm_add_epi16(q1, q2);
     387           0 :       const __m128i r2 = _mm_sub_epi16(q1, q2);
     388           0 :       const __m128i r3 = _mm_sub_epi16(q0, q3);
     389             :       // Interleave to do the multiply by constants which gets us into 32bits
     390           0 :       const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
     391           0 :       const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
     392           0 :       const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
     393           0 :       const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
     394           0 :       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
     395           0 :       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
     396           0 :       const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
     397           0 :       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
     398           0 :       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
     399           0 :       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
     400           0 :       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
     401           0 :       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
     402             :       // dct_const_round_shift
     403           0 :       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     404           0 :       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     405           0 :       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     406           0 :       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     407           0 :       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     408           0 :       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     409           0 :       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     410           0 :       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
     411           0 :       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     412           0 :       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     413           0 :       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     414           0 :       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     415           0 :       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     416           0 :       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     417           0 :       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     418           0 :       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     419             :       // Combine
     420           0 :       res0 = _mm_packs_epi32(w0, w1);
     421           0 :       res4 = _mm_packs_epi32(w2, w3);
     422           0 :       res2 = _mm_packs_epi32(w4, w5);
     423           0 :       res6 = _mm_packs_epi32(w6, w7);
     424             :     }
     425             :     // Work on next four results
     426             :     {
     427             :       // Interleave to do the multiply by constants which gets us into 32bits
     428           0 :       const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
     429           0 :       const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
     430           0 :       const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
     431           0 :       const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
     432           0 :       const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
     433           0 :       const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
     434             :       // dct_const_round_shift
     435           0 :       const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
     436           0 :       const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
     437           0 :       const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
     438           0 :       const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
     439           0 :       const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
     440           0 :       const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
     441           0 :       const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
     442           0 :       const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
     443             :       // Combine
     444           0 :       const __m128i r0 = _mm_packs_epi32(s0, s1);
     445           0 :       const __m128i r1 = _mm_packs_epi32(s2, s3);
     446             :       // Add/subtract
     447           0 :       const __m128i x0 = _mm_add_epi16(q4, r0);
     448           0 :       const __m128i x1 = _mm_sub_epi16(q4, r0);
     449           0 :       const __m128i x2 = _mm_sub_epi16(q7, r1);
     450           0 :       const __m128i x3 = _mm_add_epi16(q7, r1);
     451             :       // Interleave to do the multiply by constants which gets us into 32bits
     452           0 :       const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
     453           0 :       const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
     454           0 :       const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
     455           0 :       const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
     456           0 :       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
     457           0 :       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
     458           0 :       const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
     459           0 :       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
     460           0 :       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
     461           0 :       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
     462           0 :       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
     463           0 :       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
     464             :       // dct_const_round_shift
     465           0 :       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     466           0 :       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     467           0 :       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     468           0 :       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     469           0 :       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     470           0 :       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     471           0 :       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     472           0 :       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
     473           0 :       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     474           0 :       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     475           0 :       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     476           0 :       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     477           0 :       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     478           0 :       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     479           0 :       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     480           0 :       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     481             :       // Combine
     482           0 :       res1 = _mm_packs_epi32(w0, w1);
     483           0 :       res7 = _mm_packs_epi32(w2, w3);
     484           0 :       res5 = _mm_packs_epi32(w4, w5);
     485           0 :       res3 = _mm_packs_epi32(w6, w7);
     486             :     }
     487             :     // Transpose the 8x8.
     488             :     {
     489             :       // 00 01 02 03 04 05 06 07
     490             :       // 10 11 12 13 14 15 16 17
     491             :       // 20 21 22 23 24 25 26 27
     492             :       // 30 31 32 33 34 35 36 37
     493             :       // 40 41 42 43 44 45 46 47
     494             :       // 50 51 52 53 54 55 56 57
     495             :       // 60 61 62 63 64 65 66 67
     496             :       // 70 71 72 73 74 75 76 77
     497           0 :       const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
     498           0 :       const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
     499           0 :       const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
     500           0 :       const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
     501           0 :       const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
     502           0 :       const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
     503           0 :       const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
     504           0 :       const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
     505             :       // 00 10 01 11 02 12 03 13
     506             :       // 20 30 21 31 22 32 23 33
     507             :       // 04 14 05 15 06 16 07 17
     508             :       // 24 34 25 35 26 36 27 37
     509             :       // 40 50 41 51 42 52 43 53
     510             :       // 60 70 61 71 62 72 63 73
      511             :       // 44 54 45 55 46 56 47 57
     512             :       // 64 74 65 75 66 76 67 77
     513           0 :       const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
     514           0 :       const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
     515           0 :       const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
     516           0 :       const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
     517           0 :       const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
     518           0 :       const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
     519           0 :       const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
     520           0 :       const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
     521             :       // 00 10 20 30 01 11 21 31
     522             :       // 40 50 60 70 41 51 61 71
     523             :       // 02 12 22 32 03 13 23 33
     524             :       // 42 52 62 72 43 53 63 73
      525             :       // 04 14 24 34 05 15 25 35
      526             :       // 44 54 64 74 45 55 65 75
     527             :       // 06 16 26 36 07 17 27 37
     528             :       // 46 56 66 76 47 57 67 77
     529           0 :       in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
     530           0 :       in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
     531           0 :       in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
     532           0 :       in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
     533           0 :       in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
     534           0 :       in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
     535           0 :       in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
     536           0 :       in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
     537             :       // 00 10 20 30 40 50 60 70
     538             :       // 01 11 21 31 41 51 61 71
     539             :       // 02 12 22 32 42 52 62 72
     540             :       // 03 13 23 33 43 53 63 73
     541             :       // 04 14 24 34 44 54 64 74
     542             :       // 05 15 25 35 45 55 65 75
     543             :       // 06 16 26 36 46 56 66 76
     544             :       // 07 17 27 37 47 57 67 77
     545             :     }
     546             :   }
     547             :   // Post-condition output and store it
     548             :   {
     549             :     // Post-condition (division by two)
     550             :     //    division of two 16 bits signed numbers using shifts
     551             :     //    n / 2 = (n - (n >> 15)) >> 1
     552           0 :     const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
     553           0 :     const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
     554           0 :     const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
     555           0 :     const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
     556           0 :     const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
     557           0 :     const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
     558           0 :     const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
     559           0 :     const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
     560           0 :     in0 = _mm_sub_epi16(in0, sign_in0);
     561           0 :     in1 = _mm_sub_epi16(in1, sign_in1);
     562           0 :     in2 = _mm_sub_epi16(in2, sign_in2);
     563           0 :     in3 = _mm_sub_epi16(in3, sign_in3);
     564           0 :     in4 = _mm_sub_epi16(in4, sign_in4);
     565           0 :     in5 = _mm_sub_epi16(in5, sign_in5);
     566           0 :     in6 = _mm_sub_epi16(in6, sign_in6);
     567           0 :     in7 = _mm_sub_epi16(in7, sign_in7);
     568           0 :     in0 = _mm_srai_epi16(in0, 1);
     569           0 :     in1 = _mm_srai_epi16(in1, 1);
     570           0 :     in2 = _mm_srai_epi16(in2, 1);
     571           0 :     in3 = _mm_srai_epi16(in3, 1);
     572           0 :     in4 = _mm_srai_epi16(in4, 1);
     573           0 :     in5 = _mm_srai_epi16(in5, 1);
     574           0 :     in6 = _mm_srai_epi16(in6, 1);
     575           0 :     in7 = _mm_srai_epi16(in7, 1);
     576             :   }
     577             : 
     578           0 :   iscan_ptr += n_coeffs;
     579           0 :   qcoeff_ptr += n_coeffs;
     580           0 :   dqcoeff_ptr += n_coeffs;
     581           0 :   n_coeffs = -n_coeffs;
     582           0 :   zero = _mm_setzero_si128();
     583             : 
     584           0 :   if (!skip_block) {
     585             :     __m128i eob;
     586             :     __m128i round, quant, dequant;
     587             :     {
     588             :       __m128i coeff0, coeff1;
     589             : 
     590             :       // Setup global values
     591             :       {
     592           0 :         round = _mm_load_si128((const __m128i *)round_ptr);
     593           0 :         quant = _mm_load_si128((const __m128i *)quant_ptr);
     594           0 :         dequant = _mm_load_si128((const __m128i *)dequant_ptr);
     595             :       }
     596             : 
     597             :       {
     598             :         __m128i coeff0_sign, coeff1_sign;
     599             :         __m128i qcoeff0, qcoeff1;
     600             :         __m128i qtmp0, qtmp1;
     601             :         // Do DC and first 15 AC
     602           0 :         coeff0 = *in[0];
     603           0 :         coeff1 = *in[1];
     604             : 
     605             :         // Poor man's sign extract
     606           0 :         coeff0_sign = _mm_srai_epi16(coeff0, 15);
     607           0 :         coeff1_sign = _mm_srai_epi16(coeff1, 15);
     608           0 :         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     609           0 :         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     610           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     611           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     612             : 
     613           0 :         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     614           0 :         round = _mm_unpackhi_epi64(round, round);
     615           0 :         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     616           0 :         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     617           0 :         quant = _mm_unpackhi_epi64(quant, quant);
     618           0 :         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     619             : 
     620             :         // Reinsert signs
     621           0 :         qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
     622           0 :         qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
     623           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     624           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     625             : 
     626           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
     627           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
     628             : 
     629           0 :         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     630           0 :         dequant = _mm_unpackhi_epi64(dequant, dequant);
     631           0 :         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     632             : 
     633           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
     634           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
     635             :       }
     636             : 
     637             :       {
     638             :         // Scan for eob
     639             :         __m128i zero_coeff0, zero_coeff1;
     640             :         __m128i nzero_coeff0, nzero_coeff1;
     641             :         __m128i iscan0, iscan1;
     642             :         __m128i eob1;
     643           0 :         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
     644           0 :         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
     645           0 :         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
     646           0 :         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
     647           0 :         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
     648           0 :         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
     649             :         // Add one to convert from indices to counts
     650           0 :         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
     651           0 :         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
     652           0 :         eob = _mm_and_si128(iscan0, nzero_coeff0);
     653           0 :         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
     654           0 :         eob = _mm_max_epi16(eob, eob1);
     655             :       }
     656           0 :       n_coeffs += 8 * 2;
     657             :     }
     658             : 
     659             :     // AC only loop
     660           0 :     index = 2;
     661           0 :     while (n_coeffs < 0) {
     662             :       __m128i coeff0, coeff1;
     663             :       {
     664             :         __m128i coeff0_sign, coeff1_sign;
     665             :         __m128i qcoeff0, qcoeff1;
     666             :         __m128i qtmp0, qtmp1;
     667             : 
     668           0 :         assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
     669           0 :         coeff0 = *in[index];
     670           0 :         coeff1 = *in[index + 1];
     671             : 
     672             :         // Poor man's sign extract
     673           0 :         coeff0_sign = _mm_srai_epi16(coeff0, 15);
     674           0 :         coeff1_sign = _mm_srai_epi16(coeff1, 15);
     675           0 :         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     676           0 :         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     677           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     678           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     679             : 
     680           0 :         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     681           0 :         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     682           0 :         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     683           0 :         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     684             : 
     685             :         // Reinsert signs
     686           0 :         qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
     687           0 :         qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
     688           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     689           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     690             : 
     691           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
     692           0 :         _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
     693             : 
     694           0 :         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     695           0 :         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     696             : 
     697           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
     698           0 :         _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
     699             :       }
     700             : 
     701             :       {
     702             :         // Scan for eob
     703             :         __m128i zero_coeff0, zero_coeff1;
     704             :         __m128i nzero_coeff0, nzero_coeff1;
     705             :         __m128i iscan0, iscan1;
     706             :         __m128i eob0, eob1;
     707           0 :         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
     708           0 :         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
     709           0 :         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
     710           0 :         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
     711           0 :         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
     712           0 :         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
     713             :         // Add one to convert from indices to counts
     714           0 :         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
     715           0 :         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
     716           0 :         eob0 = _mm_and_si128(iscan0, nzero_coeff0);
     717           0 :         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
     718           0 :         eob0 = _mm_max_epi16(eob0, eob1);
     719           0 :         eob = _mm_max_epi16(eob, eob0);
     720             :       }
     721           0 :       n_coeffs += 8 * 2;
     722           0 :       index += 2;
     723             :     }
     724             : 
     725             :     // Accumulate EOB
     726             :     {
     727             :       __m128i eob_shuffled;
     728           0 :       eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
     729           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     730           0 :       eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
     731           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     732           0 :       eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
     733           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     734           0 :       *eob_ptr = _mm_extract_epi16(eob, 1);
     735             :     }
     736             :   } else {
     737             :     do {
     738           0 :       _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
     739           0 :       _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
     740           0 :       _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
     741           0 :       _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
     742           0 :       n_coeffs += 8 * 2;
     743           0 :     } while (n_coeffs < 0);
     744           0 :     *eob_ptr = 0;
     745             :   }
     746           0 : }
     747             : 
     748             : // load 8x8 array
     749           0 : static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
     750             :                                    int stride, int flipud, int fliplr) {
     751           0 :   if (!flipud) {
     752           0 :     in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     753           0 :     in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     754           0 :     in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     755           0 :     in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     756           0 :     in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     757           0 :     in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     758           0 :     in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     759           0 :     in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     760             :   } else {
     761           0 :     in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     762           0 :     in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     763           0 :     in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     764           0 :     in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     765           0 :     in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     766           0 :     in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     767           0 :     in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     768           0 :     in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     769             :   }
     770             : 
     771           0 :   if (fliplr) {
     772           0 :     in[0] = mm_reverse_epi16(in[0]);
     773           0 :     in[1] = mm_reverse_epi16(in[1]);
     774           0 :     in[2] = mm_reverse_epi16(in[2]);
     775           0 :     in[3] = mm_reverse_epi16(in[3]);
     776           0 :     in[4] = mm_reverse_epi16(in[4]);
     777           0 :     in[5] = mm_reverse_epi16(in[5]);
     778           0 :     in[6] = mm_reverse_epi16(in[6]);
     779           0 :     in[7] = mm_reverse_epi16(in[7]);
     780             :   }
     781             : 
     782           0 :   in[0] = _mm_slli_epi16(in[0], 2);
     783           0 :   in[1] = _mm_slli_epi16(in[1], 2);
     784           0 :   in[2] = _mm_slli_epi16(in[2], 2);
     785           0 :   in[3] = _mm_slli_epi16(in[3], 2);
     786           0 :   in[4] = _mm_slli_epi16(in[4], 2);
     787           0 :   in[5] = _mm_slli_epi16(in[5], 2);
     788           0 :   in[6] = _mm_slli_epi16(in[6], 2);
     789           0 :   in[7] = _mm_slli_epi16(in[7], 2);
     790           0 : }
     791             : 
     792             : // right shift and rounding
     793           0 : static INLINE void right_shift_8x8(__m128i *res, const int bit) {
     794           0 :   __m128i sign0 = _mm_srai_epi16(res[0], 15);
     795           0 :   __m128i sign1 = _mm_srai_epi16(res[1], 15);
     796           0 :   __m128i sign2 = _mm_srai_epi16(res[2], 15);
     797           0 :   __m128i sign3 = _mm_srai_epi16(res[3], 15);
     798           0 :   __m128i sign4 = _mm_srai_epi16(res[4], 15);
     799           0 :   __m128i sign5 = _mm_srai_epi16(res[5], 15);
     800           0 :   __m128i sign6 = _mm_srai_epi16(res[6], 15);
     801           0 :   __m128i sign7 = _mm_srai_epi16(res[7], 15);
     802             : 
     803           0 :   if (bit == 2) {
     804           0 :     const __m128i const_rounding = _mm_set1_epi16(1);
     805           0 :     res[0] = _mm_adds_epi16(res[0], const_rounding);
     806           0 :     res[1] = _mm_adds_epi16(res[1], const_rounding);
     807           0 :     res[2] = _mm_adds_epi16(res[2], const_rounding);
     808           0 :     res[3] = _mm_adds_epi16(res[3], const_rounding);
     809           0 :     res[4] = _mm_adds_epi16(res[4], const_rounding);
     810           0 :     res[5] = _mm_adds_epi16(res[5], const_rounding);
     811           0 :     res[6] = _mm_adds_epi16(res[6], const_rounding);
     812           0 :     res[7] = _mm_adds_epi16(res[7], const_rounding);
     813             :   }
     814             : 
     815           0 :   res[0] = _mm_sub_epi16(res[0], sign0);
     816           0 :   res[1] = _mm_sub_epi16(res[1], sign1);
     817           0 :   res[2] = _mm_sub_epi16(res[2], sign2);
     818           0 :   res[3] = _mm_sub_epi16(res[3], sign3);
     819           0 :   res[4] = _mm_sub_epi16(res[4], sign4);
     820           0 :   res[5] = _mm_sub_epi16(res[5], sign5);
     821           0 :   res[6] = _mm_sub_epi16(res[6], sign6);
     822           0 :   res[7] = _mm_sub_epi16(res[7], sign7);
     823             : 
     824           0 :   if (bit == 1) {
     825           0 :     res[0] = _mm_srai_epi16(res[0], 1);
     826           0 :     res[1] = _mm_srai_epi16(res[1], 1);
     827           0 :     res[2] = _mm_srai_epi16(res[2], 1);
     828           0 :     res[3] = _mm_srai_epi16(res[3], 1);
     829           0 :     res[4] = _mm_srai_epi16(res[4], 1);
     830           0 :     res[5] = _mm_srai_epi16(res[5], 1);
     831           0 :     res[6] = _mm_srai_epi16(res[6], 1);
     832           0 :     res[7] = _mm_srai_epi16(res[7], 1);
     833             :   } else {
     834           0 :     res[0] = _mm_srai_epi16(res[0], 2);
     835           0 :     res[1] = _mm_srai_epi16(res[1], 2);
     836           0 :     res[2] = _mm_srai_epi16(res[2], 2);
     837           0 :     res[3] = _mm_srai_epi16(res[3], 2);
     838           0 :     res[4] = _mm_srai_epi16(res[4], 2);
     839           0 :     res[5] = _mm_srai_epi16(res[5], 2);
     840           0 :     res[6] = _mm_srai_epi16(res[6], 2);
     841           0 :     res[7] = _mm_srai_epi16(res[7], 2);
     842             :   }
     843           0 : }
     844             : 
     845             : // write 8x8 array
     846           0 : static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
     847             :                                     int stride) {
     848           0 :   store_output(&res[0], (output + 0 * stride));
     849           0 :   store_output(&res[1], (output + 1 * stride));
     850           0 :   store_output(&res[2], (output + 2 * stride));
     851           0 :   store_output(&res[3], (output + 3 * stride));
     852           0 :   store_output(&res[4], (output + 4 * stride));
     853           0 :   store_output(&res[5], (output + 5 * stride));
     854           0 :   store_output(&res[6], (output + 6 * stride));
     855           0 :   store_output(&res[7], (output + 7 * stride));
     856           0 : }
     857             : 
     858             : // perform in-place transpose
     859           0 : static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
     860           0 :   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
     861           0 :   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
     862           0 :   const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
     863           0 :   const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
     864           0 :   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
     865           0 :   const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
     866           0 :   const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
     867           0 :   const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
     868             :   // 00 10 01 11 02 12 03 13
     869             :   // 20 30 21 31 22 32 23 33
     870             :   // 04 14 05 15 06 16 07 17
     871             :   // 24 34 25 35 26 36 27 37
     872             :   // 40 50 41 51 42 52 43 53
     873             :   // 60 70 61 71 62 72 63 73
     874             :   // 44 54 45 55 46 56 47 57
     875             :   // 64 74 65 75 66 76 67 77
     876           0 :   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
     877           0 :   const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
     878           0 :   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
     879           0 :   const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
     880           0 :   const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
     881           0 :   const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
     882           0 :   const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
     883           0 :   const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
     884             :   // 00 10 20 30 01 11 21 31
     885             :   // 40 50 60 70 41 51 61 71
     886             :   // 02 12 22 32 03 13 23 33
     887             :   // 42 52 62 72 43 53 63 73
     888             :   // 04 14 24 34 05 15 25 35
     889             :   // 44 54 64 74 45 55 65 75
     890             :   // 06 16 26 36 07 17 27 37
     891             :   // 46 56 66 76 47 57 67 77
     892           0 :   res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
     893           0 :   res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
     894           0 :   res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
     895           0 :   res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
     896           0 :   res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
     897           0 :   res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
     898           0 :   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
     899           0 :   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
     900             :   // 00 10 20 30 40 50 60 70
     901             :   // 01 11 21 31 41 51 61 71
     902             :   // 02 12 22 32 42 52 62 72
     903             :   // 03 13 23 33 43 53 63 73
     904             :   // 04 14 24 34 44 54 64 74
     905             :   // 05 15 25 35 45 55 65 75
     906             :   // 06 16 26 36 46 56 66 76
     907             :   // 07 17 27 37 47 57 67 77
     908           0 : }
     909             : 
// In-place 8x8 forward DCT over the 8 row vectors of |in| (8 int16 lanes
// each), followed by a transpose so the result is laid out column-wise for
// the next pass. Butterfly products use _mm_madd_epi16 on interleaved lane
// pairs against paired cosine constants, with DCT_CONST_ROUNDING /
// DCT_CONST_BITS fixed-point rounding after every multiply stage.
// Outputs are written to the bit-reversed/interleaved slots expected by
// the scan order: even coefficients from stage 1, odd from stage 4.
static void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  // Butterfly: sums (s0..s3) feed the even half, differences (s4..s7)
  // feed the odd half of the transform.
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  // (madd on interleaved 16-bit pairs yields 32-bit dot products).
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // Pack back to 16-bit and write the even-index outputs.
  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  // Pack back to 16-bit and write the odd-index outputs.
  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in, in);
}
    1049             : 
// In-place 1-D 8-point forward ADST over the 8 columns held in in[0..7]
// (8 lanes of 16-bit coefficients per register), finishing with an 8x8
// transpose so a second call processes the other dimension.  Intermediate
// products are widened to 32 bits via _mm_madd_epi16, rounded with
// DCT_CONST_ROUNDING, shifted by DCT_CONST_BITS, then packed (with
// saturation) back to 16 bits.
static void fadst8_sse2(__m128i *in) {
  // Constants: packed cosine pairs for the butterfly multiplies.
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input (ADST input permutation)
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition (butterfly add/sub, still in 32-bit precision)
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding (difference terms w8..w15 only; the sum terms
  // w0..w7 are combined further below before they are rounded)
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  v0 = _mm_add_epi32(w0, w4);
  v1 = _mm_add_epi32(w1, w5);
  v2 = _mm_add_epi32(w2, w6);
  v3 = _mm_add_epi32(w3, w7);
  v4 = _mm_sub_epi32(w0, w4);
  v5 = _mm_sub_epi32(w1, w5);
  v6 = _mm_sub_epi32(w2, w6);
  v7 = _mm_sub_epi32(w3, w7);

  w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);

  // in[4..7] are used as scratch for the stage-2 inputs below.
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_packs_epi32(v0, v1);
  s1 = _mm_packs_epi32(v2, v3);
  s2 = _mm_packs_epi32(v4, v5);
  s3 = _mm_packs_epi32(v6, v7);

  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // Final output permutation; odd outputs are negated.
  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  array_transpose_8x8(in, in);
}
    1287             : 
    1288             : #if CONFIG_EXT_TX
    1289           0 : static void fidtx8_sse2(__m128i *in) {
    1290           0 :   in[0] = _mm_slli_epi16(in[0], 1);
    1291           0 :   in[1] = _mm_slli_epi16(in[1], 1);
    1292           0 :   in[2] = _mm_slli_epi16(in[2], 1);
    1293           0 :   in[3] = _mm_slli_epi16(in[3], 1);
    1294           0 :   in[4] = _mm_slli_epi16(in[4], 1);
    1295           0 :   in[5] = _mm_slli_epi16(in[5], 1);
    1296           0 :   in[6] = _mm_slli_epi16(in[6], 1);
    1297           0 :   in[7] = _mm_slli_epi16(in[7], 1);
    1298             : 
    1299           0 :   array_transpose_8x8(in, in);
    1300           0 : }
    1301             : #endif  // CONFIG_EXT_TX
    1302             : 
    1303           0 : void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
    1304             :                      int tx_type) {
    1305             :   __m128i in[8];
    1306             : 
    1307           0 :   switch (tx_type) {
    1308           0 :     case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
    1309             :     case ADST_DCT:
    1310           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1311           0 :       fadst8_sse2(in);
    1312           0 :       fdct8_sse2(in);
    1313           0 :       right_shift_8x8(in, 1);
    1314           0 :       write_buffer_8x8(output, in, 8);
    1315           0 :       break;
    1316             :     case DCT_ADST:
    1317           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1318           0 :       fdct8_sse2(in);
    1319           0 :       fadst8_sse2(in);
    1320           0 :       right_shift_8x8(in, 1);
    1321           0 :       write_buffer_8x8(output, in, 8);
    1322           0 :       break;
    1323             :     case ADST_ADST:
    1324           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1325           0 :       fadst8_sse2(in);
    1326           0 :       fadst8_sse2(in);
    1327           0 :       right_shift_8x8(in, 1);
    1328           0 :       write_buffer_8x8(output, in, 8);
    1329           0 :       break;
    1330             : #if CONFIG_EXT_TX
    1331             :     case FLIPADST_DCT:
    1332           0 :       load_buffer_8x8(input, in, stride, 1, 0);
    1333           0 :       fadst8_sse2(in);
    1334           0 :       fdct8_sse2(in);
    1335           0 :       right_shift_8x8(in, 1);
    1336           0 :       write_buffer_8x8(output, in, 8);
    1337           0 :       break;
    1338             :     case DCT_FLIPADST:
    1339           0 :       load_buffer_8x8(input, in, stride, 0, 1);
    1340           0 :       fdct8_sse2(in);
    1341           0 :       fadst8_sse2(in);
    1342           0 :       right_shift_8x8(in, 1);
    1343           0 :       write_buffer_8x8(output, in, 8);
    1344           0 :       break;
    1345             :     case FLIPADST_FLIPADST:
    1346           0 :       load_buffer_8x8(input, in, stride, 1, 1);
    1347           0 :       fadst8_sse2(in);
    1348           0 :       fadst8_sse2(in);
    1349           0 :       right_shift_8x8(in, 1);
    1350           0 :       write_buffer_8x8(output, in, 8);
    1351           0 :       break;
    1352             :     case ADST_FLIPADST:
    1353           0 :       load_buffer_8x8(input, in, stride, 0, 1);
    1354           0 :       fadst8_sse2(in);
    1355           0 :       fadst8_sse2(in);
    1356           0 :       right_shift_8x8(in, 1);
    1357           0 :       write_buffer_8x8(output, in, 8);
    1358           0 :       break;
    1359             :     case FLIPADST_ADST:
    1360           0 :       load_buffer_8x8(input, in, stride, 1, 0);
    1361           0 :       fadst8_sse2(in);
    1362           0 :       fadst8_sse2(in);
    1363           0 :       right_shift_8x8(in, 1);
    1364           0 :       write_buffer_8x8(output, in, 8);
    1365           0 :       break;
    1366             :     case IDTX:
    1367           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1368           0 :       fidtx8_sse2(in);
    1369           0 :       fidtx8_sse2(in);
    1370           0 :       right_shift_8x8(in, 1);
    1371           0 :       write_buffer_8x8(output, in, 8);
    1372           0 :       break;
    1373             :     case V_DCT:
    1374           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1375           0 :       fdct8_sse2(in);
    1376           0 :       fidtx8_sse2(in);
    1377           0 :       right_shift_8x8(in, 1);
    1378           0 :       write_buffer_8x8(output, in, 8);
    1379           0 :       break;
    1380             :     case H_DCT:
    1381           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1382           0 :       fidtx8_sse2(in);
    1383           0 :       fdct8_sse2(in);
    1384           0 :       right_shift_8x8(in, 1);
    1385           0 :       write_buffer_8x8(output, in, 8);
    1386           0 :       break;
    1387             :     case V_ADST:
    1388           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1389           0 :       fadst8_sse2(in);
    1390           0 :       fidtx8_sse2(in);
    1391           0 :       right_shift_8x8(in, 1);
    1392           0 :       write_buffer_8x8(output, in, 8);
    1393           0 :       break;
    1394             :     case H_ADST:
    1395           0 :       load_buffer_8x8(input, in, stride, 0, 0);
    1396           0 :       fidtx8_sse2(in);
    1397           0 :       fadst8_sse2(in);
    1398           0 :       right_shift_8x8(in, 1);
    1399           0 :       write_buffer_8x8(output, in, 8);
    1400           0 :       break;
    1401             :     case V_FLIPADST:
    1402           0 :       load_buffer_8x8(input, in, stride, 1, 0);
    1403           0 :       fadst8_sse2(in);
    1404           0 :       fidtx8_sse2(in);
    1405           0 :       right_shift_8x8(in, 1);
    1406           0 :       write_buffer_8x8(output, in, 8);
    1407           0 :       break;
    1408             :     case H_FLIPADST:
    1409           0 :       load_buffer_8x8(input, in, stride, 0, 1);
    1410           0 :       fidtx8_sse2(in);
    1411           0 :       fadst8_sse2(in);
    1412           0 :       right_shift_8x8(in, 1);
    1413           0 :       write_buffer_8x8(output, in, 8);
    1414           0 :       break;
    1415             : #endif  // CONFIG_EXT_TX
    1416           0 :     default: assert(0);
    1417             :   }
    1418           0 : }
    1419             : 
    1420           0 : static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
    1421             :                                      __m128i *in1, int stride, int flipud,
    1422             :                                      int fliplr) {
    1423             :   // Load 4 8x8 blocks
    1424           0 :   const int16_t *topL = input;
    1425           0 :   const int16_t *topR = input + 8;
    1426           0 :   const int16_t *botL = input + 8 * stride;
    1427           0 :   const int16_t *botR = input + 8 * stride + 8;
    1428             : 
    1429             :   const int16_t *tmp;
    1430             : 
    1431           0 :   if (flipud) {
    1432             :     // Swap left columns
    1433           0 :     tmp = topL;
    1434           0 :     topL = botL;
    1435           0 :     botL = tmp;
    1436             :     // Swap right columns
    1437           0 :     tmp = topR;
    1438           0 :     topR = botR;
    1439           0 :     botR = tmp;
    1440             :   }
    1441             : 
    1442           0 :   if (fliplr) {
    1443             :     // Swap top rows
    1444           0 :     tmp = topL;
    1445           0 :     topL = topR;
    1446           0 :     topR = tmp;
    1447             :     // Swap bottom rows
    1448           0 :     tmp = botL;
    1449           0 :     botL = botR;
    1450           0 :     botR = tmp;
    1451             :   }
    1452             : 
    1453             :   // load first 8 columns
    1454           0 :   load_buffer_8x8(topL, in0, stride, flipud, fliplr);
    1455           0 :   load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
    1456             : 
    1457             :   // load second 8 columns
    1458           0 :   load_buffer_8x8(topR, in1, stride, flipud, fliplr);
    1459           0 :   load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
    1460           0 : }
    1461             : 
    1462           0 : static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
    1463             :                                       __m128i *in1, int stride) {
    1464             :   // write first 8 columns
    1465           0 :   write_buffer_8x8(output, in0, stride);
    1466           0 :   write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
    1467             :   // write second 8 columns
    1468           0 :   output += 8;
    1469           0 :   write_buffer_8x8(output, in1, stride);
    1470           0 :   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
    1471           0 : }
    1472             : 
    1473           0 : static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
    1474             :   __m128i tbuf[8];
    1475           0 :   array_transpose_8x8(res0, res0);
    1476           0 :   array_transpose_8x8(res1, tbuf);
    1477           0 :   array_transpose_8x8(res0 + 8, res1);
    1478           0 :   array_transpose_8x8(res1 + 8, res1 + 8);
    1479             : 
    1480           0 :   res0[8] = tbuf[0];
    1481           0 :   res0[9] = tbuf[1];
    1482           0 :   res0[10] = tbuf[2];
    1483           0 :   res0[11] = tbuf[3];
    1484           0 :   res0[12] = tbuf[4];
    1485           0 :   res0[13] = tbuf[5];
    1486           0 :   res0[14] = tbuf[6];
    1487           0 :   res0[15] = tbuf[7];
    1488           0 : }
    1489             : 
    1490           0 : static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
    1491             :   // perform rounding operations
    1492           0 :   right_shift_8x8(res0, 2);
    1493           0 :   right_shift_8x8(res0 + 8, 2);
    1494           0 :   right_shift_8x8(res1, 2);
    1495           0 :   right_shift_8x8(res1 + 8, 2);
    1496           0 : }
    1497             : 
    1498           0 : static void fdct16_8col(__m128i *in) {
    1499             :   // perform 16x16 1-D DCT for 8 columns
    1500             :   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
    1501           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
    1502           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1503           0 :   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1504           0 :   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
    1505           0 :   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    1506           0 :   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    1507           0 :   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
    1508           0 :   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    1509           0 :   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
    1510           0 :   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    1511           0 :   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
    1512           0 :   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
    1513           0 :   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
    1514           0 :   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
    1515           0 :   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
    1516           0 :   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
    1517           0 :   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
    1518           0 :   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
    1519           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1520             : 
    1521             :   // stage 1
    1522           0 :   i[0] = _mm_add_epi16(in[0], in[15]);
    1523           0 :   i[1] = _mm_add_epi16(in[1], in[14]);
    1524           0 :   i[2] = _mm_add_epi16(in[2], in[13]);
    1525           0 :   i[3] = _mm_add_epi16(in[3], in[12]);
    1526           0 :   i[4] = _mm_add_epi16(in[4], in[11]);
    1527           0 :   i[5] = _mm_add_epi16(in[5], in[10]);
    1528           0 :   i[6] = _mm_add_epi16(in[6], in[9]);
    1529           0 :   i[7] = _mm_add_epi16(in[7], in[8]);
    1530             : 
    1531           0 :   s[0] = _mm_sub_epi16(in[7], in[8]);
    1532           0 :   s[1] = _mm_sub_epi16(in[6], in[9]);
    1533           0 :   s[2] = _mm_sub_epi16(in[5], in[10]);
    1534           0 :   s[3] = _mm_sub_epi16(in[4], in[11]);
    1535           0 :   s[4] = _mm_sub_epi16(in[3], in[12]);
    1536           0 :   s[5] = _mm_sub_epi16(in[2], in[13]);
    1537           0 :   s[6] = _mm_sub_epi16(in[1], in[14]);
    1538           0 :   s[7] = _mm_sub_epi16(in[0], in[15]);
    1539             : 
    1540           0 :   p[0] = _mm_add_epi16(i[0], i[7]);
    1541           0 :   p[1] = _mm_add_epi16(i[1], i[6]);
    1542           0 :   p[2] = _mm_add_epi16(i[2], i[5]);
    1543           0 :   p[3] = _mm_add_epi16(i[3], i[4]);
    1544           0 :   p[4] = _mm_sub_epi16(i[3], i[4]);
    1545           0 :   p[5] = _mm_sub_epi16(i[2], i[5]);
    1546           0 :   p[6] = _mm_sub_epi16(i[1], i[6]);
    1547           0 :   p[7] = _mm_sub_epi16(i[0], i[7]);
    1548             : 
    1549           0 :   u[0] = _mm_add_epi16(p[0], p[3]);
    1550           0 :   u[1] = _mm_add_epi16(p[1], p[2]);
    1551           0 :   u[2] = _mm_sub_epi16(p[1], p[2]);
    1552           0 :   u[3] = _mm_sub_epi16(p[0], p[3]);
    1553             : 
    1554           0 :   v[0] = _mm_unpacklo_epi16(u[0], u[1]);
    1555           0 :   v[1] = _mm_unpackhi_epi16(u[0], u[1]);
    1556           0 :   v[2] = _mm_unpacklo_epi16(u[2], u[3]);
    1557           0 :   v[3] = _mm_unpackhi_epi16(u[2], u[3]);
    1558             : 
    1559           0 :   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
    1560           0 :   u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
    1561           0 :   u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
    1562           0 :   u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
    1563           0 :   u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
    1564           0 :   u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
    1565           0 :   u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
    1566           0 :   u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
    1567             : 
    1568           0 :   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    1569           0 :   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    1570           0 :   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    1571           0 :   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    1572           0 :   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    1573           0 :   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    1574           0 :   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    1575           0 :   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    1576             : 
    1577           0 :   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
    1578           0 :   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
    1579           0 :   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
    1580           0 :   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
    1581           0 :   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
    1582           0 :   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
    1583           0 :   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
    1584           0 :   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
    1585             : 
    1586           0 :   in[0] = _mm_packs_epi32(u[0], u[1]);
    1587           0 :   in[4] = _mm_packs_epi32(u[4], u[5]);
    1588           0 :   in[8] = _mm_packs_epi32(u[2], u[3]);
    1589           0 :   in[12] = _mm_packs_epi32(u[6], u[7]);
    1590             : 
    1591           0 :   u[0] = _mm_unpacklo_epi16(p[5], p[6]);
    1592           0 :   u[1] = _mm_unpackhi_epi16(p[5], p[6]);
    1593           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    1594           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    1595           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    1596           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    1597             : 
    1598           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1599           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1600           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1601           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1602             : 
    1603           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1604           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1605           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1606           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1607             : 
    1608           0 :   u[0] = _mm_packs_epi32(v[0], v[1]);
    1609           0 :   u[1] = _mm_packs_epi32(v[2], v[3]);
    1610             : 
    1611           0 :   t[0] = _mm_add_epi16(p[4], u[0]);
    1612           0 :   t[1] = _mm_sub_epi16(p[4], u[0]);
    1613           0 :   t[2] = _mm_sub_epi16(p[7], u[1]);
    1614           0 :   t[3] = _mm_add_epi16(p[7], u[1]);
    1615             : 
    1616           0 :   u[0] = _mm_unpacklo_epi16(t[0], t[3]);
    1617           0 :   u[1] = _mm_unpackhi_epi16(t[0], t[3]);
    1618           0 :   u[2] = _mm_unpacklo_epi16(t[1], t[2]);
    1619           0 :   u[3] = _mm_unpackhi_epi16(t[1], t[2]);
    1620             : 
    1621           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
    1622           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
    1623           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
    1624           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
    1625           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
    1626           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
    1627           0 :   v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
    1628           0 :   v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
    1629             : 
    1630           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1631           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1632           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1633           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1634           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1635           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1636           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1637           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1638             : 
    1639           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1640           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1641           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1642           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1643           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1644           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1645           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1646           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1647             : 
    1648           0 :   in[2] = _mm_packs_epi32(v[0], v[1]);
    1649           0 :   in[6] = _mm_packs_epi32(v[4], v[5]);
    1650           0 :   in[10] = _mm_packs_epi32(v[2], v[3]);
    1651           0 :   in[14] = _mm_packs_epi32(v[6], v[7]);
    1652             : 
    1653             :   // stage 2
    1654           0 :   u[0] = _mm_unpacklo_epi16(s[2], s[5]);
    1655           0 :   u[1] = _mm_unpackhi_epi16(s[2], s[5]);
    1656           0 :   u[2] = _mm_unpacklo_epi16(s[3], s[4]);
    1657           0 :   u[3] = _mm_unpackhi_epi16(s[3], s[4]);
    1658             : 
    1659           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    1660           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    1661           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    1662           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    1663           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    1664           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
    1665           0 :   v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    1666           0 :   v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    1667             : 
    1668           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1669           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1670           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1671           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1672           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1673           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1674           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1675           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1676             : 
    1677           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1678           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1679           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1680           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1681           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1682           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1683           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1684           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1685             : 
    1686           0 :   t[2] = _mm_packs_epi32(v[0], v[1]);
    1687           0 :   t[3] = _mm_packs_epi32(v[2], v[3]);
    1688           0 :   t[4] = _mm_packs_epi32(v[4], v[5]);
    1689           0 :   t[5] = _mm_packs_epi32(v[6], v[7]);
    1690             : 
    1691             :   // stage 3
    1692           0 :   p[0] = _mm_add_epi16(s[0], t[3]);
    1693           0 :   p[1] = _mm_add_epi16(s[1], t[2]);
    1694           0 :   p[2] = _mm_sub_epi16(s[1], t[2]);
    1695           0 :   p[3] = _mm_sub_epi16(s[0], t[3]);
    1696           0 :   p[4] = _mm_sub_epi16(s[7], t[4]);
    1697           0 :   p[5] = _mm_sub_epi16(s[6], t[5]);
    1698           0 :   p[6] = _mm_add_epi16(s[6], t[5]);
    1699           0 :   p[7] = _mm_add_epi16(s[7], t[4]);
    1700             : 
    1701             :   // stage 4
    1702           0 :   u[0] = _mm_unpacklo_epi16(p[1], p[6]);
    1703           0 :   u[1] = _mm_unpackhi_epi16(p[1], p[6]);
    1704           0 :   u[2] = _mm_unpacklo_epi16(p[2], p[5]);
    1705           0 :   u[3] = _mm_unpackhi_epi16(p[2], p[5]);
    1706             : 
    1707           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
    1708           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
    1709           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
    1710           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
    1711           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
    1712           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
    1713           0 :   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
    1714           0 :   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
    1715             : 
    1716           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1717           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1718           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1719           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1720           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1721           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1722           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1723           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1724             : 
    1725           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1726           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1727           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1728           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1729           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1730           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1731           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1732           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1733             : 
    1734           0 :   t[1] = _mm_packs_epi32(v[0], v[1]);
    1735           0 :   t[2] = _mm_packs_epi32(v[2], v[3]);
    1736           0 :   t[5] = _mm_packs_epi32(v[4], v[5]);
    1737           0 :   t[6] = _mm_packs_epi32(v[6], v[7]);
    1738             : 
    1739             :   // stage 5
    1740           0 :   s[0] = _mm_add_epi16(p[0], t[1]);
    1741           0 :   s[1] = _mm_sub_epi16(p[0], t[1]);
    1742           0 :   s[2] = _mm_sub_epi16(p[3], t[2]);
    1743           0 :   s[3] = _mm_add_epi16(p[3], t[2]);
    1744           0 :   s[4] = _mm_add_epi16(p[4], t[5]);
    1745           0 :   s[5] = _mm_sub_epi16(p[4], t[5]);
    1746           0 :   s[6] = _mm_sub_epi16(p[7], t[6]);
    1747           0 :   s[7] = _mm_add_epi16(p[7], t[6]);
    1748             : 
    1749             :   // stage 6
    1750           0 :   u[0] = _mm_unpacklo_epi16(s[0], s[7]);
    1751           0 :   u[1] = _mm_unpackhi_epi16(s[0], s[7]);
    1752           0 :   u[2] = _mm_unpacklo_epi16(s[1], s[6]);
    1753           0 :   u[3] = _mm_unpackhi_epi16(s[1], s[6]);
    1754           0 :   u[4] = _mm_unpacklo_epi16(s[2], s[5]);
    1755           0 :   u[5] = _mm_unpackhi_epi16(s[2], s[5]);
    1756           0 :   u[6] = _mm_unpacklo_epi16(s[3], s[4]);
    1757           0 :   u[7] = _mm_unpackhi_epi16(s[3], s[4]);
    1758             : 
    1759           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
    1760           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
    1761           0 :   v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
    1762           0 :   v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
    1763           0 :   v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
    1764           0 :   v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
    1765           0 :   v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
    1766           0 :   v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
    1767           0 :   v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
    1768           0 :   v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
    1769           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
    1770           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
    1771           0 :   v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
    1772           0 :   v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
    1773           0 :   v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
    1774           0 :   v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
    1775             : 
    1776           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    1777           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    1778           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    1779           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    1780           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    1781           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    1782           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    1783           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    1784           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    1785           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    1786           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    1787           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    1788           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    1789           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    1790           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    1791           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    1792             : 
    1793           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    1794           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    1795           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    1796           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    1797           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    1798           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    1799           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    1800           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    1801           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    1802           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    1803           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    1804           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    1805           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    1806           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    1807           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    1808           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    1809             : 
    1810           0 :   in[1] = _mm_packs_epi32(v[0], v[1]);
    1811           0 :   in[9] = _mm_packs_epi32(v[2], v[3]);
    1812           0 :   in[5] = _mm_packs_epi32(v[4], v[5]);
    1813           0 :   in[13] = _mm_packs_epi32(v[6], v[7]);
    1814           0 :   in[3] = _mm_packs_epi32(v[8], v[9]);
    1815           0 :   in[11] = _mm_packs_epi32(v[10], v[11]);
    1816           0 :   in[7] = _mm_packs_epi32(v[12], v[13]);
    1817           0 :   in[15] = _mm_packs_epi32(v[14], v[15]);
    1818           0 : }
    1819             : 
    1820           0 : static void fadst16_8col(__m128i *in) {
    1821             :   // perform 16x16 1-D ADST for 8 columns
    1822             :   __m128i s[16], x[16], u[32], v[32];
    1823           0 :   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
    1824           0 :   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    1825           0 :   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
    1826           0 :   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    1827           0 :   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
    1828           0 :   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    1829           0 :   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
    1830           0 :   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    1831           0 :   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
    1832           0 :   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    1833           0 :   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    1834           0 :   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    1835           0 :   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
    1836           0 :   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    1837           0 :   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
    1838           0 :   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    1839           0 :   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
    1840           0 :   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    1841           0 :   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
    1842           0 :   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    1843           0 :   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
    1844           0 :   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
    1845           0 :   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
    1846           0 :   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    1847           0 :   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
    1848           0 :   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
    1849           0 :   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
    1850           0 :   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    1851           0 :   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    1852           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    1853           0 :   const __m128i kZero = _mm_set1_epi16(0);
    1854             : 
    1855           0 :   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
    1856           0 :   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
    1857           0 :   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
    1858           0 :   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
    1859           0 :   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
    1860           0 :   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
    1861           0 :   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
    1862           0 :   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
    1863           0 :   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
    1864           0 :   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
    1865           0 :   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
    1866           0 :   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
    1867           0 :   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
    1868           0 :   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
    1869           0 :   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
    1870           0 :   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
    1871             : 
    1872           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
    1873           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
    1874           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
    1875           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
    1876           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
    1877           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
    1878           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
    1879           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
    1880           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
    1881           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
    1882           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
    1883           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
    1884           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
    1885           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
    1886           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
    1887           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
    1888           0 :   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
    1889           0 :   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
    1890           0 :   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
    1891           0 :   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
    1892           0 :   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
    1893           0 :   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
    1894           0 :   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
    1895           0 :   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
    1896           0 :   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
    1897           0 :   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
    1898           0 :   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
    1899           0 :   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
    1900           0 :   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
    1901           0 :   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
    1902           0 :   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
    1903           0 :   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
    1904             : 
    1905           0 :   u[0] = _mm_add_epi32(v[0], v[16]);
    1906           0 :   u[1] = _mm_add_epi32(v[1], v[17]);
    1907           0 :   u[2] = _mm_add_epi32(v[2], v[18]);
    1908           0 :   u[3] = _mm_add_epi32(v[3], v[19]);
    1909           0 :   u[4] = _mm_add_epi32(v[4], v[20]);
    1910           0 :   u[5] = _mm_add_epi32(v[5], v[21]);
    1911           0 :   u[6] = _mm_add_epi32(v[6], v[22]);
    1912           0 :   u[7] = _mm_add_epi32(v[7], v[23]);
    1913           0 :   u[8] = _mm_add_epi32(v[8], v[24]);
    1914           0 :   u[9] = _mm_add_epi32(v[9], v[25]);
    1915           0 :   u[10] = _mm_add_epi32(v[10], v[26]);
    1916           0 :   u[11] = _mm_add_epi32(v[11], v[27]);
    1917           0 :   u[12] = _mm_add_epi32(v[12], v[28]);
    1918           0 :   u[13] = _mm_add_epi32(v[13], v[29]);
    1919           0 :   u[14] = _mm_add_epi32(v[14], v[30]);
    1920           0 :   u[15] = _mm_add_epi32(v[15], v[31]);
    1921           0 :   u[16] = _mm_sub_epi32(v[0], v[16]);
    1922           0 :   u[17] = _mm_sub_epi32(v[1], v[17]);
    1923           0 :   u[18] = _mm_sub_epi32(v[2], v[18]);
    1924           0 :   u[19] = _mm_sub_epi32(v[3], v[19]);
    1925           0 :   u[20] = _mm_sub_epi32(v[4], v[20]);
    1926           0 :   u[21] = _mm_sub_epi32(v[5], v[21]);
    1927           0 :   u[22] = _mm_sub_epi32(v[6], v[22]);
    1928           0 :   u[23] = _mm_sub_epi32(v[7], v[23]);
    1929           0 :   u[24] = _mm_sub_epi32(v[8], v[24]);
    1930           0 :   u[25] = _mm_sub_epi32(v[9], v[25]);
    1931           0 :   u[26] = _mm_sub_epi32(v[10], v[26]);
    1932           0 :   u[27] = _mm_sub_epi32(v[11], v[27]);
    1933           0 :   u[28] = _mm_sub_epi32(v[12], v[28]);
    1934           0 :   u[29] = _mm_sub_epi32(v[13], v[29]);
    1935           0 :   u[30] = _mm_sub_epi32(v[14], v[30]);
    1936           0 :   u[31] = _mm_sub_epi32(v[15], v[31]);
    1937             : 
    1938           0 :   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
    1939           0 :   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
    1940           0 :   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
    1941           0 :   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
    1942           0 :   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
    1943           0 :   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
    1944           0 :   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
    1945           0 :   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
    1946           0 :   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
    1947           0 :   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
    1948           0 :   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
    1949           0 :   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
    1950           0 :   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
    1951           0 :   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
    1952           0 :   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
    1953           0 :   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
    1954             : 
    1955           0 :   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
    1956           0 :   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
    1957           0 :   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
    1958           0 :   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
    1959           0 :   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
    1960           0 :   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
    1961           0 :   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
    1962           0 :   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
    1963           0 :   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
    1964           0 :   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
    1965           0 :   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
    1966           0 :   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
    1967           0 :   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
    1968           0 :   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
    1969           0 :   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
    1970           0 :   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
    1971             : 
    1972           0 :   v[0] = _mm_add_epi32(u[0], u[8]);
    1973           0 :   v[1] = _mm_add_epi32(u[1], u[9]);
    1974           0 :   v[2] = _mm_add_epi32(u[2], u[10]);
    1975           0 :   v[3] = _mm_add_epi32(u[3], u[11]);
    1976           0 :   v[4] = _mm_add_epi32(u[4], u[12]);
    1977           0 :   v[5] = _mm_add_epi32(u[5], u[13]);
    1978           0 :   v[6] = _mm_add_epi32(u[6], u[14]);
    1979           0 :   v[7] = _mm_add_epi32(u[7], u[15]);
    1980             : 
    1981           0 :   v[16] = _mm_add_epi32(v[0], v[4]);
    1982           0 :   v[17] = _mm_add_epi32(v[1], v[5]);
    1983           0 :   v[18] = _mm_add_epi32(v[2], v[6]);
    1984           0 :   v[19] = _mm_add_epi32(v[3], v[7]);
    1985           0 :   v[20] = _mm_sub_epi32(v[0], v[4]);
    1986           0 :   v[21] = _mm_sub_epi32(v[1], v[5]);
    1987           0 :   v[22] = _mm_sub_epi32(v[2], v[6]);
    1988           0 :   v[23] = _mm_sub_epi32(v[3], v[7]);
    1989           0 :   v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
    1990           0 :   v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
    1991           0 :   v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
    1992           0 :   v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
    1993           0 :   v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
    1994           0 :   v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
    1995           0 :   v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
    1996           0 :   v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
    1997           0 :   v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
    1998           0 :   v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
    1999           0 :   v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
    2000           0 :   v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
    2001           0 :   v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
    2002           0 :   v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
    2003           0 :   v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
    2004           0 :   v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
    2005           0 :   s[0] = _mm_packs_epi32(v[16], v[17]);
    2006           0 :   s[1] = _mm_packs_epi32(v[18], v[19]);
    2007           0 :   s[2] = _mm_packs_epi32(v[20], v[21]);
    2008           0 :   s[3] = _mm_packs_epi32(v[22], v[23]);
    2009             : 
    2010           0 :   v[8] = _mm_sub_epi32(u[0], u[8]);
    2011           0 :   v[9] = _mm_sub_epi32(u[1], u[9]);
    2012           0 :   v[10] = _mm_sub_epi32(u[2], u[10]);
    2013           0 :   v[11] = _mm_sub_epi32(u[3], u[11]);
    2014           0 :   v[12] = _mm_sub_epi32(u[4], u[12]);
    2015           0 :   v[13] = _mm_sub_epi32(u[5], u[13]);
    2016           0 :   v[14] = _mm_sub_epi32(u[6], u[14]);
    2017           0 :   v[15] = _mm_sub_epi32(u[7], u[15]);
    2018             : 
    2019           0 :   v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    2020           0 :   v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    2021           0 :   v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    2022           0 :   v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    2023           0 :   v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    2024           0 :   v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    2025           0 :   v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    2026           0 :   v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    2027             : 
    2028           0 :   v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    2029           0 :   v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    2030           0 :   v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    2031           0 :   v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    2032           0 :   v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    2033           0 :   v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    2034           0 :   v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    2035           0 :   v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    2036             : 
    2037           0 :   s[4] = _mm_packs_epi32(v[8], v[9]);
    2038           0 :   s[5] = _mm_packs_epi32(v[10], v[11]);
    2039           0 :   s[6] = _mm_packs_epi32(v[12], v[13]);
    2040           0 :   s[7] = _mm_packs_epi32(v[14], v[15]);
    2041             :   //
    2042             : 
    2043           0 :   s[8] = _mm_packs_epi32(u[16], u[17]);
    2044           0 :   s[9] = _mm_packs_epi32(u[18], u[19]);
    2045           0 :   s[10] = _mm_packs_epi32(u[20], u[21]);
    2046           0 :   s[11] = _mm_packs_epi32(u[22], u[23]);
    2047           0 :   s[12] = _mm_packs_epi32(u[24], u[25]);
    2048           0 :   s[13] = _mm_packs_epi32(u[26], u[27]);
    2049           0 :   s[14] = _mm_packs_epi32(u[28], u[29]);
    2050           0 :   s[15] = _mm_packs_epi32(u[30], u[31]);
    2051             : 
    2052             :   // stage 2
    2053           0 :   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
    2054           0 :   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
    2055           0 :   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
    2056           0 :   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
    2057           0 :   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
    2058           0 :   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
    2059           0 :   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    2060           0 :   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
    2061             : 
    2062           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
    2063           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
    2064           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
    2065           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
    2066           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
    2067           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
    2068           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
    2069           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
    2070           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
    2071           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
    2072           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
    2073           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
    2074           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
    2075           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
    2076           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
    2077           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
    2078             : 
    2079           0 :   u[0] = _mm_add_epi32(v[0], v[8]);
    2080           0 :   u[1] = _mm_add_epi32(v[1], v[9]);
    2081           0 :   u[2] = _mm_add_epi32(v[2], v[10]);
    2082           0 :   u[3] = _mm_add_epi32(v[3], v[11]);
    2083           0 :   u[4] = _mm_add_epi32(v[4], v[12]);
    2084           0 :   u[5] = _mm_add_epi32(v[5], v[13]);
    2085           0 :   u[6] = _mm_add_epi32(v[6], v[14]);
    2086           0 :   u[7] = _mm_add_epi32(v[7], v[15]);
    2087           0 :   u[8] = _mm_sub_epi32(v[0], v[8]);
    2088           0 :   u[9] = _mm_sub_epi32(v[1], v[9]);
    2089           0 :   u[10] = _mm_sub_epi32(v[2], v[10]);
    2090           0 :   u[11] = _mm_sub_epi32(v[3], v[11]);
    2091           0 :   u[12] = _mm_sub_epi32(v[4], v[12]);
    2092           0 :   u[13] = _mm_sub_epi32(v[5], v[13]);
    2093           0 :   u[14] = _mm_sub_epi32(v[6], v[14]);
    2094           0 :   u[15] = _mm_sub_epi32(v[7], v[15]);
    2095             : 
    2096           0 :   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    2097           0 :   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    2098           0 :   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    2099           0 :   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    2100           0 :   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    2101           0 :   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    2102           0 :   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    2103           0 :   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    2104             : 
    2105           0 :   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    2106           0 :   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    2107           0 :   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    2108           0 :   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    2109           0 :   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    2110           0 :   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    2111           0 :   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    2112           0 :   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    2113             : 
    2114           0 :   v[8] = _mm_add_epi32(u[0], u[4]);
    2115           0 :   v[9] = _mm_add_epi32(u[1], u[5]);
    2116           0 :   v[10] = _mm_add_epi32(u[2], u[6]);
    2117           0 :   v[11] = _mm_add_epi32(u[3], u[7]);
    2118           0 :   v[12] = _mm_sub_epi32(u[0], u[4]);
    2119           0 :   v[13] = _mm_sub_epi32(u[1], u[5]);
    2120           0 :   v[14] = _mm_sub_epi32(u[2], u[6]);
    2121           0 :   v[15] = _mm_sub_epi32(u[3], u[7]);
    2122             : 
    2123           0 :   v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    2124           0 :   v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    2125           0 :   v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    2126           0 :   v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    2127           0 :   v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    2128           0 :   v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    2129           0 :   v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    2130           0 :   v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    2131           0 :   v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
    2132           0 :   v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
    2133           0 :   v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
    2134           0 :   v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
    2135           0 :   v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
    2136           0 :   v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
    2137           0 :   v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
    2138           0 :   v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
    2139           0 :   s[8] = _mm_packs_epi32(v[8], v[9]);
    2140           0 :   s[9] = _mm_packs_epi32(v[10], v[11]);
    2141           0 :   s[10] = _mm_packs_epi32(v[12], v[13]);
    2142           0 :   s[11] = _mm_packs_epi32(v[14], v[15]);
    2143             : 
    2144           0 :   x[12] = _mm_packs_epi32(u[8], u[9]);
    2145           0 :   x[13] = _mm_packs_epi32(u[10], u[11]);
    2146           0 :   x[14] = _mm_packs_epi32(u[12], u[13]);
    2147           0 :   x[15] = _mm_packs_epi32(u[14], u[15]);
    2148             : 
    2149             :   // stage 3
    2150           0 :   u[0] = _mm_unpacklo_epi16(s[4], s[5]);
    2151           0 :   u[1] = _mm_unpackhi_epi16(s[4], s[5]);
    2152           0 :   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
    2153           0 :   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
    2154           0 :   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
    2155           0 :   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
    2156           0 :   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
    2157           0 :   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
    2158             : 
    2159           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
    2160           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
    2161           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
    2162           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
    2163           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
    2164           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
    2165           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
    2166           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
    2167           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
    2168           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
    2169           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
    2170           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
    2171           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
    2172           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
    2173           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
    2174           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
    2175             : 
    2176           0 :   u[0] = _mm_add_epi32(v[0], v[4]);
    2177           0 :   u[1] = _mm_add_epi32(v[1], v[5]);
    2178           0 :   u[2] = _mm_add_epi32(v[2], v[6]);
    2179           0 :   u[3] = _mm_add_epi32(v[3], v[7]);
    2180           0 :   u[4] = _mm_sub_epi32(v[0], v[4]);
    2181           0 :   u[5] = _mm_sub_epi32(v[1], v[5]);
    2182           0 :   u[6] = _mm_sub_epi32(v[2], v[6]);
    2183           0 :   u[7] = _mm_sub_epi32(v[3], v[7]);
    2184           0 :   u[8] = _mm_add_epi32(v[8], v[12]);
    2185           0 :   u[9] = _mm_add_epi32(v[9], v[13]);
    2186           0 :   u[10] = _mm_add_epi32(v[10], v[14]);
    2187           0 :   u[11] = _mm_add_epi32(v[11], v[15]);
    2188           0 :   u[12] = _mm_sub_epi32(v[8], v[12]);
    2189           0 :   u[13] = _mm_sub_epi32(v[9], v[13]);
    2190           0 :   u[14] = _mm_sub_epi32(v[10], v[14]);
    2191           0 :   u[15] = _mm_sub_epi32(v[11], v[15]);
    2192             : 
    2193           0 :   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
    2194           0 :   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
    2195           0 :   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
    2196           0 :   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
    2197           0 :   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
    2198           0 :   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
    2199           0 :   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
    2200           0 :   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
    2201           0 :   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
    2202           0 :   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
    2203           0 :   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
    2204           0 :   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
    2205           0 :   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
    2206           0 :   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
    2207           0 :   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
    2208           0 :   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
    2209             : 
    2210           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    2211           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    2212           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    2213           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    2214           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    2215           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    2216           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    2217           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    2218           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    2219           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    2220           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    2221           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    2222           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    2223           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    2224           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    2225           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    2226             : 
    2227           0 :   s[4] = _mm_packs_epi32(v[0], v[1]);
    2228           0 :   s[5] = _mm_packs_epi32(v[2], v[3]);
    2229           0 :   s[6] = _mm_packs_epi32(v[4], v[5]);
    2230           0 :   s[7] = _mm_packs_epi32(v[6], v[7]);
    2231             : 
    2232           0 :   s[12] = _mm_packs_epi32(v[8], v[9]);
    2233           0 :   s[13] = _mm_packs_epi32(v[10], v[11]);
    2234           0 :   s[14] = _mm_packs_epi32(v[12], v[13]);
    2235           0 :   s[15] = _mm_packs_epi32(v[14], v[15]);
    2236             : 
    2237             :   // stage 4
    2238           0 :   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
    2239           0 :   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
    2240           0 :   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
    2241           0 :   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
    2242           0 :   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
    2243           0 :   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
    2244           0 :   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    2245           0 :   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
    2246             : 
    2247           0 :   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
    2248           0 :   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
    2249           0 :   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
    2250           0 :   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
    2251           0 :   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    2252           0 :   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
    2253           0 :   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    2254           0 :   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    2255           0 :   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
    2256           0 :   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
    2257           0 :   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
    2258           0 :   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
    2259           0 :   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
    2260           0 :   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
    2261           0 :   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
    2262           0 :   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
    2263             : 
    2264           0 :   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    2265           0 :   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    2266           0 :   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    2267           0 :   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    2268           0 :   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    2269           0 :   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    2270           0 :   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    2271           0 :   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
    2272           0 :   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
    2273           0 :   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
    2274           0 :   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
    2275           0 :   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
    2276           0 :   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
    2277           0 :   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
    2278           0 :   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
    2279           0 :   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
    2280             : 
    2281           0 :   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    2282           0 :   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    2283           0 :   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    2284           0 :   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    2285           0 :   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    2286           0 :   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    2287           0 :   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    2288           0 :   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
    2289           0 :   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
    2290           0 :   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
    2291           0 :   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
    2292           0 :   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
    2293           0 :   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
    2294           0 :   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
    2295           0 :   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
    2296           0 :   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
    2297             : 
    2298           0 :   in[0] = s[0];
    2299           0 :   in[1] = _mm_sub_epi16(kZero, s[8]);
    2300           0 :   in[2] = s[12];
    2301           0 :   in[3] = _mm_sub_epi16(kZero, s[4]);
    2302           0 :   in[4] = _mm_packs_epi32(v[4], v[5]);
    2303           0 :   in[5] = _mm_packs_epi32(v[12], v[13]);
    2304           0 :   in[6] = _mm_packs_epi32(v[8], v[9]);
    2305           0 :   in[7] = _mm_packs_epi32(v[0], v[1]);
    2306           0 :   in[8] = _mm_packs_epi32(v[2], v[3]);
    2307           0 :   in[9] = _mm_packs_epi32(v[10], v[11]);
    2308           0 :   in[10] = _mm_packs_epi32(v[14], v[15]);
    2309           0 :   in[11] = _mm_packs_epi32(v[6], v[7]);
    2310           0 :   in[12] = s[5];
    2311           0 :   in[13] = _mm_sub_epi16(kZero, s[13]);
    2312           0 :   in[14] = s[9];
    2313           0 :   in[15] = _mm_sub_epi16(kZero, s[1]);
    2314           0 : }
    2315             : 
    2316           0 : static void fdct16_sse2(__m128i *in0, __m128i *in1) {
    2317           0 :   fdct16_8col(in0);
    2318           0 :   fdct16_8col(in1);
    2319           0 :   array_transpose_16x16(in0, in1);
    2320           0 : }
    2321             : 
    2322           0 : static void fadst16_sse2(__m128i *in0, __m128i *in1) {
    2323           0 :   fadst16_8col(in0);
    2324           0 :   fadst16_8col(in1);
    2325           0 :   array_transpose_16x16(in0, in1);
    2326           0 : }
    2327             : 
    2328             : #if CONFIG_EXT_TX
    2329           0 : static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
    2330           0 :   idtx16_8col(in0);
    2331           0 :   idtx16_8col(in1);
    2332           0 :   array_transpose_16x16(in0, in1);
    2333           0 : }
    2334             : #endif  // CONFIG_EXT_TX
    2335             : 
    2336           0 : void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
    2337             :                        int tx_type) {
    2338             :   __m128i in0[16], in1[16];
    2339             : 
    2340           0 :   switch (tx_type) {
    2341             :     case DCT_DCT:
    2342           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2343           0 :       fdct16_sse2(in0, in1);
    2344           0 :       right_shift_16x16(in0, in1);
    2345           0 :       fdct16_sse2(in0, in1);
    2346           0 :       write_buffer_16x16(output, in0, in1, 16);
    2347           0 :       break;
    2348             :     case ADST_DCT:
    2349           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2350           0 :       fadst16_sse2(in0, in1);
    2351           0 :       right_shift_16x16(in0, in1);
    2352           0 :       fdct16_sse2(in0, in1);
    2353           0 :       write_buffer_16x16(output, in0, in1, 16);
    2354           0 :       break;
    2355             :     case DCT_ADST:
    2356           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2357           0 :       fdct16_sse2(in0, in1);
    2358           0 :       right_shift_16x16(in0, in1);
    2359           0 :       fadst16_sse2(in0, in1);
    2360           0 :       write_buffer_16x16(output, in0, in1, 16);
    2361           0 :       break;
    2362             :     case ADST_ADST:
    2363           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2364           0 :       fadst16_sse2(in0, in1);
    2365           0 :       right_shift_16x16(in0, in1);
    2366           0 :       fadst16_sse2(in0, in1);
    2367           0 :       write_buffer_16x16(output, in0, in1, 16);
    2368           0 :       break;
    2369             : #if CONFIG_EXT_TX
    2370             :     case FLIPADST_DCT:
    2371           0 :       load_buffer_16x16(input, in0, in1, stride, 1, 0);
    2372           0 :       fadst16_sse2(in0, in1);
    2373           0 :       right_shift_16x16(in0, in1);
    2374           0 :       fdct16_sse2(in0, in1);
    2375           0 :       write_buffer_16x16(output, in0, in1, 16);
    2376           0 :       break;
    2377             :     case DCT_FLIPADST:
    2378           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 1);
    2379           0 :       fdct16_sse2(in0, in1);
    2380           0 :       right_shift_16x16(in0, in1);
    2381           0 :       fadst16_sse2(in0, in1);
    2382           0 :       write_buffer_16x16(output, in0, in1, 16);
    2383           0 :       break;
    2384             :     case FLIPADST_FLIPADST:
    2385           0 :       load_buffer_16x16(input, in0, in1, stride, 1, 1);
    2386           0 :       fadst16_sse2(in0, in1);
    2387           0 :       right_shift_16x16(in0, in1);
    2388           0 :       fadst16_sse2(in0, in1);
    2389           0 :       write_buffer_16x16(output, in0, in1, 16);
    2390           0 :       break;
    2391             :     case ADST_FLIPADST:
    2392           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 1);
    2393           0 :       fadst16_sse2(in0, in1);
    2394           0 :       right_shift_16x16(in0, in1);
    2395           0 :       fadst16_sse2(in0, in1);
    2396           0 :       write_buffer_16x16(output, in0, in1, 16);
    2397           0 :       break;
    2398             :     case FLIPADST_ADST:
    2399           0 :       load_buffer_16x16(input, in0, in1, stride, 1, 0);
    2400           0 :       fadst16_sse2(in0, in1);
    2401           0 :       right_shift_16x16(in0, in1);
    2402           0 :       fadst16_sse2(in0, in1);
    2403           0 :       write_buffer_16x16(output, in0, in1, 16);
    2404           0 :       break;
    2405             :     case IDTX:
    2406           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2407           0 :       fidtx16_sse2(in0, in1);
    2408           0 :       right_shift_16x16(in0, in1);
    2409           0 :       fidtx16_sse2(in0, in1);
    2410           0 :       write_buffer_16x16(output, in0, in1, 16);
    2411           0 :       break;
    2412             :     case V_DCT:
    2413           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2414           0 :       fdct16_sse2(in0, in1);
    2415           0 :       right_shift_16x16(in0, in1);
    2416           0 :       fidtx16_sse2(in0, in1);
    2417           0 :       write_buffer_16x16(output, in0, in1, 16);
    2418           0 :       break;
    2419             :     case H_DCT:
    2420           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2421           0 :       fidtx16_sse2(in0, in1);
    2422           0 :       right_shift_16x16(in0, in1);
    2423           0 :       fdct16_sse2(in0, in1);
    2424           0 :       write_buffer_16x16(output, in0, in1, 16);
    2425           0 :       break;
    2426             :     case V_ADST:
    2427           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2428           0 :       fadst16_sse2(in0, in1);
    2429           0 :       right_shift_16x16(in0, in1);
    2430           0 :       fidtx16_sse2(in0, in1);
    2431           0 :       write_buffer_16x16(output, in0, in1, 16);
    2432           0 :       break;
    2433             :     case H_ADST:
    2434           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 0);
    2435           0 :       fidtx16_sse2(in0, in1);
    2436           0 :       right_shift_16x16(in0, in1);
    2437           0 :       fadst16_sse2(in0, in1);
    2438           0 :       write_buffer_16x16(output, in0, in1, 16);
    2439           0 :       break;
    2440             :     case V_FLIPADST:
    2441           0 :       load_buffer_16x16(input, in0, in1, stride, 1, 0);
    2442           0 :       fadst16_sse2(in0, in1);
    2443           0 :       right_shift_16x16(in0, in1);
    2444           0 :       fidtx16_sse2(in0, in1);
    2445           0 :       write_buffer_16x16(output, in0, in1, 16);
    2446           0 :       break;
    2447             :     case H_FLIPADST:
    2448           0 :       load_buffer_16x16(input, in0, in1, stride, 0, 1);
    2449           0 :       fidtx16_sse2(in0, in1);
    2450           0 :       right_shift_16x16(in0, in1);
    2451           0 :       fadst16_sse2(in0, in1);
    2452           0 :       write_buffer_16x16(output, in0, in1, 16);
    2453           0 :       break;
    2454             : #endif  // CONFIG_EXT_TX
    2455           0 :     default: assert(0); break;
    2456             :   }
    2457           0 : }
    2458             : 
    2459           0 : static INLINE void prepare_4x8_row_first(__m128i *in) {
    2460           0 :   in[0] = _mm_unpacklo_epi64(in[0], in[2]);
    2461           0 :   in[1] = _mm_unpacklo_epi64(in[1], in[3]);
    2462           0 :   transpose_4x4(in);
    2463           0 :   in[4] = _mm_unpacklo_epi64(in[4], in[6]);
    2464           0 :   in[5] = _mm_unpacklo_epi64(in[5], in[7]);
    2465           0 :   transpose_4x4(in + 4);
    2466           0 : }
    2467             : 
    2468             : // Load input into the left-hand half of in (ie, into lanes 0..3 of
    2469             : // each element of in). The right hand half (lanes 4..7) should be
    2470             : // treated as being filled with "don't care" values.
    2471           0 : static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
    2472             :                                    int stride, int flipud, int fliplr) {
    2473           0 :   const int shift = 2;
    2474           0 :   if (!flipud) {
    2475           0 :     in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    2476           0 :     in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    2477           0 :     in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    2478           0 :     in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    2479           0 :     in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
    2480           0 :     in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
    2481           0 :     in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
    2482           0 :     in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
    2483             :   } else {
    2484           0 :     in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
    2485           0 :     in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
    2486           0 :     in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
    2487           0 :     in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
    2488           0 :     in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    2489           0 :     in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    2490           0 :     in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    2491           0 :     in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    2492             :   }
    2493             : 
    2494           0 :   if (fliplr) {
    2495           0 :     in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
    2496           0 :     in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
    2497           0 :     in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
    2498           0 :     in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
    2499           0 :     in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
    2500           0 :     in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
    2501           0 :     in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
    2502           0 :     in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
    2503             :   }
    2504             : 
    2505           0 :   in[0] = _mm_slli_epi16(in[0], shift);
    2506           0 :   in[1] = _mm_slli_epi16(in[1], shift);
    2507           0 :   in[2] = _mm_slli_epi16(in[2], shift);
    2508           0 :   in[3] = _mm_slli_epi16(in[3], shift);
    2509           0 :   in[4] = _mm_slli_epi16(in[4], shift);
    2510           0 :   in[5] = _mm_slli_epi16(in[5], shift);
    2511           0 :   in[6] = _mm_slli_epi16(in[6], shift);
    2512           0 :   in[7] = _mm_slli_epi16(in[7], shift);
    2513             : 
    2514           0 :   scale_sqrt2_8x4(in);
    2515           0 :   scale_sqrt2_8x4(in + 4);
    2516           0 :   prepare_4x8_row_first(in);
    2517           0 : }
    2518             : 
    2519           0 : static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
    2520             :   __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67;
    2521           0 :   const int shift = 1;
    2522             : 
    2523             :   // revert the 8x8 txfm's transpose
    2524           0 :   array_transpose_8x8(res, res);
    2525             : 
    2526           0 :   in01 = _mm_unpacklo_epi64(res[0], res[1]);
    2527           0 :   in23 = _mm_unpacklo_epi64(res[2], res[3]);
    2528           0 :   in45 = _mm_unpacklo_epi64(res[4], res[5]);
    2529           0 :   in67 = _mm_unpacklo_epi64(res[6], res[7]);
    2530             : 
    2531           0 :   sign01 = _mm_srai_epi16(in01, 15);
    2532           0 :   sign23 = _mm_srai_epi16(in23, 15);
    2533           0 :   sign45 = _mm_srai_epi16(in45, 15);
    2534           0 :   sign67 = _mm_srai_epi16(in67, 15);
    2535             : 
    2536           0 :   in01 = _mm_sub_epi16(in01, sign01);
    2537           0 :   in23 = _mm_sub_epi16(in23, sign23);
    2538           0 :   in45 = _mm_sub_epi16(in45, sign45);
    2539           0 :   in67 = _mm_sub_epi16(in67, sign67);
    2540             : 
    2541           0 :   in01 = _mm_srai_epi16(in01, shift);
    2542           0 :   in23 = _mm_srai_epi16(in23, shift);
    2543           0 :   in45 = _mm_srai_epi16(in45, shift);
    2544           0 :   in67 = _mm_srai_epi16(in67, shift);
    2545             : 
    2546           0 :   store_output(&in01, (output + 0 * 8));
    2547           0 :   store_output(&in23, (output + 1 * 8));
    2548           0 :   store_output(&in45, (output + 2 * 8));
    2549           0 :   store_output(&in67, (output + 3 * 8));
    2550           0 : }
    2551             : 
    2552           0 : void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
    2553             :                      int tx_type) {
    2554             :   __m128i in[8];
    2555             : 
    2556           0 :   switch (tx_type) {
    2557             :     case DCT_DCT:
    2558           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2559           0 :       fdct4_sse2(in);
    2560           0 :       fdct4_sse2(in + 4);
    2561           0 :       fdct8_sse2(in);
    2562           0 :       break;
    2563             :     case ADST_DCT:
    2564           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2565           0 :       fdct4_sse2(in);
    2566           0 :       fdct4_sse2(in + 4);
    2567           0 :       fadst8_sse2(in);
    2568           0 :       break;
    2569             :     case DCT_ADST:
    2570           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2571           0 :       fadst4_sse2(in);
    2572           0 :       fadst4_sse2(in + 4);
    2573           0 :       fdct8_sse2(in);
    2574           0 :       break;
    2575             :     case ADST_ADST:
    2576           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2577           0 :       fadst4_sse2(in);
    2578           0 :       fadst4_sse2(in + 4);
    2579           0 :       fadst8_sse2(in);
    2580           0 :       break;
    2581             : #if CONFIG_EXT_TX
    2582             :     case FLIPADST_DCT:
    2583           0 :       load_buffer_4x8(input, in, stride, 1, 0);
    2584           0 :       fdct4_sse2(in);
    2585           0 :       fdct4_sse2(in + 4);
    2586           0 :       fadst8_sse2(in);
    2587           0 :       break;
    2588             :     case DCT_FLIPADST:
    2589           0 :       load_buffer_4x8(input, in, stride, 0, 1);
    2590           0 :       fadst4_sse2(in);
    2591           0 :       fadst4_sse2(in + 4);
    2592           0 :       fdct8_sse2(in);
    2593           0 :       break;
    2594             :     case FLIPADST_FLIPADST:
    2595           0 :       load_buffer_4x8(input, in, stride, 1, 1);
    2596           0 :       fadst4_sse2(in);
    2597           0 :       fadst4_sse2(in + 4);
    2598           0 :       fadst8_sse2(in);
    2599           0 :       break;
    2600             :     case ADST_FLIPADST:
    2601           0 :       load_buffer_4x8(input, in, stride, 0, 1);
    2602           0 :       fadst4_sse2(in);
    2603           0 :       fadst4_sse2(in + 4);
    2604           0 :       fadst8_sse2(in);
    2605           0 :       break;
    2606             :     case FLIPADST_ADST:
    2607           0 :       load_buffer_4x8(input, in, stride, 1, 0);
    2608           0 :       fadst4_sse2(in);
    2609           0 :       fadst4_sse2(in + 4);
    2610           0 :       fadst8_sse2(in);
    2611           0 :       break;
    2612             :     case IDTX:
    2613           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2614           0 :       fidtx4_sse2(in);
    2615           0 :       fidtx4_sse2(in + 4);
    2616           0 :       fidtx8_sse2(in);
    2617           0 :       break;
    2618             :     case V_DCT:
    2619           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2620           0 :       fidtx4_sse2(in);
    2621           0 :       fidtx4_sse2(in + 4);
    2622           0 :       fdct8_sse2(in);
    2623           0 :       break;
    2624             :     case H_DCT:
    2625           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2626           0 :       fdct4_sse2(in);
    2627           0 :       fdct4_sse2(in + 4);
    2628           0 :       fidtx8_sse2(in);
    2629           0 :       break;
    2630             :     case V_ADST:
    2631           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2632           0 :       fidtx4_sse2(in);
    2633           0 :       fidtx4_sse2(in + 4);
    2634           0 :       fadst8_sse2(in);
    2635           0 :       break;
    2636             :     case H_ADST:
    2637           0 :       load_buffer_4x8(input, in, stride, 0, 0);
    2638           0 :       fadst4_sse2(in);
    2639           0 :       fadst4_sse2(in + 4);
    2640           0 :       fidtx8_sse2(in);
    2641           0 :       break;
    2642             :     case V_FLIPADST:
    2643           0 :       load_buffer_4x8(input, in, stride, 1, 0);
    2644           0 :       fidtx4_sse2(in);
    2645           0 :       fidtx4_sse2(in + 4);
    2646           0 :       fadst8_sse2(in);
    2647           0 :       break;
    2648             :     case H_FLIPADST:
    2649           0 :       load_buffer_4x8(input, in, stride, 0, 1);
    2650           0 :       fadst4_sse2(in);
    2651           0 :       fadst4_sse2(in + 4);
    2652           0 :       fidtx8_sse2(in);
    2653           0 :       break;
    2654             : #endif
    2655           0 :     default: assert(0); break;
    2656             :   }
    2657           0 :   write_buffer_4x8(output, in);
    2658           0 : }
    2659             : 
    2660             : // Load input into the left-hand half of in (ie, into lanes 0..3 of
    2661             : // each element of in). The right hand half (lanes 4..7) should be
    2662             : // treated as being filled with "don't care" values.
    2663             : // The input is split horizontally into two 4x4
    2664             : // chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4
    2665             : // block of 'in' and 'r' is stored in the bottom-left block.
    2666             : // This is to allow us to reuse 4x4 transforms.
    2667           0 : static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
    2668             :                                    int stride, int flipud, int fliplr) {
    2669           0 :   const int shift = 2;
    2670           0 :   if (!flipud) {
    2671           0 :     in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
    2672           0 :     in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
    2673           0 :     in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
    2674           0 :     in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
    2675             :   } else {
    2676           0 :     in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
    2677           0 :     in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
    2678           0 :     in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
    2679           0 :     in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
    2680             :   }
    2681             : 
    2682           0 :   if (fliplr) {
    2683           0 :     in[0] = mm_reverse_epi16(in[0]);
    2684           0 :     in[1] = mm_reverse_epi16(in[1]);
    2685           0 :     in[2] = mm_reverse_epi16(in[2]);
    2686           0 :     in[3] = mm_reverse_epi16(in[3]);
    2687             :   }
    2688             : 
    2689           0 :   in[0] = _mm_slli_epi16(in[0], shift);
    2690           0 :   in[1] = _mm_slli_epi16(in[1], shift);
    2691           0 :   in[2] = _mm_slli_epi16(in[2], shift);
    2692           0 :   in[3] = _mm_slli_epi16(in[3], shift);
    2693             : 
    2694           0 :   scale_sqrt2_8x4(in);
    2695             : 
    2696           0 :   in[4] = _mm_shuffle_epi32(in[0], 0xe);
    2697           0 :   in[5] = _mm_shuffle_epi32(in[1], 0xe);
    2698           0 :   in[6] = _mm_shuffle_epi32(in[2], 0xe);
    2699           0 :   in[7] = _mm_shuffle_epi32(in[3], 0xe);
    2700           0 : }
    2701             : 
    2702           0 : static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
    2703             :   __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3;
    2704           0 :   const int shift = 1;
    2705           0 :   sign0 = _mm_srai_epi16(res[0], 15);
    2706           0 :   sign1 = _mm_srai_epi16(res[1], 15);
    2707           0 :   sign2 = _mm_srai_epi16(res[2], 15);
    2708           0 :   sign3 = _mm_srai_epi16(res[3], 15);
    2709             : 
    2710           0 :   out0 = _mm_sub_epi16(res[0], sign0);
    2711           0 :   out1 = _mm_sub_epi16(res[1], sign1);
    2712           0 :   out2 = _mm_sub_epi16(res[2], sign2);
    2713           0 :   out3 = _mm_sub_epi16(res[3], sign3);
    2714             : 
    2715           0 :   out0 = _mm_srai_epi16(out0, shift);
    2716           0 :   out1 = _mm_srai_epi16(out1, shift);
    2717           0 :   out2 = _mm_srai_epi16(out2, shift);
    2718           0 :   out3 = _mm_srai_epi16(out3, shift);
    2719             : 
    2720           0 :   store_output(&out0, (output + 0 * 8));
    2721           0 :   store_output(&out1, (output + 1 * 8));
    2722           0 :   store_output(&out2, (output + 2 * 8));
    2723           0 :   store_output(&out3, (output + 3 * 8));
    2724           0 : }
    2725             : 
// Forward hybrid transform for an 8-wide x 4-tall block.  The first
// component of tx_type selects the vertical (column, 4-point) transform and
// the second the horizontal (row, 8-point) transform; FLIPADST variants are
// realised by flipping the input at load time (flipud/fliplr arguments of
// load_buffer_8x4) and then applying a plain ADST.
void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  // in[0..3] / in[4..7] hold the left / right 4x4 chunks of the input
  // (see load_buffer_8x4), so each 4-point transform is applied twice.
  __m128i in[8];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x4(input, in, stride, 0, 0);
      // 4-point column transform on each 4x4 chunk ...
      fdct4_sse2(in);
      fdct4_sse2(in + 4);
      // ... followed by the 8-point row transform.
      fdct8_sse2(in);
      break;
    case ADST_DCT:
      load_buffer_8x4(input, in, stride, 0, 0);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fdct8_sse2(in);
      break;
    case DCT_ADST:
      load_buffer_8x4(input, in, stride, 0, 0);
      fdct4_sse2(in);
      fdct4_sse2(in + 4);
      fadst8_sse2(in);
      break;
    case ADST_ADST:
      load_buffer_8x4(input, in, stride, 0, 0);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fadst8_sse2(in);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_8x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fdct8_sse2(in);
      break;
    case DCT_FLIPADST:
      load_buffer_8x4(input, in, stride, 0, 1);
      fdct4_sse2(in);
      fdct4_sse2(in + 4);
      fadst8_sse2(in);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x4(input, in, stride, 1, 1);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fadst8_sse2(in);
      break;
    case ADST_FLIPADST:
      load_buffer_8x4(input, in, stride, 0, 1);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fadst8_sse2(in);
      break;
    case FLIPADST_ADST:
      load_buffer_8x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fadst8_sse2(in);
      break;
    case IDTX:
      load_buffer_8x4(input, in, stride, 0, 0);
      fidtx4_sse2(in);
      fidtx4_sse2(in + 4);
      fidtx8_sse2(in);
      break;
    case V_DCT:
      // 1-D vertical transform: 4-point DCT on columns, identity on rows.
      load_buffer_8x4(input, in, stride, 0, 0);
      fdct4_sse2(in);
      fdct4_sse2(in + 4);
      fidtx8_sse2(in);
      break;
    case H_DCT:
      // 1-D horizontal transform: identity on columns, 8-point DCT on rows.
      load_buffer_8x4(input, in, stride, 0, 0);
      fidtx4_sse2(in);
      fidtx4_sse2(in + 4);
      fdct8_sse2(in);
      break;
    case V_ADST:
      load_buffer_8x4(input, in, stride, 0, 0);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fidtx8_sse2(in);
      break;
    case H_ADST:
      load_buffer_8x4(input, in, stride, 0, 0);
      fidtx4_sse2(in);
      fidtx4_sse2(in + 4);
      fadst8_sse2(in);
      break;
    case V_FLIPADST:
      load_buffer_8x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      fidtx8_sse2(in);
      break;
    case H_FLIPADST:
      load_buffer_8x4(input, in, stride, 0, 1);
      fidtx4_sse2(in);
      fidtx4_sse2(in + 4);
      fadst8_sse2(in);
      break;
#endif
    default: assert(0); break;
  }
  // Final round (divide by 2) and store of the 8x4 coefficient block.
  write_buffer_8x4(output, in);
}
    2833             : 
    2834           0 : static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
    2835             :                                     int stride, int flipud, int fliplr) {
    2836             :   // Load 2 8x8 blocks
    2837           0 :   const int16_t *t = input;
    2838           0 :   const int16_t *b = input + 8 * stride;
    2839             : 
    2840           0 :   if (flipud) {
    2841           0 :     const int16_t *const tmp = t;
    2842           0 :     t = b;
    2843           0 :     b = tmp;
    2844             :   }
    2845             : 
    2846           0 :   load_buffer_8x8(t, in, stride, flipud, fliplr);
    2847           0 :   scale_sqrt2_8x8(in);
    2848           0 :   load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
    2849           0 :   scale_sqrt2_8x8(in + 8);
    2850           0 : }
    2851             : 
    2852           0 : static INLINE void round_power_of_two_signed(__m128i *x, int n) {
    2853           0 :   const __m128i rounding = _mm_set1_epi16((1 << n) >> 1);
    2854           0 :   const __m128i sign = _mm_srai_epi16(*x, 15);
    2855           0 :   const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign);
    2856           0 :   *x = _mm_srai_epi16(res, n);
    2857           0 : }
    2858             : 
    2859           0 : static void row_8x16_rounding(__m128i *in, int bits) {
    2860             :   int i;
    2861           0 :   for (i = 0; i < 16; i++) {
    2862           0 :     round_power_of_two_signed(&in[i], bits);
    2863             :   }
    2864           0 : }
    2865             : 
// Forward hybrid transform for an 8-wide x 16-tall block.  The first
// component of tx_type selects the vertical (column, 16-point) transform,
// the second the horizontal (row, 8-point) transform.  Each 8x8 half is
// transposed and run through an 8-point transform (rows), the intermediate
// result is rounded by 2 bits, and the 16-point *_8col transform then
// handles the columns.  FLIPADST variants flip the input at load time and
// apply a plain ADST.
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];

  __m128i *const t = in;      // Alias to top 8x8 sub block
  __m128i *const b = in + 8;  // Alias to bottom 8x8 sub block

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      // Transpose each half so the 8-point transform works on the rows ...
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      // ... round the intermediate down 2 bits, then run the 16-point
      // column transform over the full buffer.
      row_8x16_rounding(in, 2);
      fdct16_8col(in);
      break;
    case ADST_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case DCT_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      fdct16_8col(in);
      break;
    case ADST_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_8x16(input, in, stride, 1, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case DCT_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      fdct16_8col(in);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 1);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case ADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case FLIPADST_ADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case IDTX:
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      row_8x16_rounding(in, 2);
      idtx16_8col(in);
      break;
    case V_DCT:
      // 1-D vertical transform: identity on rows, 16-point DCT on columns.
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      row_8x16_rounding(in, 2);
      fdct16_8col(in);
      break;
    case H_DCT:
      // 1-D horizontal transform: 8-point DCT on rows, identity on columns.
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fdct8_sse2(t);
      fdct8_sse2(b);
      row_8x16_rounding(in, 2);
      idtx16_8col(in);
      break;
    case V_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case H_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      idtx16_8col(in);
      break;
    case V_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fidtx8_sse2(t);
      fidtx8_sse2(b);
      row_8x16_rounding(in, 2);
      fadst16_8col(in);
      break;
    case H_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      array_transpose_8x8(t, t);
      array_transpose_8x8(b, b);
      fadst8_sse2(t);
      fadst8_sse2(b);
      row_8x16_rounding(in, 2);
      idtx16_8col(in);
      break;
#endif
    default: assert(0); break;
  }
  // Write the two 8x8 result halves back-to-back (output pitch 8).
  write_buffer_8x8(output, t, 8);
  write_buffer_8x8(output + 64, b, 8);
}
    3025             : 
    3026           0 : static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
    3027             :                                     int stride, int flipud, int fliplr) {
    3028             :   // Load 2 8x8 blocks
    3029           0 :   const int16_t *l = input;
    3030           0 :   const int16_t *r = input + 8;
    3031             : 
    3032           0 :   if (fliplr) {
    3033           0 :     const int16_t *const tmp = l;
    3034           0 :     l = r;
    3035           0 :     r = tmp;
    3036             :   }
    3037             : 
    3038             :   // load first 8 columns
    3039           0 :   load_buffer_8x8(l, in, stride, flipud, fliplr);
    3040           0 :   scale_sqrt2_8x8(in);
    3041           0 :   load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
    3042           0 :   scale_sqrt2_8x8(in + 8);
    3043           0 : }
    3044             : 
    3045             : #define col_16x8_rounding row_8x16_rounding
    3046             : 
// Forward hybrid transform for a 16-wide x 8-tall block.  The first
// component of tx_type selects the vertical (column, 8-point) transform and
// the second the horizontal (row, 16-point) transform.  The 8-point
// transform is applied to each 8x8 half (columns), the intermediate result
// is rounded by 2 bits, and the 16-point *_8col transform then handles the
// rows.  FLIPADST variants flip the input at load time and apply a plain
// ADST.
void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];

  __m128i *const l = in;      // Alias to left 8x8 sub block
  __m128i *const r = in + 8;  // Alias to right 8x8 sub block, which we store
                              // in the second half of the array

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x8(input, in, stride, 0, 0);
      // 8-point column transform on each half ...
      fdct8_sse2(l);
      fdct8_sse2(r);
      // ... round the intermediate down 2 bits, then run the 16-point row
      // transform over the full buffer.
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case ADST_DCT:
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case DCT_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case ADST_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case DCT_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x8(input, in, stride, 1, 1);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case ADST_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case FLIPADST_ADST:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case IDTX:
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case V_DCT:
      // 1-D vertical transform: 8-point DCT on columns, identity on rows.
      load_buffer_16x8(input, in, stride, 0, 0);
      fdct8_sse2(l);
      fdct8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case H_DCT:
      // 1-D horizontal transform: identity on columns, 16-point DCT on rows.
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      fdct16_8col(in);
      break;
    case V_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case H_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
    case V_FLIPADST:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
      fadst8_sse2(r);
      col_16x8_rounding(in, 2);
      idtx16_8col(in);
      break;
    case H_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fidtx8_sse2(l);
      fidtx8_sse2(r);
      col_16x8_rounding(in, 2);
      fadst16_8col(in);
      break;
#endif
    default: assert(0); break;
  }
  // Transpose each half back to row order, then write them side by side
  // (output pitch 16).
  array_transpose_8x8(l, l);
  array_transpose_8x8(r, r);
  write_buffer_8x8(output, l, 16);
  write_buffer_8x8(output + 8, r, 16);
}
    3177             : 
    3178             : // Note: The 16-column 32-element transforms expect their input to be
    3179             : // split up into a 2x2 grid of 8x16 blocks
    3180           0 : static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
    3181             :                                 __m128i *br) {
    3182           0 :   fdct32_8col(tl, bl);
    3183           0 :   fdct32_8col(tr, br);
    3184           0 :   array_transpose_16x16(tl, tr);
    3185           0 :   array_transpose_16x16(bl, br);
    3186           0 : }
    3187             : 
    3188             : #if CONFIG_EXT_TX
    3189           0 : static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
    3190             :                                  __m128i *br) {
    3191             :   int i;
    3192           0 :   for (i = 0; i < 16; ++i) {
    3193           0 :     tl[i] = _mm_slli_epi16(tl[i], 2);
    3194           0 :     tr[i] = _mm_slli_epi16(tr[i], 2);
    3195           0 :     bl[i] = _mm_slli_epi16(bl[i], 2);
    3196           0 :     br[i] = _mm_slli_epi16(br[i], 2);
    3197             :   }
    3198           0 :   array_transpose_16x16(tl, tr);
    3199           0 :   array_transpose_16x16(bl, br);
    3200           0 : }
    3201             : #endif
    3202             : 
    3203           0 : static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
    3204             :                                      __m128i *intr, __m128i *inbl,
    3205             :                                      __m128i *inbr, int stride, int flipud,
    3206             :                                      int fliplr) {
    3207             :   int i;
    3208           0 :   if (flipud) {
    3209           0 :     input = input + 31 * stride;
    3210           0 :     stride = -stride;
    3211             :   }
    3212             : 
    3213           0 :   for (i = 0; i < 16; ++i) {
    3214           0 :     intl[i] = _mm_slli_epi16(
    3215           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
    3216           0 :     intr[i] = _mm_slli_epi16(
    3217           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
    3218           0 :     inbl[i] = _mm_slli_epi16(
    3219           0 :         _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
    3220           0 :     inbr[i] = _mm_slli_epi16(
    3221           0 :         _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
    3222             :   }
    3223             : 
    3224           0 :   if (fliplr) {
    3225             :     __m128i tmp;
    3226           0 :     for (i = 0; i < 16; ++i) {
    3227           0 :       tmp = intl[i];
    3228           0 :       intl[i] = mm_reverse_epi16(intr[i]);
    3229           0 :       intr[i] = mm_reverse_epi16(tmp);
    3230           0 :       tmp = inbl[i];
    3231           0 :       inbl[i] = mm_reverse_epi16(inbr[i]);
    3232           0 :       inbr[i] = mm_reverse_epi16(tmp);
    3233             :     }
    3234             :   }
    3235             : 
    3236           0 :   scale_sqrt2_8x16(intl);
    3237           0 :   scale_sqrt2_8x16(intr);
    3238           0 :   scale_sqrt2_8x16(inbl);
    3239           0 :   scale_sqrt2_8x16(inbr);
    3240           0 : }
    3241             : 
    3242           0 : static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
    3243             :                                       __m128i *restr, __m128i *resbl,
    3244             :                                       __m128i *resbr) {
    3245             :   int i;
    3246           0 :   for (i = 0; i < 16; ++i) {
    3247           0 :     store_output(&restl[i], output + i * 16 + 0);
    3248           0 :     store_output(&restr[i], output + i * 16 + 8);
    3249           0 :     store_output(&resbl[i], output + (i + 16) * 16 + 0);
    3250           0 :     store_output(&resbr[i], output + (i + 16) * 16 + 8);
    3251             :   }
    3252           0 : }
    3253             : 
    3254           0 : static INLINE void round_signed_8x8(__m128i *in, const int bit) {
    3255           0 :   const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1);
    3256           0 :   __m128i sign0 = _mm_srai_epi16(in[0], 15);
    3257           0 :   __m128i sign1 = _mm_srai_epi16(in[1], 15);
    3258           0 :   __m128i sign2 = _mm_srai_epi16(in[2], 15);
    3259           0 :   __m128i sign3 = _mm_srai_epi16(in[3], 15);
    3260           0 :   __m128i sign4 = _mm_srai_epi16(in[4], 15);
    3261           0 :   __m128i sign5 = _mm_srai_epi16(in[5], 15);
    3262           0 :   __m128i sign6 = _mm_srai_epi16(in[6], 15);
    3263           0 :   __m128i sign7 = _mm_srai_epi16(in[7], 15);
    3264             : 
    3265           0 :   in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0);
    3266           0 :   in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1);
    3267           0 :   in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2);
    3268           0 :   in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3);
    3269           0 :   in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4);
    3270           0 :   in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5);
    3271           0 :   in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6);
    3272           0 :   in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7);
    3273             : 
    3274           0 :   in[0] = _mm_srai_epi16(in[0], bit);
    3275           0 :   in[1] = _mm_srai_epi16(in[1], bit);
    3276           0 :   in[2] = _mm_srai_epi16(in[2], bit);
    3277           0 :   in[3] = _mm_srai_epi16(in[3], bit);
    3278           0 :   in[4] = _mm_srai_epi16(in[4], bit);
    3279           0 :   in[5] = _mm_srai_epi16(in[5], bit);
    3280           0 :   in[6] = _mm_srai_epi16(in[6], bit);
    3281           0 :   in[7] = _mm_srai_epi16(in[7], bit);
    3282           0 : }
    3283             : 
    3284           0 : static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
    3285           0 :   const int bit = 4;
    3286           0 :   round_signed_8x8(in0, bit);
    3287           0 :   round_signed_8x8(in0 + 8, bit);
    3288           0 :   round_signed_8x8(in1, bit);
    3289           0 :   round_signed_8x8(in1 + 8, bit);
    3290           0 : }
    3291             : 
// Note:
//  suffix "t" indicates the transpose operation comes first
// 16-point forward DCT, transpose-first variant: transpose the 16x16 block
// held as two 8x16 halves (in0/in1), then transform each half in place.
static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  fdct16_8col(in0);
  fdct16_8col(in1);
}
    3299             : 
// 16-point forward ADST, transpose-first variant: transpose the 16x16 block
// held as two 8x16 halves (in0/in1), then transform each half in place.
static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  fadst16_8col(in0);
  fadst16_8col(in1);
}
    3305             : 
// 32-point forward DCT over 16 columns, transpose-first variant: transpose
// both the top (tl/tr) and bottom (bl/br) 16x16 quadrant pairs, then run
// the 32-point DCT on the left and on the right 8-column group.
static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                 __m128i *br) {
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
  fdct32_8col(tl, bl);
  fdct32_8col(tr, br);
}
    3313             : 
// Selects which 16-point DCT fhalfright32_16col applies to its top half:
// the transpose-first ("t") variant (16x32 path) or the plain variant
// (32x16 path).
typedef enum transpose_indicator_ {
  transpose,
  no_transpose,
} transpose_indicator;
    3318             : 
    3319           0 : static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
    3320             :                                       __m128i *br, transpose_indicator t) {
    3321             :   __m128i tmpl[16], tmpr[16];
    3322             :   int i;
    3323             : 
    3324             :   // Copy the bottom half of the input to temporary storage
    3325           0 :   for (i = 0; i < 16; ++i) {
    3326           0 :     tmpl[i] = bl[i];
    3327           0 :     tmpr[i] = br[i];
    3328             :   }
    3329             : 
    3330             :   // Generate the bottom half of the output
    3331           0 :   for (i = 0; i < 16; ++i) {
    3332           0 :     bl[i] = _mm_slli_epi16(tl[i], 2);
    3333           0 :     br[i] = _mm_slli_epi16(tr[i], 2);
    3334             :   }
    3335           0 :   array_transpose_16x16(bl, br);
    3336             : 
    3337             :   // Copy the temporary storage back to the top half of the input
    3338           0 :   for (i = 0; i < 16; ++i) {
    3339           0 :     tl[i] = tmpl[i];
    3340           0 :     tr[i] = tmpr[i];
    3341             :   }
    3342             : 
    3343             :   // Generate the top half of the output
    3344           0 :   scale_sqrt2_8x16(tl);
    3345           0 :   scale_sqrt2_8x16(tr);
    3346           0 :   if (t == transpose)
    3347           0 :     fdct16t_sse2(tl, tr);
    3348             :   else
    3349           0 :     fdct16_sse2(tl, tr);
    3350           0 : }
    3351             : 
// Note on data layout, for both this and the 32x16 transforms:
// So that we can reuse the 16-element transforms easily,
// we want to split the input into 8x16 blocks.
// For 16x32, this means the input is a 2x2 grid of such blocks.
// For 32x16, it means the input is a 4x1 grid.
// Forward hybrid 16x32 transform. Every case follows the same three-stage
// pattern: load the input (applying the flips the tx_type requires), run a
// 16-point first-stage transform on the top and bottom 16x16 halves, round
// the intermediate result, then run a 32-point second-stage transform
// across both halves. The transpose-first ("t") transform variants are
// used throughout so the data ends up oriented for write_buffer_16x32.
void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i intl[16], intr[16], inbl[16], inbr[16];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case ADST_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case DCT_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case ADST_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case DCT_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case ADST_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case FLIPADST_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case IDTX:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
    case V_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fdct32t_16col(intl, intr, inbl, inbr);
      break;
    case H_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fdct16t_sse2(intl, intr);
      fdct16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
    case V_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case H_ADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
    case V_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fidtx16_sse2(intl, intr);
      fidtx16_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
      break;
    case H_FLIPADST:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
      fadst16t_sse2(intl, intr);
      fadst16t_sse2(inbl, inbr);
      round_signed_16x16(intl, intr);
      round_signed_16x16(inbl, inbr);
      fidtx32_16col(intl, intr, inbl, inbr);
      break;
#endif
    default: assert(0); break;
  }
  write_buffer_16x32(output, intl, intr, inbl, inbr);
}
    3496             : 
    3497           0 : static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
    3498             :                                      __m128i *in1, __m128i *in2, __m128i *in3,
    3499             :                                      int stride, int flipud, int fliplr) {
    3500             :   int i;
    3501           0 :   if (flipud) {
    3502           0 :     input += 15 * stride;
    3503           0 :     stride = -stride;
    3504             :   }
    3505             : 
    3506           0 :   for (i = 0; i < 16; ++i) {
    3507           0 :     in0[i] = _mm_slli_epi16(
    3508           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
    3509           0 :     in1[i] = _mm_slli_epi16(
    3510           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
    3511           0 :     in2[i] = _mm_slli_epi16(
    3512           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
    3513           0 :     in3[i] = _mm_slli_epi16(
    3514           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
    3515             :   }
    3516             : 
    3517           0 :   if (fliplr) {
    3518           0 :     for (i = 0; i < 16; ++i) {
    3519           0 :       __m128i tmp1 = in0[i];
    3520           0 :       __m128i tmp2 = in1[i];
    3521           0 :       in0[i] = mm_reverse_epi16(in3[i]);
    3522           0 :       in1[i] = mm_reverse_epi16(in2[i]);
    3523           0 :       in2[i] = mm_reverse_epi16(tmp2);
    3524           0 :       in3[i] = mm_reverse_epi16(tmp1);
    3525             :     }
    3526             :   }
    3527             : 
    3528           0 :   scale_sqrt2_8x16(in0);
    3529           0 :   scale_sqrt2_8x16(in1);
    3530           0 :   scale_sqrt2_8x16(in2);
    3531           0 :   scale_sqrt2_8x16(in3);
    3532           0 : }
    3533             : 
    3534           0 : static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
    3535             :                                       __m128i *res1, __m128i *res2,
    3536             :                                       __m128i *res3) {
    3537             :   int i;
    3538           0 :   for (i = 0; i < 16; ++i) {
    3539           0 :     store_output(&res0[i], output + i * 32 + 0);
    3540           0 :     store_output(&res1[i], output + i * 32 + 8);
    3541           0 :     store_output(&res2[i], output + i * 32 + 16);
    3542           0 :     store_output(&res3[i], output + i * 32 + 24);
    3543             :   }
    3544           0 : }
    3545             : 
// Forward hybrid 32x16 transform. First stage: a 16-point transform over
// the four 8x16 column groups; second stage: a 32-point transform across
// the full width. The buffer is loaded once unflipped before the switch
// for the four basic tx_types; every EXT_TX case reloads it with its own
// flip flags (the non-flipped EXT_TX cases therefore load twice — redundant
// but harmless).
void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[16], in1[16], in2[16], in3[16];

  load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
  switch (tx_type) {
    case DCT_DCT:
      fdct16_sse2(in0, in1);
      fdct16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fdct32_16col(in0, in1, in2, in3);
      break;
    case ADST_DCT:
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fdct32_16col(in0, in1, in2, in3);
      break;
    case DCT_ADST:
      fdct16_sse2(in0, in1);
      fdct16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
    case ADST_ADST:
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fdct32_16col(in0, in1, in2, in3);
      break;
    case DCT_FLIPADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
      fdct16_sse2(in0, in1);
      fdct16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
    case ADST_FLIPADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
    case FLIPADST_ADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
    case IDTX:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fidtx16_sse2(in0, in1);
      fidtx16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fidtx32_16col(in0, in1, in2, in3);
      break;
    case V_DCT:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fdct16_sse2(in0, in1);
      fdct16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fidtx32_16col(in0, in1, in2, in3);
      break;
    case H_DCT:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fidtx16_sse2(in0, in1);
      fidtx16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fdct32_16col(in0, in1, in2, in3);
      break;
    case V_ADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fidtx32_16col(in0, in1, in2, in3);
      break;
    case H_ADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fidtx16_sse2(in0, in1);
      fidtx16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
    case V_FLIPADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fidtx32_16col(in0, in1, in2, in3);
      break;
    case H_FLIPADST:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
      fidtx16_sse2(in0, in1);
      fidtx16_sse2(in2, in3);
      round_signed_16x16(in0, in1);
      round_signed_16x16(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
      break;
#endif
    default: assert(0); break;
  }
  write_buffer_32x16(output, in0, in1, in2, in3);
}
    3682             : 
    3683             : // Note:
    3684             : // 32x32 hybrid fwd txfm
    3685             : //  4x2 grids of 8x16 block. Each block is represented by __m128i in[16]
    3686           0 : static INLINE void load_buffer_32x32(const int16_t *input,
    3687             :                                      __m128i *in0 /*in0[32]*/,
    3688             :                                      __m128i *in1 /*in1[32]*/,
    3689             :                                      __m128i *in2 /*in2[32]*/,
    3690             :                                      __m128i *in3 /*in3[32]*/, int stride,
    3691             :                                      int flipud, int fliplr) {
    3692           0 :   if (flipud) {
    3693           0 :     input += 31 * stride;
    3694           0 :     stride = -stride;
    3695             :   }
    3696             : 
    3697             :   int i;
    3698           0 :   for (i = 0; i < 32; ++i) {
    3699           0 :     in0[i] = _mm_slli_epi16(
    3700           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
    3701           0 :     in1[i] = _mm_slli_epi16(
    3702           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
    3703           0 :     in2[i] = _mm_slli_epi16(
    3704           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
    3705           0 :     in3[i] = _mm_slli_epi16(
    3706           0 :         _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
    3707             :   }
    3708             : 
    3709           0 :   if (fliplr) {
    3710           0 :     for (i = 0; i < 32; ++i) {
    3711           0 :       __m128i tmp1 = in0[i];
    3712           0 :       __m128i tmp2 = in1[i];
    3713           0 :       in0[i] = mm_reverse_epi16(in3[i]);
    3714           0 :       in1[i] = mm_reverse_epi16(in2[i]);
    3715           0 :       in2[i] = mm_reverse_epi16(tmp2);
    3716           0 :       in3[i] = mm_reverse_epi16(tmp1);
    3717             :     }
    3718             :   }
    3719           0 : }
    3720             : 
    3721           0 : static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
    3722             :                               __m128i *b0r /*b0r[16]*/,
    3723             :                               __m128i *b1l /*b1l[16]*/,
    3724             :                               __m128i *b1r /*b1r[16]*/) {
    3725             :   int i;
    3726           0 :   for (i = 0; i < 16; ++i) {
    3727           0 :     __m128i tmp0 = b1l[i];
    3728           0 :     __m128i tmp1 = b1r[i];
    3729           0 :     b1l[i] = b0l[i];
    3730           0 :     b1r[i] = b0r[i];
    3731           0 :     b0l[i] = tmp0;
    3732           0 :     b0r[i] = tmp1;
    3733             :   }
    3734           0 : }
    3735             : 
// 32-point DCT over all 32 columns of a 32x32 block: transform each of the
// four 8-wide column groups (top/bottom 16-register halves), transpose each
// 16x16 quadrant pair back, then swap the two off-diagonal quadrants to
// restore the row-major block layout.
static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
                          __m128i *in3) {
  fdct32_8col(in0, &in0[16]);
  fdct32_8col(in1, &in1[16]);
  fdct32_8col(in2, &in2[16]);
  fdct32_8col(in3, &in3[16]);

  array_transpose_16x16(in0, in1);
  array_transpose_16x16(&in0[16], &in1[16]);
  array_transpose_16x16(in2, in3);
  array_transpose_16x16(&in2[16], &in3[16]);

  swap_16x16(&in0[16], &in1[16], in2, in3);
}
    3750             : 
// Half-right 32-point transform over all 32 columns (applied per 16-column
// half), followed by the off-diagonal quadrant swap that restores the
// row-major block layout (same swap as in fdct32).
static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
                                __m128i *in3) {
  fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose);
  fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose);
  swap_16x16(&in0[16], &in1[16], in2, in3);
}
    3757             : 
#if CONFIG_EXT_TX
// 32-point identity transform over all 32 columns (applied per 16-column
// half), followed by the off-diagonal quadrant swap that keeps the output
// layout consistent with fdct32.
static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
                           __m128i *in3) {
  fidtx32_16col(in0, in1, &in0[16], &in1[16]);
  fidtx32_16col(in2, in3, &in2[16], &in3[16]);
  swap_16x16(&in0[16], &in1[16], in2, in3);
}
#endif
    3766             : 
    3767           0 : static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
    3768             :                                       __m128i *in3) {
    3769           0 :   round_signed_16x16(in0, in1);
    3770           0 :   round_signed_16x16(&in0[16], &in1[16]);
    3771           0 :   round_signed_16x16(in2, in3);
    3772           0 :   round_signed_16x16(&in2[16], &in3[16]);
    3773           0 : }
    3774             : 
    3775           0 : static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
    3776             :                                       __m128i *in3, tran_low_t *output) {
    3777             :   int i;
    3778           0 :   for (i = 0; i < 32; ++i) {
    3779           0 :     store_output(&in0[i], output + i * 32 + 0);
    3780           0 :     store_output(&in1[i], output + i * 32 + 8);
    3781           0 :     store_output(&in2[i], output + i * 32 + 16);
    3782           0 :     store_output(&in3[i], output + i * 32 + 24);
    3783             :   }
    3784           0 : }
    3785             : 
    3786           0 : void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
    3787             :                        int tx_type) {
    3788             :   __m128i in0[32], in1[32], in2[32], in3[32];
    3789             : 
    3790           0 :   load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
    3791           0 :   switch (tx_type) {
    3792             :     case DCT_DCT:
    3793           0 :       fdct32(in0, in1, in2, in3);
    3794           0 :       round_signed_32x32(in0, in1, in2, in3);
    3795           0 :       fdct32(in0, in1, in2, in3);
    3796           0 :       break;
    3797             :     case ADST_DCT:
    3798           0 :       fhalfright32(in0, in1, in2, in3);
    3799           0 :       round_signed_32x32(in0, in1, in2, in3);
    3800           0 :       fdct32(in0, in1, in2, in3);
    3801           0 :       break;
    3802             :     case DCT_ADST:
    3803           0 :       fdct32(in0, in1, in2, in3);
    3804           0 :       round_signed_32x32(in0, in1, in2, in3);
    3805           0 :       fhalfright32(in0, in1, in2, in3);
    3806           0 :       break;
    3807             :     case ADST_ADST:
    3808           0 :       fhalfright32(in0, in1, in2, in3);
    3809           0 :       round_signed_32x32(in0, in1, in2, in3);
    3810           0 :       fhalfright32(in0, in1, in2, in3);
    3811           0 :       break;
    3812             : #if CONFIG_EXT_TX
    3813             :     case FLIPADST_DCT:
    3814           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
    3815           0 :       fhalfright32(in0, in1, in2, in3);
    3816           0 :       round_signed_32x32(in0, in1, in2, in3);
    3817           0 :       fdct32(in0, in1, in2, in3);
    3818           0 :       break;
    3819             :     case DCT_FLIPADST:
    3820           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
    3821           0 :       fdct32(in0, in1, in2, in3);
    3822           0 :       round_signed_32x32(in0, in1, in2, in3);
    3823           0 :       fhalfright32(in0, in1, in2, in3);
    3824           0 :       break;
    3825             :     case FLIPADST_FLIPADST:
    3826           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1);
    3827           0 :       fhalfright32(in0, in1, in2, in3);
    3828           0 :       round_signed_32x32(in0, in1, in2, in3);
    3829           0 :       fhalfright32(in0, in1, in2, in3);
    3830           0 :       break;
    3831             :     case ADST_FLIPADST:
    3832           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
    3833           0 :       fhalfright32(in0, in1, in2, in3);
    3834           0 :       round_signed_32x32(in0, in1, in2, in3);
    3835           0 :       fhalfright32(in0, in1, in2, in3);
    3836           0 :       break;
    3837             :     case FLIPADST_ADST:
    3838           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
    3839           0 :       fhalfright32(in0, in1, in2, in3);
    3840           0 :       round_signed_32x32(in0, in1, in2, in3);
    3841           0 :       fhalfright32(in0, in1, in2, in3);
    3842           0 :       break;
    3843             :     case IDTX:
    3844           0 :       fidtx32(in0, in1, in2, in3);
    3845           0 :       round_signed_32x32(in0, in1, in2, in3);
    3846           0 :       fidtx32(in0, in1, in2, in3);
    3847           0 :       break;
    3848             :     case V_DCT:
    3849           0 :       fdct32(in0, in1, in2, in3);
    3850           0 :       round_signed_32x32(in0, in1, in2, in3);
    3851           0 :       fidtx32(in0, in1, in2, in3);
    3852           0 :       break;
    3853             :     case H_DCT:
    3854           0 :       fidtx32(in0, in1, in2, in3);
    3855           0 :       round_signed_32x32(in0, in1, in2, in3);
    3856           0 :       fdct32(in0, in1, in2, in3);
    3857           0 :       break;
    3858             :     case V_ADST:
    3859           0 :       fhalfright32(in0, in1, in2, in3);
    3860           0 :       round_signed_32x32(in0, in1, in2, in3);
    3861           0 :       fidtx32(in0, in1, in2, in3);
    3862           0 :       break;
    3863             :     case H_ADST:
    3864           0 :       fidtx32(in0, in1, in2, in3);
    3865           0 :       round_signed_32x32(in0, in1, in2, in3);
    3866           0 :       fhalfright32(in0, in1, in2, in3);
    3867           0 :       break;
    3868             :     case V_FLIPADST:
    3869           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
    3870           0 :       fhalfright32(in0, in1, in2, in3);
    3871           0 :       round_signed_32x32(in0, in1, in2, in3);
    3872           0 :       fidtx32(in0, in1, in2, in3);
    3873           0 :       break;
    3874             :     case H_FLIPADST:
    3875           0 :       load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
    3876           0 :       fidtx32(in0, in1, in2, in3);
    3877           0 :       round_signed_32x32(in0, in1, in2, in3);
    3878           0 :       fhalfright32(in0, in1, in2, in3);
    3879           0 :       break;
    3880             : #endif
    3881           0 :     default: assert(0);
    3882             :   }
    3883           0 :   write_buffer_32x32(in0, in1, in2, in3, output);
    3884           0 : }

Generated by: LCOV version 1.13