LCOV - code coverage report
Current view: third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c (source / functions)
Test: output.info    Date: 2017-07-14 16:53:18
Lines: 0 of 1497 hit (0.0 %)    Functions: 0 of 19 hit (0.0 %)

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : #include <assert.h>
      12             : #include <smmintrin.h> /* SSE4.1 */
      13             : 
      14             : #include "./av1_rtcd.h"
      15             : #include "./aom_config.h"
      16             : #include "av1/common/av1_fwd_txfm1d_cfg.h"
      17             : #include "av1/common/av1_txfm.h"
      18             : #include "av1/common/x86/highbd_txfm_utility_sse4.h"
      19             : #include "aom_dsp/txfm_common.h"
      20             : #include "aom_dsp/x86/txfm_common_sse2.h"
      21             : #include "aom_ports/mem.h"
      22             : 
      23           0 : static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
      24             :                                    int stride, int flipud, int fliplr,
      25             :                                    int shift) {
      26           0 :   if (!flipud) {
      27           0 :     in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      28           0 :     in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      29           0 :     in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      30           0 :     in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      31             :   } else {
      32           0 :     in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
      33           0 :     in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
      34           0 :     in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      35           0 :     in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      36             :   }
      37             : 
      38           0 :   if (fliplr) {
      39           0 :     in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
      40           0 :     in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
      41           0 :     in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
      42           0 :     in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
      43             :   }
      44             : 
      45           0 :   in[0] = _mm_cvtepi16_epi32(in[0]);
      46           0 :   in[1] = _mm_cvtepi16_epi32(in[1]);
      47           0 :   in[2] = _mm_cvtepi16_epi32(in[2]);
      48           0 :   in[3] = _mm_cvtepi16_epi32(in[3]);
      49             : 
      50           0 :   in[0] = _mm_slli_epi32(in[0], shift);
      51           0 :   in[1] = _mm_slli_epi32(in[1], shift);
      52           0 :   in[2] = _mm_slli_epi32(in[2], shift);
      53           0 :   in[3] = _mm_slli_epi32(in[3], shift);
      54           0 : }
      55             : 
      56             : // We only use the stage-2 cos_bit (cos_bit[2]);
      57             : // shift[0] is used in load_buffer_4x4()
      58             : // shift[1] is used in txfm_func_col()
      59             : // shift[2] is used in txfm_func_row()
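                     : //
                     : // A rough sketch of where the three shifts land in the 2-D transform
                     : // (illustrative only; when shift[k] is negative the value is rounded and
                     : // right-shifted by -shift[k], which is what col_txfm_8x8_rounding() is
                     : // used for in the 8x8 path below):
                     : //   load:     buf = input << shift[0]
                     : //   columns:  buf = col_txfm(buf), then round by shift[1]
                     : //   rows:     buf = row_txfm(buf), then round by shift[2]
                     : // In this 4x4 path shift[1] and shift[2] are zero, so no extra rounding
                     : // happens between or after the two 1-D transforms.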
      60           0 : static void fdct4x4_sse4_1(__m128i *in, int bit) {
      61           0 :   const int32_t *cospi = cospi_arr(bit);
      62           0 :   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
      63           0 :   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
      64           0 :   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
      65           0 :   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
      66             :   __m128i s0, s1, s2, s3;
      67             :   __m128i u0, u1, u2, u3;
      68             :   __m128i v0, v1, v2, v3;
      69             : 
      70           0 :   s0 = _mm_add_epi32(in[0], in[3]);
      71           0 :   s1 = _mm_add_epi32(in[1], in[2]);
      72           0 :   s2 = _mm_sub_epi32(in[1], in[2]);
      73           0 :   s3 = _mm_sub_epi32(in[0], in[3]);
      74             : 
      75             :   // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
      76           0 :   u0 = _mm_mullo_epi32(s0, cospi32);
      77           0 :   u1 = _mm_mullo_epi32(s1, cospi32);
      78           0 :   u2 = _mm_add_epi32(u0, u1);
      79           0 :   v0 = _mm_sub_epi32(u0, u1);
      80             : 
      81           0 :   u3 = _mm_add_epi32(u2, rnding);
      82           0 :   v1 = _mm_add_epi32(v0, rnding);
      83             : 
      84           0 :   u0 = _mm_srai_epi32(u3, bit);
      85           0 :   u2 = _mm_srai_epi32(v1, bit);
      86             : 
      87             :   // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
      88           0 :   v0 = _mm_mullo_epi32(s2, cospi48);
      89           0 :   v1 = _mm_mullo_epi32(s3, cospi16);
      90           0 :   v2 = _mm_add_epi32(v0, v1);
      91             : 
      92           0 :   v3 = _mm_add_epi32(v2, rnding);
      93           0 :   u1 = _mm_srai_epi32(v3, bit);
      94             : 
      95           0 :   v0 = _mm_mullo_epi32(s2, cospi16);
      96           0 :   v1 = _mm_mullo_epi32(s3, cospi48);
      97           0 :   v2 = _mm_sub_epi32(v1, v0);
      98             : 
      99           0 :   v3 = _mm_add_epi32(v2, rnding);
     100           0 :   u3 = _mm_srai_epi32(v3, bit);
     101             : 
     102             :   // Note: shift[1] and shift[2] are both zero here
     103             : 
     104             :   // Transpose 4x4 32-bit
     105           0 :   v0 = _mm_unpacklo_epi32(u0, u1);
     106           0 :   v1 = _mm_unpackhi_epi32(u0, u1);
     107           0 :   v2 = _mm_unpacklo_epi32(u2, u3);
     108           0 :   v3 = _mm_unpackhi_epi32(u2, u3);
     109             : 
     110           0 :   in[0] = _mm_unpacklo_epi64(v0, v2);
     111           0 :   in[1] = _mm_unpackhi_epi64(v0, v2);
     112           0 :   in[2] = _mm_unpacklo_epi64(v1, v3);
     113           0 :   in[3] = _mm_unpackhi_epi64(v1, v3);
     114           0 : }
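                     :
                     : // Scalar equivalents of the two butterflies computed above, as a reference
                     : // sketch (round_shift(x, bit) here denotes (x + (1 << (bit - 1))) >> bit):
                     : //   type0 (both weights cospi32):
                     : //     u0 = round_shift(cospi32 * s0 + cospi32 * s1, bit);
                     : //     u2 = round_shift(cospi32 * s0 - cospi32 * s1, bit);
                     : //   type1 (weights cospi48/cospi16):
                     : //     u1 = round_shift(cospi48 * s2 + cospi16 * s3, bit);
                     : //     u3 = round_shift(cospi48 * s3 - cospi16 * s2, bit);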
     115             : 
     116           0 : static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
     117           0 :   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
     118           0 :   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
     119           0 :   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
     120           0 :   _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
     121           0 : }
     122             : 
     123             : // Note:
     124             : //  The SSE4.1 4x4 path is implemented in av1_fwd_txfm2d_4x4_sse4_1() below.
     125             : //  This stub is kept only because av1_highbd_fht4x4_c() has not been removed yet.
     126           0 : void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
     127             :                               int stride, int tx_type) {
     128             :   (void)input;
     129             :   (void)output;
     130             :   (void)stride;
     131             :   (void)tx_type;
     132           0 :   assert(0);
     133             : }
     134             : 
     135           0 : static void fadst4x4_sse4_1(__m128i *in, int bit) {
     136           0 :   const int32_t *cospi = cospi_arr(bit);
     137           0 :   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
     138           0 :   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
     139           0 :   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
     140           0 :   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
     141           0 :   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
     142           0 :   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
     143           0 :   const __m128i kZero = _mm_setzero_si128();
     144             :   __m128i s0, s1, s2, s3;
     145             :   __m128i u0, u1, u2, u3;
     146             :   __m128i v0, v1, v2, v3;
     147             : 
     148             :   // stage 0
     149             :   // stage 1
     150             :   // stage 2
     151           0 :   u0 = _mm_mullo_epi32(in[3], cospi8);
     152           0 :   u1 = _mm_mullo_epi32(in[0], cospi56);
     153           0 :   u2 = _mm_add_epi32(u0, u1);
     154           0 :   s0 = _mm_add_epi32(u2, rnding);
     155           0 :   s0 = _mm_srai_epi32(s0, bit);
     156             : 
     157           0 :   v0 = _mm_mullo_epi32(in[3], cospi56);
     158           0 :   v1 = _mm_mullo_epi32(in[0], cospi8);
     159           0 :   v2 = _mm_sub_epi32(v0, v1);
     160           0 :   s1 = _mm_add_epi32(v2, rnding);
     161           0 :   s1 = _mm_srai_epi32(s1, bit);
     162             : 
     163           0 :   u0 = _mm_mullo_epi32(in[1], cospi40);
     164           0 :   u1 = _mm_mullo_epi32(in[2], cospi24);
     165           0 :   u2 = _mm_add_epi32(u0, u1);
     166           0 :   s2 = _mm_add_epi32(u2, rnding);
     167           0 :   s2 = _mm_srai_epi32(s2, bit);
     168             : 
     169           0 :   v0 = _mm_mullo_epi32(in[1], cospi24);
     170           0 :   v1 = _mm_mullo_epi32(in[2], cospi40);
     171           0 :   v2 = _mm_sub_epi32(v0, v1);
     172           0 :   s3 = _mm_add_epi32(v2, rnding);
     173           0 :   s3 = _mm_srai_epi32(s3, bit);
     174             : 
     175             :   // stage 3
     176           0 :   u0 = _mm_add_epi32(s0, s2);
     177           0 :   u2 = _mm_sub_epi32(s0, s2);
     178           0 :   u1 = _mm_add_epi32(s1, s3);
     179           0 :   u3 = _mm_sub_epi32(s1, s3);
     180             : 
     181             :   // stage 4
     182           0 :   v0 = _mm_mullo_epi32(u2, cospi32);
     183           0 :   v1 = _mm_mullo_epi32(u3, cospi32);
     184           0 :   v2 = _mm_add_epi32(v0, v1);
     185           0 :   s2 = _mm_add_epi32(v2, rnding);
     186           0 :   u2 = _mm_srai_epi32(s2, bit);
     187             : 
     188           0 :   v2 = _mm_sub_epi32(v0, v1);
     189           0 :   s3 = _mm_add_epi32(v2, rnding);
     190           0 :   u3 = _mm_srai_epi32(s3, bit);
     191             : 
     192             :   // Negate u1 and u2 (current order: u0, u1, u2, u3)
     193           0 :   u2 = _mm_sub_epi32(kZero, u2);
     194           0 :   u1 = _mm_sub_epi32(kZero, u1);
     195             : 
     196             :   // Output row order for the transpose below: u0, u2, u3, u1
     197             :   // Transpose 4x4 32-bit
     198           0 :   v0 = _mm_unpacklo_epi32(u0, u2);
     199           0 :   v1 = _mm_unpackhi_epi32(u0, u2);
     200           0 :   v2 = _mm_unpacklo_epi32(u3, u1);
     201           0 :   v3 = _mm_unpackhi_epi32(u3, u1);
     202             : 
     203           0 :   in[0] = _mm_unpacklo_epi64(v0, v2);
     204           0 :   in[1] = _mm_unpackhi_epi64(v0, v2);
     205           0 :   in[2] = _mm_unpacklo_epi64(v1, v3);
     206           0 :   in[3] = _mm_unpackhi_epi64(v1, v3);
     207           0 : }
     208             : 
     209           0 : void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
     210             :                                int input_stride, int tx_type, int bd) {
     211             :   __m128i in[4];
     212           0 :   const TXFM_1D_CFG *row_cfg = NULL;
     213           0 :   const TXFM_1D_CFG *col_cfg = NULL;
     214             : 
     215           0 :   switch (tx_type) {
     216             :     case DCT_DCT:
     217           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
     218           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
     219           0 :       load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
     220           0 :       fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
     221           0 :       fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
     222           0 :       write_buffer_4x4(in, coeff);
     223           0 :       break;
     224             :     case ADST_DCT:
     225           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
     226           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
     227           0 :       load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
     228           0 :       fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
     229           0 :       fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
     230           0 :       write_buffer_4x4(in, coeff);
     231           0 :       break;
     232             :     case DCT_ADST:
     233           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
     234           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
     235           0 :       load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
     236           0 :       fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
     237           0 :       fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
     238           0 :       write_buffer_4x4(in, coeff);
     239           0 :       break;
     240             :     case ADST_ADST:
     241           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
     242           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
     243           0 :       load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
     244           0 :       fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
     245           0 :       fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
     246           0 :       write_buffer_4x4(in, coeff);
     247           0 :       break;
     248             : #if CONFIG_EXT_TX
     249             :     case FLIPADST_DCT:
     250           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
     251           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
     252           0 :       load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
     253           0 :       fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
     254           0 :       fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
     255           0 :       write_buffer_4x4(in, coeff);
     256           0 :       break;
     257             :     case DCT_FLIPADST:
     258           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
     259           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
     260           0 :       load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
     261           0 :       fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
     262           0 :       fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
     263           0 :       write_buffer_4x4(in, coeff);
     264           0 :       break;
     265             :     case FLIPADST_FLIPADST:
     266           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
     267           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
     268           0 :       load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]);
     269           0 :       fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
     270           0 :       fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
     271           0 :       write_buffer_4x4(in, coeff);
     272           0 :       break;
     273             :     case ADST_FLIPADST:
     274           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
     275           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
     276           0 :       load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
     277           0 :       fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
     278           0 :       fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
     279           0 :       write_buffer_4x4(in, coeff);
     280           0 :       break;
     281             :     case FLIPADST_ADST:
     282           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
     283           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
     284           0 :       load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
     285           0 :       fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
     286           0 :       fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
     287           0 :       write_buffer_4x4(in, coeff);
     288           0 :       break;
     289             : #endif
     290           0 :     default: assert(0);
     291             :   }
     292             :   (void)bd;
     293           0 : }
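                     :
                     : // Usage sketch (hypothetical buffers; 'coeff' must be 16-byte aligned for
                     : // the _mm_store_si128() calls in write_buffer_4x4(), and 'bd' is currently
                     : // ignored by this kernel):
                     : //   DECLARE_ALIGNED(16, int16_t, src[4 * 4]);
                     : //   DECLARE_ALIGNED(16, int32_t, coeff[4 * 4]);
                     : //   av1_fwd_txfm2d_4x4_sse4_1(src, coeff, /*input_stride=*/4, DCT_DCT, 10);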
     294             : 
     295           0 : static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
     296             :                                    int stride, int flipud, int fliplr,
     297             :                                    int shift) {
     298             :   __m128i u;
     299           0 :   if (!flipud) {
     300           0 :     in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     301           0 :     in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     302           0 :     in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     303           0 :     in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     304           0 :     in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     305           0 :     in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     306           0 :     in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     307           0 :     in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     308             :   } else {
     309           0 :     in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
     310           0 :     in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
     311           0 :     in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
     312           0 :     in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
     313           0 :     in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
     314           0 :     in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
     315           0 :     in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
     316           0 :     in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
     317             :   }
     318             : 
     319           0 :   if (fliplr) {
     320           0 :     in[0] = mm_reverse_epi16(in[0]);
     321           0 :     in[1] = mm_reverse_epi16(in[1]);
     322           0 :     in[2] = mm_reverse_epi16(in[2]);
     323           0 :     in[3] = mm_reverse_epi16(in[3]);
     324           0 :     in[4] = mm_reverse_epi16(in[4]);
     325           0 :     in[5] = mm_reverse_epi16(in[5]);
     326           0 :     in[6] = mm_reverse_epi16(in[6]);
     327           0 :     in[7] = mm_reverse_epi16(in[7]);
     328             :   }
     329             : 
     330           0 :   u = _mm_unpackhi_epi64(in[4], in[4]);
     331           0 :   in[8] = _mm_cvtepi16_epi32(in[4]);
     332           0 :   in[9] = _mm_cvtepi16_epi32(u);
     333             : 
     334           0 :   u = _mm_unpackhi_epi64(in[5], in[5]);
     335           0 :   in[10] = _mm_cvtepi16_epi32(in[5]);
     336           0 :   in[11] = _mm_cvtepi16_epi32(u);
     337             : 
     338           0 :   u = _mm_unpackhi_epi64(in[6], in[6]);
     339           0 :   in[12] = _mm_cvtepi16_epi32(in[6]);
     340           0 :   in[13] = _mm_cvtepi16_epi32(u);
     341             : 
     342           0 :   u = _mm_unpackhi_epi64(in[7], in[7]);
     343           0 :   in[14] = _mm_cvtepi16_epi32(in[7]);
     344           0 :   in[15] = _mm_cvtepi16_epi32(u);
     345             : 
     346           0 :   u = _mm_unpackhi_epi64(in[3], in[3]);
     347           0 :   in[6] = _mm_cvtepi16_epi32(in[3]);
     348           0 :   in[7] = _mm_cvtepi16_epi32(u);
     349             : 
     350           0 :   u = _mm_unpackhi_epi64(in[2], in[2]);
     351           0 :   in[4] = _mm_cvtepi16_epi32(in[2]);
     352           0 :   in[5] = _mm_cvtepi16_epi32(u);
     353             : 
     354           0 :   u = _mm_unpackhi_epi64(in[1], in[1]);
     355           0 :   in[2] = _mm_cvtepi16_epi32(in[1]);
     356           0 :   in[3] = _mm_cvtepi16_epi32(u);
     357             : 
     358           0 :   u = _mm_unpackhi_epi64(in[0], in[0]);
     359           0 :   in[0] = _mm_cvtepi16_epi32(in[0]);
     360           0 :   in[1] = _mm_cvtepi16_epi32(u);
     361             : 
     362           0 :   in[0] = _mm_slli_epi32(in[0], shift);
     363           0 :   in[1] = _mm_slli_epi32(in[1], shift);
     364           0 :   in[2] = _mm_slli_epi32(in[2], shift);
     365           0 :   in[3] = _mm_slli_epi32(in[3], shift);
     366           0 :   in[4] = _mm_slli_epi32(in[4], shift);
     367           0 :   in[5] = _mm_slli_epi32(in[5], shift);
     368           0 :   in[6] = _mm_slli_epi32(in[6], shift);
     369           0 :   in[7] = _mm_slli_epi32(in[7], shift);
     370             : 
     371           0 :   in[8] = _mm_slli_epi32(in[8], shift);
     372           0 :   in[9] = _mm_slli_epi32(in[9], shift);
     373           0 :   in[10] = _mm_slli_epi32(in[10], shift);
     374           0 :   in[11] = _mm_slli_epi32(in[11], shift);
     375           0 :   in[12] = _mm_slli_epi32(in[12], shift);
     376           0 :   in[13] = _mm_slli_epi32(in[13], shift);
     377           0 :   in[14] = _mm_slli_epi32(in[14], shift);
     378           0 :   in[15] = _mm_slli_epi32(in[15], shift);
     379           0 : }
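                     :
                     : // Layout note: after this load the 8x8 block occupies sixteen vectors of
                     : // four int32 lanes each, with in[2 * r] holding columns 0-3 of row r and
                     : // in[2 * r + 1] holding columns 4-7, both already scaled left by 'shift'.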
     380             : 
     381           0 : static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
     382           0 :   const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
     383             : 
     384           0 :   in[0] = _mm_add_epi32(in[0], rounding);
     385           0 :   in[1] = _mm_add_epi32(in[1], rounding);
     386           0 :   in[2] = _mm_add_epi32(in[2], rounding);
     387           0 :   in[3] = _mm_add_epi32(in[3], rounding);
     388           0 :   in[4] = _mm_add_epi32(in[4], rounding);
     389           0 :   in[5] = _mm_add_epi32(in[5], rounding);
     390           0 :   in[6] = _mm_add_epi32(in[6], rounding);
     391           0 :   in[7] = _mm_add_epi32(in[7], rounding);
     392           0 :   in[8] = _mm_add_epi32(in[8], rounding);
     393           0 :   in[9] = _mm_add_epi32(in[9], rounding);
     394           0 :   in[10] = _mm_add_epi32(in[10], rounding);
     395           0 :   in[11] = _mm_add_epi32(in[11], rounding);
     396           0 :   in[12] = _mm_add_epi32(in[12], rounding);
     397           0 :   in[13] = _mm_add_epi32(in[13], rounding);
     398           0 :   in[14] = _mm_add_epi32(in[14], rounding);
     399           0 :   in[15] = _mm_add_epi32(in[15], rounding);
     400             : 
     401           0 :   in[0] = _mm_srai_epi32(in[0], shift);
     402           0 :   in[1] = _mm_srai_epi32(in[1], shift);
     403           0 :   in[2] = _mm_srai_epi32(in[2], shift);
     404           0 :   in[3] = _mm_srai_epi32(in[3], shift);
     405           0 :   in[4] = _mm_srai_epi32(in[4], shift);
     406           0 :   in[5] = _mm_srai_epi32(in[5], shift);
     407           0 :   in[6] = _mm_srai_epi32(in[6], shift);
     408           0 :   in[7] = _mm_srai_epi32(in[7], shift);
     409           0 :   in[8] = _mm_srai_epi32(in[8], shift);
     410           0 :   in[9] = _mm_srai_epi32(in[9], shift);
     411           0 :   in[10] = _mm_srai_epi32(in[10], shift);
     412           0 :   in[11] = _mm_srai_epi32(in[11], shift);
     413           0 :   in[12] = _mm_srai_epi32(in[12], shift);
     414           0 :   in[13] = _mm_srai_epi32(in[13], shift);
     415           0 :   in[14] = _mm_srai_epi32(in[14], shift);
     416           0 :   in[15] = _mm_srai_epi32(in[15], shift);
     417           0 : }
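                     :
                     : // Note: 'shift' here is the positive magnitude of the (negative) config
                     : // value; the callers below pass -row_cfg->shift[1].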
     418             : 
     419           0 : static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
     420           0 :   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
     421           0 :   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
     422           0 :   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
     423           0 :   _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
     424             : 
     425           0 :   _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
     426           0 :   _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
     427           0 :   _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
     428           0 :   _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
     429             : 
     430           0 :   _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
     431           0 :   _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
     432           0 :   _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
     433           0 :   _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
     434             : 
     435           0 :   _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
     436           0 :   _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
     437           0 :   _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
     438           0 :   _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
     439           0 : }
     440             : 
     441           0 : static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
     442           0 :   const int32_t *cospi = cospi_arr(bit);
     443           0 :   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
     444           0 :   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
     445           0 :   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
     446           0 :   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
     447           0 :   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
     448           0 :   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
     449           0 :   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
     450           0 :   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
     451           0 :   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
     452             :   __m128i u[8], v[8];
     453             : 
     454             :   // Even 8 points: 0, 2, ..., 14
     455             :   // stage 0
     456             :   // stage 1
     457           0 :   u[0] = _mm_add_epi32(in[0], in[14]);
     458           0 :   v[7] = _mm_sub_epi32(in[0], in[14]);  // v[7]
     459           0 :   u[1] = _mm_add_epi32(in[2], in[12]);
     460           0 :   u[6] = _mm_sub_epi32(in[2], in[12]);
     461           0 :   u[2] = _mm_add_epi32(in[4], in[10]);
     462           0 :   u[5] = _mm_sub_epi32(in[4], in[10]);
     463           0 :   u[3] = _mm_add_epi32(in[6], in[8]);
     464           0 :   v[4] = _mm_sub_epi32(in[6], in[8]);  // v[4]
     465             : 
     466             :   // stage 2
     467           0 :   v[0] = _mm_add_epi32(u[0], u[3]);
     468           0 :   v[3] = _mm_sub_epi32(u[0], u[3]);
     469           0 :   v[1] = _mm_add_epi32(u[1], u[2]);
     470           0 :   v[2] = _mm_sub_epi32(u[1], u[2]);
     471             : 
     472           0 :   v[5] = _mm_mullo_epi32(u[5], cospim32);
     473           0 :   v[6] = _mm_mullo_epi32(u[6], cospi32);
     474           0 :   v[5] = _mm_add_epi32(v[5], v[6]);
     475           0 :   v[5] = _mm_add_epi32(v[5], rnding);
     476           0 :   v[5] = _mm_srai_epi32(v[5], bit);
     477             : 
     478           0 :   u[0] = _mm_mullo_epi32(u[5], cospi32);
     479           0 :   v[6] = _mm_mullo_epi32(u[6], cospim32);
     480           0 :   v[6] = _mm_sub_epi32(u[0], v[6]);
     481           0 :   v[6] = _mm_add_epi32(v[6], rnding);
     482           0 :   v[6] = _mm_srai_epi32(v[6], bit);
     483             : 
     484             :   // stage 3
     485             :   // type 0
     486           0 :   v[0] = _mm_mullo_epi32(v[0], cospi32);
     487           0 :   v[1] = _mm_mullo_epi32(v[1], cospi32);
     488           0 :   u[0] = _mm_add_epi32(v[0], v[1]);
     489           0 :   u[0] = _mm_add_epi32(u[0], rnding);
     490           0 :   u[0] = _mm_srai_epi32(u[0], bit);
     491             : 
     492           0 :   u[1] = _mm_sub_epi32(v[0], v[1]);
     493           0 :   u[1] = _mm_add_epi32(u[1], rnding);
     494           0 :   u[1] = _mm_srai_epi32(u[1], bit);
     495             : 
     496             :   // type 1
     497           0 :   v[0] = _mm_mullo_epi32(v[2], cospi48);
     498           0 :   v[1] = _mm_mullo_epi32(v[3], cospi16);
     499           0 :   u[2] = _mm_add_epi32(v[0], v[1]);
     500           0 :   u[2] = _mm_add_epi32(u[2], rnding);
     501           0 :   u[2] = _mm_srai_epi32(u[2], bit);
     502             : 
     503           0 :   v[0] = _mm_mullo_epi32(v[2], cospi16);
     504           0 :   v[1] = _mm_mullo_epi32(v[3], cospi48);
     505           0 :   u[3] = _mm_sub_epi32(v[1], v[0]);
     506           0 :   u[3] = _mm_add_epi32(u[3], rnding);
     507           0 :   u[3] = _mm_srai_epi32(u[3], bit);
     508             : 
     509           0 :   u[4] = _mm_add_epi32(v[4], v[5]);
     510           0 :   u[5] = _mm_sub_epi32(v[4], v[5]);
     511           0 :   u[6] = _mm_sub_epi32(v[7], v[6]);
     512           0 :   u[7] = _mm_add_epi32(v[7], v[6]);
     513             : 
     514             :   // stage 4
     515             :   // stage 5
     516           0 :   v[0] = _mm_mullo_epi32(u[4], cospi56);
     517           0 :   v[1] = _mm_mullo_epi32(u[7], cospi8);
     518           0 :   v[0] = _mm_add_epi32(v[0], v[1]);
     519           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     520           0 :   out[2] = _mm_srai_epi32(v[0], bit);  // buf0[4]
     521             : 
     522           0 :   v[0] = _mm_mullo_epi32(u[4], cospi8);
     523           0 :   v[1] = _mm_mullo_epi32(u[7], cospi56);
     524           0 :   v[0] = _mm_sub_epi32(v[1], v[0]);
     525           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     526           0 :   out[14] = _mm_srai_epi32(v[0], bit);  // buf0[7]
     527             : 
     528           0 :   v[0] = _mm_mullo_epi32(u[5], cospi24);
     529           0 :   v[1] = _mm_mullo_epi32(u[6], cospi40);
     530           0 :   v[0] = _mm_add_epi32(v[0], v[1]);
     531           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     532           0 :   out[10] = _mm_srai_epi32(v[0], bit);  // buf0[5]
     533             : 
     534           0 :   v[0] = _mm_mullo_epi32(u[5], cospi40);
     535           0 :   v[1] = _mm_mullo_epi32(u[6], cospi24);
     536           0 :   v[0] = _mm_sub_epi32(v[1], v[0]);
     537           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     538           0 :   out[6] = _mm_srai_epi32(v[0], bit);  // buf0[6]
     539             : 
     540           0 :   out[0] = u[0];   // buf0[0]
     541           0 :   out[8] = u[1];   // buf0[1]
     542           0 :   out[4] = u[2];   // buf0[2]
     543           0 :   out[12] = u[3];  // buf0[3]
     544             : 
     545             :   // Odd 8 points: 1, 3, ..., 15
     546             :   // stage 0
     547             :   // stage 1
     548           0 :   u[0] = _mm_add_epi32(in[1], in[15]);
     549           0 :   v[7] = _mm_sub_epi32(in[1], in[15]);  // v[7]
     550           0 :   u[1] = _mm_add_epi32(in[3], in[13]);
     551           0 :   u[6] = _mm_sub_epi32(in[3], in[13]);
     552           0 :   u[2] = _mm_add_epi32(in[5], in[11]);
     553           0 :   u[5] = _mm_sub_epi32(in[5], in[11]);
     554           0 :   u[3] = _mm_add_epi32(in[7], in[9]);
     555           0 :   v[4] = _mm_sub_epi32(in[7], in[9]);  // v[4]
     556             : 
     557             :   // stage 2
     558           0 :   v[0] = _mm_add_epi32(u[0], u[3]);
     559           0 :   v[3] = _mm_sub_epi32(u[0], u[3]);
     560           0 :   v[1] = _mm_add_epi32(u[1], u[2]);
     561           0 :   v[2] = _mm_sub_epi32(u[1], u[2]);
     562             : 
     563           0 :   v[5] = _mm_mullo_epi32(u[5], cospim32);
     564           0 :   v[6] = _mm_mullo_epi32(u[6], cospi32);
     565           0 :   v[5] = _mm_add_epi32(v[5], v[6]);
     566           0 :   v[5] = _mm_add_epi32(v[5], rnding);
     567           0 :   v[5] = _mm_srai_epi32(v[5], bit);
     568             : 
     569           0 :   u[0] = _mm_mullo_epi32(u[5], cospi32);
     570           0 :   v[6] = _mm_mullo_epi32(u[6], cospim32);
     571           0 :   v[6] = _mm_sub_epi32(u[0], v[6]);
     572           0 :   v[6] = _mm_add_epi32(v[6], rnding);
     573           0 :   v[6] = _mm_srai_epi32(v[6], bit);
     574             : 
     575             :   // stage 3
     576             :   // type 0
     577           0 :   v[0] = _mm_mullo_epi32(v[0], cospi32);
     578           0 :   v[1] = _mm_mullo_epi32(v[1], cospi32);
     579           0 :   u[0] = _mm_add_epi32(v[0], v[1]);
     580           0 :   u[0] = _mm_add_epi32(u[0], rnding);
     581           0 :   u[0] = _mm_srai_epi32(u[0], bit);
     582             : 
     583           0 :   u[1] = _mm_sub_epi32(v[0], v[1]);
     584           0 :   u[1] = _mm_add_epi32(u[1], rnding);
     585           0 :   u[1] = _mm_srai_epi32(u[1], bit);
     586             : 
     587             :   // type 1
     588           0 :   v[0] = _mm_mullo_epi32(v[2], cospi48);
     589           0 :   v[1] = _mm_mullo_epi32(v[3], cospi16);
     590           0 :   u[2] = _mm_add_epi32(v[0], v[1]);
     591           0 :   u[2] = _mm_add_epi32(u[2], rnding);
     592           0 :   u[2] = _mm_srai_epi32(u[2], bit);
     593             : 
     594           0 :   v[0] = _mm_mullo_epi32(v[2], cospi16);
     595           0 :   v[1] = _mm_mullo_epi32(v[3], cospi48);
     596           0 :   u[3] = _mm_sub_epi32(v[1], v[0]);
     597           0 :   u[3] = _mm_add_epi32(u[3], rnding);
     598           0 :   u[3] = _mm_srai_epi32(u[3], bit);
     599             : 
     600           0 :   u[4] = _mm_add_epi32(v[4], v[5]);
     601           0 :   u[5] = _mm_sub_epi32(v[4], v[5]);
     602           0 :   u[6] = _mm_sub_epi32(v[7], v[6]);
     603           0 :   u[7] = _mm_add_epi32(v[7], v[6]);
     604             : 
     605             :   // stage 4
     606             :   // stage 5
     607           0 :   v[0] = _mm_mullo_epi32(u[4], cospi56);
     608           0 :   v[1] = _mm_mullo_epi32(u[7], cospi8);
     609           0 :   v[0] = _mm_add_epi32(v[0], v[1]);
     610           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     611           0 :   out[3] = _mm_srai_epi32(v[0], bit);  // buf0[4]
     612             : 
     613           0 :   v[0] = _mm_mullo_epi32(u[4], cospi8);
     614           0 :   v[1] = _mm_mullo_epi32(u[7], cospi56);
     615           0 :   v[0] = _mm_sub_epi32(v[1], v[0]);
     616           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     617           0 :   out[15] = _mm_srai_epi32(v[0], bit);  // buf0[7]
     618             : 
     619           0 :   v[0] = _mm_mullo_epi32(u[5], cospi24);
     620           0 :   v[1] = _mm_mullo_epi32(u[6], cospi40);
     621           0 :   v[0] = _mm_add_epi32(v[0], v[1]);
     622           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     623           0 :   out[11] = _mm_srai_epi32(v[0], bit);  // buf0[5]
     624             : 
     625           0 :   v[0] = _mm_mullo_epi32(u[5], cospi40);
     626           0 :   v[1] = _mm_mullo_epi32(u[6], cospi24);
     627           0 :   v[0] = _mm_sub_epi32(v[1], v[0]);
     628           0 :   v[0] = _mm_add_epi32(v[0], rnding);
     629           0 :   out[7] = _mm_srai_epi32(v[0], bit);  // buf0[6]
     630             : 
     631           0 :   out[1] = u[0];   // buf0[0]
     632           0 :   out[9] = u[1];   // buf0[1]
     633           0 :   out[5] = u[2];   // buf0[2]
     634           0 :   out[13] = u[3];  // buf0[3]
     635           0 : }
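                     :
                     : // Output layout note: the even half above writes the even out[] slots and
                     : // the odd half writes the odd slots, so out[2 * r] / out[2 * r + 1] pair up
                     : // the two column halves of the same 1-D output row, ready for transpose_8x8().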
     636             : 
     637           0 : static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
     638           0 :   const int32_t *cospi = cospi_arr(bit);
     639           0 :   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
     640           0 :   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
     641           0 :   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
     642           0 :   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
     643           0 :   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
     644           0 :   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
     645           0 :   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
     646           0 :   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
     647           0 :   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
     648           0 :   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
     649           0 :   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
     650           0 :   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
     651           0 :   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
     652           0 :   const __m128i kZero = _mm_setzero_si128();
     653             :   __m128i u[8], v[8], x;
     654             : 
     655             :   // Even 8 points: 0, 2, ..., 14
     656             :   // stage 0
     657             :   // stage 1
     658             :   // stage 2
     659             :   // (1)
     660           0 :   u[0] = _mm_mullo_epi32(in[14], cospi4);
     661           0 :   x = _mm_mullo_epi32(in[0], cospi60);
     662           0 :   u[0] = _mm_add_epi32(u[0], x);
     663           0 :   u[0] = _mm_add_epi32(u[0], rnding);
     664           0 :   u[0] = _mm_srai_epi32(u[0], bit);
     665             : 
     666           0 :   u[1] = _mm_mullo_epi32(in[14], cospi60);
     667           0 :   x = _mm_mullo_epi32(in[0], cospi4);
     668           0 :   u[1] = _mm_sub_epi32(u[1], x);
     669           0 :   u[1] = _mm_add_epi32(u[1], rnding);
     670           0 :   u[1] = _mm_srai_epi32(u[1], bit);
     671             : 
     672             :   // (2)
     673           0 :   u[2] = _mm_mullo_epi32(in[10], cospi20);
     674           0 :   x = _mm_mullo_epi32(in[4], cospi44);
     675           0 :   u[2] = _mm_add_epi32(u[2], x);
     676           0 :   u[2] = _mm_add_epi32(u[2], rnding);
     677           0 :   u[2] = _mm_srai_epi32(u[2], bit);
     678             : 
     679           0 :   u[3] = _mm_mullo_epi32(in[10], cospi44);
     680           0 :   x = _mm_mullo_epi32(in[4], cospi20);
     681           0 :   u[3] = _mm_sub_epi32(u[3], x);
     682           0 :   u[3] = _mm_add_epi32(u[3], rnding);
     683           0 :   u[3] = _mm_srai_epi32(u[3], bit);
     684             : 
     685             :   // (3)
     686           0 :   u[4] = _mm_mullo_epi32(in[6], cospi36);
     687           0 :   x = _mm_mullo_epi32(in[8], cospi28);
     688           0 :   u[4] = _mm_add_epi32(u[4], x);
     689           0 :   u[4] = _mm_add_epi32(u[4], rnding);
     690           0 :   u[4] = _mm_srai_epi32(u[4], bit);
     691             : 
     692           0 :   u[5] = _mm_mullo_epi32(in[6], cospi28);
     693           0 :   x = _mm_mullo_epi32(in[8], cospi36);
     694           0 :   u[5] = _mm_sub_epi32(u[5], x);
     695           0 :   u[5] = _mm_add_epi32(u[5], rnding);
     696           0 :   u[5] = _mm_srai_epi32(u[5], bit);
     697             : 
     698             :   // (4)
     699           0 :   u[6] = _mm_mullo_epi32(in[2], cospi52);
     700           0 :   x = _mm_mullo_epi32(in[12], cospi12);
     701           0 :   u[6] = _mm_add_epi32(u[6], x);
     702           0 :   u[6] = _mm_add_epi32(u[6], rnding);
     703           0 :   u[6] = _mm_srai_epi32(u[6], bit);
     704             : 
     705           0 :   u[7] = _mm_mullo_epi32(in[2], cospi12);
     706           0 :   x = _mm_mullo_epi32(in[12], cospi52);
     707           0 :   u[7] = _mm_sub_epi32(u[7], x);
     708           0 :   u[7] = _mm_add_epi32(u[7], rnding);
     709           0 :   u[7] = _mm_srai_epi32(u[7], bit);
     710             : 
     711             :   // stage 3
     712           0 :   v[0] = _mm_add_epi32(u[0], u[4]);
     713           0 :   v[4] = _mm_sub_epi32(u[0], u[4]);
     714           0 :   v[1] = _mm_add_epi32(u[1], u[5]);
     715           0 :   v[5] = _mm_sub_epi32(u[1], u[5]);
     716           0 :   v[2] = _mm_add_epi32(u[2], u[6]);
     717           0 :   v[6] = _mm_sub_epi32(u[2], u[6]);
     718           0 :   v[3] = _mm_add_epi32(u[3], u[7]);
     719           0 :   v[7] = _mm_sub_epi32(u[3], u[7]);
     720             : 
     721             :   // stage 4
     722           0 :   u[0] = v[0];
     723           0 :   u[1] = v[1];
     724           0 :   u[2] = v[2];
     725           0 :   u[3] = v[3];
     726             : 
     727           0 :   u[4] = _mm_mullo_epi32(v[4], cospi16);
     728           0 :   x = _mm_mullo_epi32(v[5], cospi48);
     729           0 :   u[4] = _mm_add_epi32(u[4], x);
     730           0 :   u[4] = _mm_add_epi32(u[4], rnding);
     731           0 :   u[4] = _mm_srai_epi32(u[4], bit);
     732             : 
     733           0 :   u[5] = _mm_mullo_epi32(v[4], cospi48);
     734           0 :   x = _mm_mullo_epi32(v[5], cospi16);
     735           0 :   u[5] = _mm_sub_epi32(u[5], x);
     736           0 :   u[5] = _mm_add_epi32(u[5], rnding);
     737           0 :   u[5] = _mm_srai_epi32(u[5], bit);
     738             : 
     739           0 :   u[6] = _mm_mullo_epi32(v[6], cospim48);
     740           0 :   x = _mm_mullo_epi32(v[7], cospi16);
     741           0 :   u[6] = _mm_add_epi32(u[6], x);
     742           0 :   u[6] = _mm_add_epi32(u[6], rnding);
     743           0 :   u[6] = _mm_srai_epi32(u[6], bit);
     744             : 
     745           0 :   u[7] = _mm_mullo_epi32(v[6], cospi16);
     746           0 :   x = _mm_mullo_epi32(v[7], cospim48);
     747           0 :   u[7] = _mm_sub_epi32(u[7], x);
     748           0 :   u[7] = _mm_add_epi32(u[7], rnding);
     749           0 :   u[7] = _mm_srai_epi32(u[7], bit);
     750             : 
     751             :   // stage 5
     752           0 :   v[0] = _mm_add_epi32(u[0], u[2]);
     753           0 :   v[2] = _mm_sub_epi32(u[0], u[2]);
     754           0 :   v[1] = _mm_add_epi32(u[1], u[3]);
     755           0 :   v[3] = _mm_sub_epi32(u[1], u[3]);
     756           0 :   v[4] = _mm_add_epi32(u[4], u[6]);
     757           0 :   v[6] = _mm_sub_epi32(u[4], u[6]);
     758           0 :   v[5] = _mm_add_epi32(u[5], u[7]);
     759           0 :   v[7] = _mm_sub_epi32(u[5], u[7]);
     760             : 
     761             :   // stage 6
     762           0 :   u[0] = v[0];
     763           0 :   u[1] = v[1];
     764           0 :   u[4] = v[4];
     765           0 :   u[5] = v[5];
     766             : 
     767           0 :   v[0] = _mm_mullo_epi32(v[2], cospi32);
     768           0 :   x = _mm_mullo_epi32(v[3], cospi32);
     769           0 :   u[2] = _mm_add_epi32(v[0], x);
     770           0 :   u[2] = _mm_add_epi32(u[2], rnding);
     771           0 :   u[2] = _mm_srai_epi32(u[2], bit);
     772             : 
     773           0 :   u[3] = _mm_sub_epi32(v[0], x);
     774           0 :   u[3] = _mm_add_epi32(u[3], rnding);
     775           0 :   u[3] = _mm_srai_epi32(u[3], bit);
     776             : 
     777           0 :   v[0] = _mm_mullo_epi32(v[6], cospi32);
     778           0 :   x = _mm_mullo_epi32(v[7], cospi32);
     779           0 :   u[6] = _mm_add_epi32(v[0], x);
     780           0 :   u[6] = _mm_add_epi32(u[6], rnding);
     781           0 :   u[6] = _mm_srai_epi32(u[6], bit);
     782             : 
     783           0 :   u[7] = _mm_sub_epi32(v[0], x);
     784           0 :   u[7] = _mm_add_epi32(u[7], rnding);
     785           0 :   u[7] = _mm_srai_epi32(u[7], bit);
     786             : 
     787             :   // stage 7
     788           0 :   out[0] = u[0];
     789           0 :   out[2] = _mm_sub_epi32(kZero, u[4]);
     790           0 :   out[4] = u[6];
     791           0 :   out[6] = _mm_sub_epi32(kZero, u[2]);
     792           0 :   out[8] = u[3];
     793           0 :   out[10] = _mm_sub_epi32(kZero, u[7]);
     794           0 :   out[12] = u[5];
     795           0 :   out[14] = _mm_sub_epi32(kZero, u[1]);
     796             : 
     797             :   // Odd 8 points: 1, 3, ..., 15
     798             :   // stage 0
     799             :   // stage 1
     800             :   // stage 2
     801             :   // (1)
     802           0 :   u[0] = _mm_mullo_epi32(in[15], cospi4);
     803           0 :   x = _mm_mullo_epi32(in[1], cospi60);
     804           0 :   u[0] = _mm_add_epi32(u[0], x);
     805           0 :   u[0] = _mm_add_epi32(u[0], rnding);
     806           0 :   u[0] = _mm_srai_epi32(u[0], bit);
     807             : 
     808           0 :   u[1] = _mm_mullo_epi32(in[15], cospi60);
     809           0 :   x = _mm_mullo_epi32(in[1], cospi4);
     810           0 :   u[1] = _mm_sub_epi32(u[1], x);
     811           0 :   u[1] = _mm_add_epi32(u[1], rnding);
     812           0 :   u[1] = _mm_srai_epi32(u[1], bit);
     813             : 
     814             :   // (2)
     815           0 :   u[2] = _mm_mullo_epi32(in[11], cospi20);
     816           0 :   x = _mm_mullo_epi32(in[5], cospi44);
     817           0 :   u[2] = _mm_add_epi32(u[2], x);
     818           0 :   u[2] = _mm_add_epi32(u[2], rnding);
     819           0 :   u[2] = _mm_srai_epi32(u[2], bit);
     820             : 
     821           0 :   u[3] = _mm_mullo_epi32(in[11], cospi44);
     822           0 :   x = _mm_mullo_epi32(in[5], cospi20);
     823           0 :   u[3] = _mm_sub_epi32(u[3], x);
     824           0 :   u[3] = _mm_add_epi32(u[3], rnding);
     825           0 :   u[3] = _mm_srai_epi32(u[3], bit);
     826             : 
     827             :   // (3)
     828           0 :   u[4] = _mm_mullo_epi32(in[7], cospi36);
     829           0 :   x = _mm_mullo_epi32(in[9], cospi28);
     830           0 :   u[4] = _mm_add_epi32(u[4], x);
     831           0 :   u[4] = _mm_add_epi32(u[4], rnding);
     832           0 :   u[4] = _mm_srai_epi32(u[4], bit);
     833             : 
     834           0 :   u[5] = _mm_mullo_epi32(in[7], cospi28);
     835           0 :   x = _mm_mullo_epi32(in[9], cospi36);
     836           0 :   u[5] = _mm_sub_epi32(u[5], x);
     837           0 :   u[5] = _mm_add_epi32(u[5], rnding);
     838           0 :   u[5] = _mm_srai_epi32(u[5], bit);
     839             : 
     840             :   // (4)
     841           0 :   u[6] = _mm_mullo_epi32(in[3], cospi52);
     842           0 :   x = _mm_mullo_epi32(in[13], cospi12);
     843           0 :   u[6] = _mm_add_epi32(u[6], x);
     844           0 :   u[6] = _mm_add_epi32(u[6], rnding);
     845           0 :   u[6] = _mm_srai_epi32(u[6], bit);
     846             : 
     847           0 :   u[7] = _mm_mullo_epi32(in[3], cospi12);
     848           0 :   x = _mm_mullo_epi32(in[13], cospi52);
     849           0 :   u[7] = _mm_sub_epi32(u[7], x);
     850           0 :   u[7] = _mm_add_epi32(u[7], rnding);
     851           0 :   u[7] = _mm_srai_epi32(u[7], bit);
     852             : 
     853             :   // stage 3
     854           0 :   v[0] = _mm_add_epi32(u[0], u[4]);
     855           0 :   v[4] = _mm_sub_epi32(u[0], u[4]);
     856           0 :   v[1] = _mm_add_epi32(u[1], u[5]);
     857           0 :   v[5] = _mm_sub_epi32(u[1], u[5]);
     858           0 :   v[2] = _mm_add_epi32(u[2], u[6]);
     859           0 :   v[6] = _mm_sub_epi32(u[2], u[6]);
     860           0 :   v[3] = _mm_add_epi32(u[3], u[7]);
     861           0 :   v[7] = _mm_sub_epi32(u[3], u[7]);
     862             : 
     863             :   // stage 4
     864           0 :   u[0] = v[0];
     865           0 :   u[1] = v[1];
     866           0 :   u[2] = v[2];
     867           0 :   u[3] = v[3];
     868             : 
     869           0 :   u[4] = _mm_mullo_epi32(v[4], cospi16);
     870           0 :   x = _mm_mullo_epi32(v[5], cospi48);
     871           0 :   u[4] = _mm_add_epi32(u[4], x);
     872           0 :   u[4] = _mm_add_epi32(u[4], rnding);
     873           0 :   u[4] = _mm_srai_epi32(u[4], bit);
     874             : 
     875           0 :   u[5] = _mm_mullo_epi32(v[4], cospi48);
     876           0 :   x = _mm_mullo_epi32(v[5], cospi16);
     877           0 :   u[5] = _mm_sub_epi32(u[5], x);
     878           0 :   u[5] = _mm_add_epi32(u[5], rnding);
     879           0 :   u[5] = _mm_srai_epi32(u[5], bit);
     880             : 
     881           0 :   u[6] = _mm_mullo_epi32(v[6], cospim48);
     882           0 :   x = _mm_mullo_epi32(v[7], cospi16);
     883           0 :   u[6] = _mm_add_epi32(u[6], x);
     884           0 :   u[6] = _mm_add_epi32(u[6], rnding);
     885           0 :   u[6] = _mm_srai_epi32(u[6], bit);
     886             : 
     887           0 :   u[7] = _mm_mullo_epi32(v[6], cospi16);
     888           0 :   x = _mm_mullo_epi32(v[7], cospim48);
     889           0 :   u[7] = _mm_sub_epi32(u[7], x);
     890           0 :   u[7] = _mm_add_epi32(u[7], rnding);
     891           0 :   u[7] = _mm_srai_epi32(u[7], bit);
     892             : 
     893             :   // stage 5
     894           0 :   v[0] = _mm_add_epi32(u[0], u[2]);
     895           0 :   v[2] = _mm_sub_epi32(u[0], u[2]);
     896           0 :   v[1] = _mm_add_epi32(u[1], u[3]);
     897           0 :   v[3] = _mm_sub_epi32(u[1], u[3]);
     898           0 :   v[4] = _mm_add_epi32(u[4], u[6]);
     899           0 :   v[6] = _mm_sub_epi32(u[4], u[6]);
     900           0 :   v[5] = _mm_add_epi32(u[5], u[7]);
     901           0 :   v[7] = _mm_sub_epi32(u[5], u[7]);
     902             : 
     903             :   // stage 6
     904           0 :   u[0] = v[0];
     905           0 :   u[1] = v[1];
     906           0 :   u[4] = v[4];
     907           0 :   u[5] = v[5];
     908             : 
     909           0 :   v[0] = _mm_mullo_epi32(v[2], cospi32);
     910           0 :   x = _mm_mullo_epi32(v[3], cospi32);
     911           0 :   u[2] = _mm_add_epi32(v[0], x);
     912           0 :   u[2] = _mm_add_epi32(u[2], rnding);
     913           0 :   u[2] = _mm_srai_epi32(u[2], bit);
     914             : 
     915           0 :   u[3] = _mm_sub_epi32(v[0], x);
     916           0 :   u[3] = _mm_add_epi32(u[3], rnding);
     917           0 :   u[3] = _mm_srai_epi32(u[3], bit);
     918             : 
     919           0 :   v[0] = _mm_mullo_epi32(v[6], cospi32);
     920           0 :   x = _mm_mullo_epi32(v[7], cospi32);
     921           0 :   u[6] = _mm_add_epi32(v[0], x);
     922           0 :   u[6] = _mm_add_epi32(u[6], rnding);
     923           0 :   u[6] = _mm_srai_epi32(u[6], bit);
     924             : 
     925           0 :   u[7] = _mm_sub_epi32(v[0], x);
     926           0 :   u[7] = _mm_add_epi32(u[7], rnding);
     927           0 :   u[7] = _mm_srai_epi32(u[7], bit);
     928             : 
     929             :   // stage 7
     930           0 :   out[1] = u[0];
     931           0 :   out[3] = _mm_sub_epi32(kZero, u[4]);
     932           0 :   out[5] = u[6];
     933           0 :   out[7] = _mm_sub_epi32(kZero, u[2]);
     934           0 :   out[9] = u[3];
     935           0 :   out[11] = _mm_sub_epi32(kZero, u[7]);
     936           0 :   out[13] = u[5];
     937           0 :   out[15] = _mm_sub_epi32(kZero, u[1]);
     938           0 : }
     939             : 
     940           0 : void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
     941             :                                int tx_type, int bd) {
     942             :   __m128i in[16], out[16];
     943           0 :   const TXFM_1D_CFG *row_cfg = NULL;
     944           0 :   const TXFM_1D_CFG *col_cfg = NULL;
     945             : 
     946           0 :   switch (tx_type) {
     947             :     case DCT_DCT:
     948           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
     949           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
     950           0 :       load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
     951           0 :       fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
     952           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
     953           0 :       transpose_8x8(out, in);
     954           0 :       fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
     955           0 :       transpose_8x8(out, in);
     956           0 :       write_buffer_8x8(in, coeff);
     957           0 :       break;
     958             :     case ADST_DCT:
     959           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
     960           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
     961           0 :       load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
     962           0 :       fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
     963           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
     964           0 :       transpose_8x8(out, in);
     965           0 :       fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
     966           0 :       transpose_8x8(out, in);
     967           0 :       write_buffer_8x8(in, coeff);
     968           0 :       break;
     969             :     case DCT_ADST:
     970           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
     971           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
     972           0 :       load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
     973           0 :       fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
     974           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
     975           0 :       transpose_8x8(out, in);
     976           0 :       fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
     977           0 :       transpose_8x8(out, in);
     978           0 :       write_buffer_8x8(in, coeff);
     979           0 :       break;
     980             :     case ADST_ADST:
     981           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
     982           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
     983           0 :       load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
     984           0 :       fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
     985           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
     986           0 :       transpose_8x8(out, in);
     987           0 :       fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
     988           0 :       transpose_8x8(out, in);
     989           0 :       write_buffer_8x8(in, coeff);
     990           0 :       break;
     991             : #if CONFIG_EXT_TX
     992             :     case FLIPADST_DCT:
     993           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
     994           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
     995           0 :       load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
     996           0 :       fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
     997           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
     998           0 :       transpose_8x8(out, in);
     999           0 :       fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
    1000           0 :       transpose_8x8(out, in);
    1001           0 :       write_buffer_8x8(in, coeff);
    1002           0 :       break;
    1003             :     case DCT_FLIPADST:
    1004           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
    1005           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
    1006           0 :       load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
    1007           0 :       fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
    1008           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
    1009           0 :       transpose_8x8(out, in);
    1010           0 :       fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
    1011           0 :       transpose_8x8(out, in);
    1012           0 :       write_buffer_8x8(in, coeff);
    1013           0 :       break;
    1014             :     case FLIPADST_FLIPADST:
    1015           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
    1016           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
    1017           0 :       load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]);
    1018           0 :       fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
    1019           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
    1020           0 :       transpose_8x8(out, in);
    1021           0 :       fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
    1022           0 :       transpose_8x8(out, in);
    1023           0 :       write_buffer_8x8(in, coeff);
    1024           0 :       break;
    1025             :     case ADST_FLIPADST:
    1026           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
    1027           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
    1028           0 :       load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
    1029           0 :       fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
    1030           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
    1031           0 :       transpose_8x8(out, in);
    1032           0 :       fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
    1033           0 :       transpose_8x8(out, in);
    1034           0 :       write_buffer_8x8(in, coeff);
    1035           0 :       break;
    1036             :     case FLIPADST_ADST:
    1037           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
    1038           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
    1039           0 :       load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
    1040           0 :       fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
    1041           0 :       col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
    1042           0 :       transpose_8x8(out, in);
    1043           0 :       fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
    1044           0 :       transpose_8x8(out, in);
    1045           0 :       write_buffer_8x8(in, coeff);
    1046           0 :       break;
    1047             : #endif  // CONFIG_EXT_TX
    1048           0 :     default: assert(0);
    1049             :   }
    1050             :   (void)bd;
    1051           0 : }
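
Every tx_type case in the switch above follows the same pipeline: load and pre-shift the residuals, run the column transform, round-shift, transpose, run the row transform, transpose back, and write the coefficients out. Below is a minimal scalar sketch of that flow, assuming hypothetical helper names (only the structure mirrors the SSE4.1 code; shift1 is taken to be negative, which the -row_cfg->shift[1] at the call sites suggests).

    #include <stdint.h>

    /* Illustrative stand-ins only -- not the real libaom helpers. */
    typedef void (*txfm_1d_fn)(int32_t *block, int size, int cos_bit);

    static void transpose_block(int32_t *b, int n) {
      for (int r = 0; r < n; ++r)
        for (int c = r + 1; c < n; ++c) {
          int32_t t = b[r * n + c];
          b[r * n + c] = b[c * n + r];
          b[c * n + r] = t;
        }
    }

    static void fwd_txfm2d_sketch(const int16_t *input, int32_t *coeff,
                                  int stride, int n, txfm_1d_fn col_txfm,
                                  txfm_1d_fn row_txfm, int shift0, int shift1,
                                  int cos_bit) {
      int32_t buf[16 * 16]; /* big enough for n <= 16 */
      for (int r = 0; r < n; ++r)      /* load_buffer_nxn: widen, pre-shift */
        for (int c = 0; c < n; ++c)
          buf[r * n + c] = (int32_t)input[r * stride + c] << shift0;
      col_txfm(buf, n, cos_bit);       /* column transform                  */
      for (int i = 0; i < n * n; ++i)  /* col_txfm_nxn_rounding, shift1 < 0 */
        buf[i] = (buf[i] + (1 << (-shift1 - 1))) >> -shift1;
      transpose_block(buf, n);
      row_txfm(buf, n, cos_bit);       /* row transform                     */
      transpose_block(buf, n);         /* back to row-major                 */
      for (int i = 0; i < n * n; ++i) coeff[i] = buf[i];
    }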
    1052             : 
    1053             : // Hybrid Transform 16x16
    1054             : 
    1055           0 : static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
    1056           0 :   int row_index = 0;
    1057           0 :   int dst_index = 0;
    1058           0 :   int src_index = 0;
    1059             : 
     1060             :   // rows 0, 1, ..., 7
    1061             :   do {
    1062           0 :     out[dst_index] = in[src_index];
    1063           0 :     out[dst_index + 1] = in[src_index + 1];
    1064           0 :     out[dst_index + 2] = in[src_index + 16];
    1065           0 :     out[dst_index + 3] = in[src_index + 17];
    1066           0 :     dst_index += 4;
    1067           0 :     src_index += 2;
    1068           0 :     row_index += 1;
    1069           0 :   } while (row_index < 8);
    1070             : 
     1071             :   // rows 8, 9, ..., 15
    1072           0 :   src_index += 16;
    1073             :   do {
    1074           0 :     out[dst_index] = in[src_index];
    1075           0 :     out[dst_index + 1] = in[src_index + 1];
    1076           0 :     out[dst_index + 2] = in[src_index + 16];
    1077           0 :     out[dst_index + 3] = in[src_index + 17];
    1078           0 :     dst_index += 4;
    1079           0 :     src_index += 2;
    1080           0 :     row_index += 1;
    1081           0 :   } while (row_index < 16);
    1082           0 : }
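
convert_8x8_to_16x16 re-threads the four 8x8 sub-blocks, which are stored back to back as 16 __m128i each (topL at in[0], topR at in[16], botL at in[32], botR at in[48]), into one row-major 16x16 layout with four __m128i per row. The same shuffle written over plain int32_t lanes, as a sketch only:

    #include <stdint.h>

    /* Scalar picture of the interleave above: four row-major 8x8 blocks of
     * 32-bit values become one row-major 16x16 block. */
    static void convert_8x8_to_16x16_sketch(const int32_t in[4 * 64],
                                            int32_t out[256]) {
      const int32_t *topL = in + 0 * 64, *topR = in + 1 * 64;
      const int32_t *botL = in + 2 * 64, *botR = in + 3 * 64;
      for (int r = 0; r < 8; ++r) {
        for (int c = 0; c < 8; ++c) {
          out[r * 16 + c] = topL[r * 8 + c];
          out[r * 16 + 8 + c] = topR[r * 8 + c];
          out[(r + 8) * 16 + c] = botL[r * 8 + c];
          out[(r + 8) * 16 + 8 + c] = botR[r * 8 + c];
        }
      }
    }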
    1083             : 
    1084           0 : static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
    1085             :                                      int stride, int flipud, int fliplr,
    1086             :                                      int shift) {
    1087             :   __m128i in[64];
    1088             :   // Load 4 8x8 blocks
    1089           0 :   const int16_t *topL = input;
    1090           0 :   const int16_t *topR = input + 8;
    1091           0 :   const int16_t *botL = input + 8 * stride;
    1092           0 :   const int16_t *botR = input + 8 * stride + 8;
    1093             : 
    1094             :   const int16_t *tmp;
    1095             : 
    1096           0 :   if (flipud) {
    1097             :     // Swap left columns
    1098           0 :     tmp = topL;
    1099           0 :     topL = botL;
    1100           0 :     botL = tmp;
    1101             :     // Swap right columns
    1102           0 :     tmp = topR;
    1103           0 :     topR = botR;
    1104           0 :     botR = tmp;
    1105             :   }
    1106             : 
    1107           0 :   if (fliplr) {
    1108             :     // Swap top rows
    1109           0 :     tmp = topL;
    1110           0 :     topL = topR;
    1111           0 :     topR = tmp;
    1112             :     // Swap bottom rows
    1113           0 :     tmp = botL;
    1114           0 :     botL = botR;
    1115           0 :     botR = tmp;
    1116             :   }
    1117             : 
    1118             :   // load first 8 columns
    1119           0 :   load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
    1120           0 :   load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
    1121             : 
    1122             :   // load second 8 columns
    1123           0 :   load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
    1124           0 :   load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
    1125             : 
    1126           0 :   convert_8x8_to_16x16(in, out);
    1127           0 : }
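
The flips above rely on a simple decomposition: flipping a 16x16 block up/down (or left/right) is the same as swapping the two 8x8 block halves and then flipping each 8x8 block, which is exactly what the pointer swaps plus the flipud/fliplr flags passed straight through to load_buffer_8x8 achieve. A small self-check of the left/right case (index arithmetic only, sketch):

    #include <assert.h>

    /* For every column c of a 16-wide row, a full 16-wide mirror lands on
     * 15 - c; that equals "swap the 8-wide halves, then mirror within the
     * destination half". */
    static void check_fliplr_decomposition(void) {
      for (int c = 0; c < 16; ++c) {
        int full_flip = 15 - c;
        int dst_half = (c < 8) ? 1 : 0;   /* left half mirrors into right */
        int within = 7 - (c & 7);         /* 8-wide mirror inside a half  */
        assert(full_flip == dst_half * 8 + within);
      }
    }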
    1128             : 
    1129           0 : static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
    1130           0 :   const int32_t *cospi = cospi_arr(bit);
    1131           0 :   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    1132           0 :   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
    1133           0 :   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    1134           0 :   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    1135           0 :   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    1136           0 :   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    1137           0 :   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    1138           0 :   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    1139           0 :   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    1140           0 :   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    1141           0 :   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
    1142           0 :   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
    1143           0 :   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
    1144           0 :   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
    1145           0 :   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
    1146           0 :   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
    1147           0 :   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
    1148           0 :   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
    1149           0 :   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    1150             :   __m128i u[16], v[16], x;
    1151           0 :   const int col_num = 4;
    1152             :   int col;
    1153             : 
     1154             :   // Calculate columns 0, 1, 2, 3
    1155           0 :   for (col = 0; col < col_num; ++col) {
    1156             :     // stage 0
    1157             :     // stage 1
    1158           0 :     u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    1159           0 :     u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    1160           0 :     u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    1161           0 :     u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    1162           0 :     u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    1163           0 :     u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    1164           0 :     u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    1165           0 :     u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    1166           0 :     u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    1167           0 :     u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    1168           0 :     u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    1169           0 :     u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    1170           0 :     u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    1171           0 :     u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    1172           0 :     u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    1173           0 :     u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    1174             : 
    1175             :     // stage 2
    1176           0 :     v[0] = _mm_add_epi32(u[0], u[7]);
    1177           0 :     v[7] = _mm_sub_epi32(u[0], u[7]);
    1178           0 :     v[1] = _mm_add_epi32(u[1], u[6]);
    1179           0 :     v[6] = _mm_sub_epi32(u[1], u[6]);
    1180           0 :     v[2] = _mm_add_epi32(u[2], u[5]);
    1181           0 :     v[5] = _mm_sub_epi32(u[2], u[5]);
    1182           0 :     v[3] = _mm_add_epi32(u[3], u[4]);
    1183           0 :     v[4] = _mm_sub_epi32(u[3], u[4]);
    1184           0 :     v[8] = u[8];
    1185           0 :     v[9] = u[9];
    1186             : 
    1187           0 :     v[10] = _mm_mullo_epi32(u[10], cospim32);
    1188           0 :     x = _mm_mullo_epi32(u[13], cospi32);
    1189           0 :     v[10] = _mm_add_epi32(v[10], x);
    1190           0 :     v[10] = _mm_add_epi32(v[10], rnding);
    1191           0 :     v[10] = _mm_srai_epi32(v[10], bit);
    1192             : 
    1193           0 :     v[13] = _mm_mullo_epi32(u[10], cospi32);
    1194           0 :     x = _mm_mullo_epi32(u[13], cospim32);
    1195           0 :     v[13] = _mm_sub_epi32(v[13], x);
    1196           0 :     v[13] = _mm_add_epi32(v[13], rnding);
    1197           0 :     v[13] = _mm_srai_epi32(v[13], bit);
    1198             : 
    1199           0 :     v[11] = _mm_mullo_epi32(u[11], cospim32);
    1200           0 :     x = _mm_mullo_epi32(u[12], cospi32);
    1201           0 :     v[11] = _mm_add_epi32(v[11], x);
    1202           0 :     v[11] = _mm_add_epi32(v[11], rnding);
    1203           0 :     v[11] = _mm_srai_epi32(v[11], bit);
    1204             : 
    1205           0 :     v[12] = _mm_mullo_epi32(u[11], cospi32);
    1206           0 :     x = _mm_mullo_epi32(u[12], cospim32);
    1207           0 :     v[12] = _mm_sub_epi32(v[12], x);
    1208           0 :     v[12] = _mm_add_epi32(v[12], rnding);
    1209           0 :     v[12] = _mm_srai_epi32(v[12], bit);
    1210           0 :     v[14] = u[14];
    1211           0 :     v[15] = u[15];
    1212             : 
    1213             :     // stage 3
    1214           0 :     u[0] = _mm_add_epi32(v[0], v[3]);
    1215           0 :     u[3] = _mm_sub_epi32(v[0], v[3]);
    1216           0 :     u[1] = _mm_add_epi32(v[1], v[2]);
    1217           0 :     u[2] = _mm_sub_epi32(v[1], v[2]);
    1218           0 :     u[4] = v[4];
    1219             : 
    1220           0 :     u[5] = _mm_mullo_epi32(v[5], cospim32);
    1221           0 :     x = _mm_mullo_epi32(v[6], cospi32);
    1222           0 :     u[5] = _mm_add_epi32(u[5], x);
    1223           0 :     u[5] = _mm_add_epi32(u[5], rnding);
    1224           0 :     u[5] = _mm_srai_epi32(u[5], bit);
    1225             : 
    1226           0 :     u[6] = _mm_mullo_epi32(v[5], cospi32);
    1227           0 :     x = _mm_mullo_epi32(v[6], cospim32);
    1228           0 :     u[6] = _mm_sub_epi32(u[6], x);
    1229           0 :     u[6] = _mm_add_epi32(u[6], rnding);
    1230           0 :     u[6] = _mm_srai_epi32(u[6], bit);
    1231             : 
    1232           0 :     u[7] = v[7];
    1233           0 :     u[8] = _mm_add_epi32(v[8], v[11]);
    1234           0 :     u[11] = _mm_sub_epi32(v[8], v[11]);
    1235           0 :     u[9] = _mm_add_epi32(v[9], v[10]);
    1236           0 :     u[10] = _mm_sub_epi32(v[9], v[10]);
    1237           0 :     u[12] = _mm_sub_epi32(v[15], v[12]);
    1238           0 :     u[15] = _mm_add_epi32(v[15], v[12]);
    1239           0 :     u[13] = _mm_sub_epi32(v[14], v[13]);
    1240           0 :     u[14] = _mm_add_epi32(v[14], v[13]);
    1241             : 
    1242             :     // stage 4
    1243           0 :     u[0] = _mm_mullo_epi32(u[0], cospi32);
    1244           0 :     u[1] = _mm_mullo_epi32(u[1], cospi32);
    1245           0 :     v[0] = _mm_add_epi32(u[0], u[1]);
    1246           0 :     v[0] = _mm_add_epi32(v[0], rnding);
    1247           0 :     v[0] = _mm_srai_epi32(v[0], bit);
    1248             : 
    1249           0 :     v[1] = _mm_sub_epi32(u[0], u[1]);
    1250           0 :     v[1] = _mm_add_epi32(v[1], rnding);
    1251           0 :     v[1] = _mm_srai_epi32(v[1], bit);
    1252             : 
    1253           0 :     v[2] = _mm_mullo_epi32(u[2], cospi48);
    1254           0 :     x = _mm_mullo_epi32(u[3], cospi16);
    1255           0 :     v[2] = _mm_add_epi32(v[2], x);
    1256           0 :     v[2] = _mm_add_epi32(v[2], rnding);
    1257           0 :     v[2] = _mm_srai_epi32(v[2], bit);
    1258             : 
    1259           0 :     v[3] = _mm_mullo_epi32(u[2], cospi16);
    1260           0 :     x = _mm_mullo_epi32(u[3], cospi48);
    1261           0 :     v[3] = _mm_sub_epi32(x, v[3]);
    1262           0 :     v[3] = _mm_add_epi32(v[3], rnding);
    1263           0 :     v[3] = _mm_srai_epi32(v[3], bit);
    1264             : 
    1265           0 :     v[4] = _mm_add_epi32(u[4], u[5]);
    1266           0 :     v[5] = _mm_sub_epi32(u[4], u[5]);
    1267           0 :     v[6] = _mm_sub_epi32(u[7], u[6]);
    1268           0 :     v[7] = _mm_add_epi32(u[7], u[6]);
    1269           0 :     v[8] = u[8];
    1270             : 
    1271           0 :     v[9] = _mm_mullo_epi32(u[9], cospim16);
    1272           0 :     x = _mm_mullo_epi32(u[14], cospi48);
    1273           0 :     v[9] = _mm_add_epi32(v[9], x);
    1274           0 :     v[9] = _mm_add_epi32(v[9], rnding);
    1275           0 :     v[9] = _mm_srai_epi32(v[9], bit);
    1276             : 
    1277           0 :     v[14] = _mm_mullo_epi32(u[9], cospi48);
    1278           0 :     x = _mm_mullo_epi32(u[14], cospim16);
    1279           0 :     v[14] = _mm_sub_epi32(v[14], x);
    1280           0 :     v[14] = _mm_add_epi32(v[14], rnding);
    1281           0 :     v[14] = _mm_srai_epi32(v[14], bit);
    1282             : 
    1283           0 :     v[10] = _mm_mullo_epi32(u[10], cospim48);
    1284           0 :     x = _mm_mullo_epi32(u[13], cospim16);
    1285           0 :     v[10] = _mm_add_epi32(v[10], x);
    1286           0 :     v[10] = _mm_add_epi32(v[10], rnding);
    1287           0 :     v[10] = _mm_srai_epi32(v[10], bit);
    1288             : 
    1289           0 :     v[13] = _mm_mullo_epi32(u[10], cospim16);
    1290           0 :     x = _mm_mullo_epi32(u[13], cospim48);
    1291           0 :     v[13] = _mm_sub_epi32(v[13], x);
    1292           0 :     v[13] = _mm_add_epi32(v[13], rnding);
    1293           0 :     v[13] = _mm_srai_epi32(v[13], bit);
    1294             : 
    1295           0 :     v[11] = u[11];
    1296           0 :     v[12] = u[12];
    1297           0 :     v[15] = u[15];
    1298             : 
    1299             :     // stage 5
    1300           0 :     u[0] = v[0];
    1301           0 :     u[1] = v[1];
    1302           0 :     u[2] = v[2];
    1303           0 :     u[3] = v[3];
    1304             : 
    1305           0 :     u[4] = _mm_mullo_epi32(v[4], cospi56);
    1306           0 :     x = _mm_mullo_epi32(v[7], cospi8);
    1307           0 :     u[4] = _mm_add_epi32(u[4], x);
    1308           0 :     u[4] = _mm_add_epi32(u[4], rnding);
    1309           0 :     u[4] = _mm_srai_epi32(u[4], bit);
    1310             : 
    1311           0 :     u[7] = _mm_mullo_epi32(v[4], cospi8);
    1312           0 :     x = _mm_mullo_epi32(v[7], cospi56);
    1313           0 :     u[7] = _mm_sub_epi32(x, u[7]);
    1314           0 :     u[7] = _mm_add_epi32(u[7], rnding);
    1315           0 :     u[7] = _mm_srai_epi32(u[7], bit);
    1316             : 
    1317           0 :     u[5] = _mm_mullo_epi32(v[5], cospi24);
    1318           0 :     x = _mm_mullo_epi32(v[6], cospi40);
    1319           0 :     u[5] = _mm_add_epi32(u[5], x);
    1320           0 :     u[5] = _mm_add_epi32(u[5], rnding);
    1321           0 :     u[5] = _mm_srai_epi32(u[5], bit);
    1322             : 
    1323           0 :     u[6] = _mm_mullo_epi32(v[5], cospi40);
    1324           0 :     x = _mm_mullo_epi32(v[6], cospi24);
    1325           0 :     u[6] = _mm_sub_epi32(x, u[6]);
    1326           0 :     u[6] = _mm_add_epi32(u[6], rnding);
    1327           0 :     u[6] = _mm_srai_epi32(u[6], bit);
    1328             : 
    1329           0 :     u[8] = _mm_add_epi32(v[8], v[9]);
    1330           0 :     u[9] = _mm_sub_epi32(v[8], v[9]);
    1331           0 :     u[10] = _mm_sub_epi32(v[11], v[10]);
    1332           0 :     u[11] = _mm_add_epi32(v[11], v[10]);
    1333           0 :     u[12] = _mm_add_epi32(v[12], v[13]);
    1334           0 :     u[13] = _mm_sub_epi32(v[12], v[13]);
    1335           0 :     u[14] = _mm_sub_epi32(v[15], v[14]);
    1336           0 :     u[15] = _mm_add_epi32(v[15], v[14]);
    1337             : 
    1338             :     // stage 6
    1339           0 :     v[0] = u[0];
    1340           0 :     v[1] = u[1];
    1341           0 :     v[2] = u[2];
    1342           0 :     v[3] = u[3];
    1343           0 :     v[4] = u[4];
    1344           0 :     v[5] = u[5];
    1345           0 :     v[6] = u[6];
    1346           0 :     v[7] = u[7];
    1347             : 
    1348           0 :     v[8] = _mm_mullo_epi32(u[8], cospi60);
    1349           0 :     x = _mm_mullo_epi32(u[15], cospi4);
    1350           0 :     v[8] = _mm_add_epi32(v[8], x);
    1351           0 :     v[8] = _mm_add_epi32(v[8], rnding);
    1352           0 :     v[8] = _mm_srai_epi32(v[8], bit);
    1353             : 
    1354           0 :     v[15] = _mm_mullo_epi32(u[8], cospi4);
    1355           0 :     x = _mm_mullo_epi32(u[15], cospi60);
    1356           0 :     v[15] = _mm_sub_epi32(x, v[15]);
    1357           0 :     v[15] = _mm_add_epi32(v[15], rnding);
    1358           0 :     v[15] = _mm_srai_epi32(v[15], bit);
    1359             : 
    1360           0 :     v[9] = _mm_mullo_epi32(u[9], cospi28);
    1361           0 :     x = _mm_mullo_epi32(u[14], cospi36);
    1362           0 :     v[9] = _mm_add_epi32(v[9], x);
    1363           0 :     v[9] = _mm_add_epi32(v[9], rnding);
    1364           0 :     v[9] = _mm_srai_epi32(v[9], bit);
    1365             : 
    1366           0 :     v[14] = _mm_mullo_epi32(u[9], cospi36);
    1367           0 :     x = _mm_mullo_epi32(u[14], cospi28);
    1368           0 :     v[14] = _mm_sub_epi32(x, v[14]);
    1369           0 :     v[14] = _mm_add_epi32(v[14], rnding);
    1370           0 :     v[14] = _mm_srai_epi32(v[14], bit);
    1371             : 
    1372           0 :     v[10] = _mm_mullo_epi32(u[10], cospi44);
    1373           0 :     x = _mm_mullo_epi32(u[13], cospi20);
    1374           0 :     v[10] = _mm_add_epi32(v[10], x);
    1375           0 :     v[10] = _mm_add_epi32(v[10], rnding);
    1376           0 :     v[10] = _mm_srai_epi32(v[10], bit);
    1377             : 
    1378           0 :     v[13] = _mm_mullo_epi32(u[10], cospi20);
    1379           0 :     x = _mm_mullo_epi32(u[13], cospi44);
    1380           0 :     v[13] = _mm_sub_epi32(x, v[13]);
    1381           0 :     v[13] = _mm_add_epi32(v[13], rnding);
    1382           0 :     v[13] = _mm_srai_epi32(v[13], bit);
    1383             : 
    1384           0 :     v[11] = _mm_mullo_epi32(u[11], cospi12);
    1385           0 :     x = _mm_mullo_epi32(u[12], cospi52);
    1386           0 :     v[11] = _mm_add_epi32(v[11], x);
    1387           0 :     v[11] = _mm_add_epi32(v[11], rnding);
    1388           0 :     v[11] = _mm_srai_epi32(v[11], bit);
    1389             : 
    1390           0 :     v[12] = _mm_mullo_epi32(u[11], cospi52);
    1391           0 :     x = _mm_mullo_epi32(u[12], cospi12);
    1392           0 :     v[12] = _mm_sub_epi32(x, v[12]);
    1393           0 :     v[12] = _mm_add_epi32(v[12], rnding);
    1394           0 :     v[12] = _mm_srai_epi32(v[12], bit);
    1395             : 
    1396           0 :     out[0 * col_num + col] = v[0];
    1397           0 :     out[1 * col_num + col] = v[8];
    1398           0 :     out[2 * col_num + col] = v[4];
    1399           0 :     out[3 * col_num + col] = v[12];
    1400           0 :     out[4 * col_num + col] = v[2];
    1401           0 :     out[5 * col_num + col] = v[10];
    1402           0 :     out[6 * col_num + col] = v[6];
    1403           0 :     out[7 * col_num + col] = v[14];
    1404           0 :     out[8 * col_num + col] = v[1];
    1405           0 :     out[9 * col_num + col] = v[9];
    1406           0 :     out[10 * col_num + col] = v[5];
    1407           0 :     out[11 * col_num + col] = v[13];
    1408           0 :     out[12 * col_num + col] = v[3];
    1409           0 :     out[13 * col_num + col] = v[11];
    1410           0 :     out[14 * col_num + col] = v[7];
    1411           0 :     out[15 * col_num + col] = v[15];
    1412             :   }
    1413           0 : }
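
Every cospi pair in the stages above is one fixed-point "half butterfly": two products are combined, a rounding offset of 1 << (bit - 1) is added, and the sum is shifted right by bit (the cos_bit supplied by the transform config). A scalar sketch of the intended arithmetic; the SSE4.1 code does the same computation with 32-bit _mm_mullo_epi32 products, relying on the intermediates staying in range:

    #include <stdint.h>

    /* out = round((w0 * in0 + w1 * in1) / 2^bit); illustrative helper name. */
    static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
    }

    /* Example, matching stage 2 of fdct16x16 above:
     *   v[10] = half_btf_sketch(-cospi[32], u[10], cospi[32], u[13], bit);
     *   v[13] = half_btf_sketch( cospi[32], u[10], cospi[32], u[13], bit);  */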
    1414             : 
    1415           0 : static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
    1416           0 :   const int32_t *cospi = cospi_arr(bit);
    1417           0 :   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
    1418           0 :   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
    1419           0 :   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
    1420           0 :   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
    1421           0 :   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
    1422           0 :   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
    1423           0 :   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
    1424           0 :   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
    1425           0 :   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
    1426           0 :   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
    1427           0 :   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
    1428           0 :   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
    1429           0 :   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
    1430           0 :   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
    1431           0 :   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
    1432           0 :   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
    1433           0 :   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    1434           0 :   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    1435           0 :   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    1436           0 :   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    1437           0 :   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
    1438           0 :   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
    1439           0 :   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    1440           0 :   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    1441           0 :   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    1442           0 :   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    1443           0 :   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    1444             :   __m128i u[16], v[16], x, y;
    1445           0 :   const int col_num = 4;
    1446             :   int col;
    1447             : 
     1448             :   // Calculate columns 0, 1, 2, 3
    1449           0 :   for (col = 0; col < col_num; ++col) {
    1450             :     // stage 0
    1451             :     // stage 1
    1452             :     // stage 2
    1453           0 :     v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
    1454           0 :     x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
    1455           0 :     v[0] = _mm_add_epi32(v[0], x);
    1456           0 :     v[0] = _mm_add_epi32(v[0], rnding);
    1457           0 :     v[0] = _mm_srai_epi32(v[0], bit);
    1458             : 
    1459           0 :     v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
    1460           0 :     x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
    1461           0 :     v[1] = _mm_sub_epi32(v[1], x);
    1462           0 :     v[1] = _mm_add_epi32(v[1], rnding);
    1463           0 :     v[1] = _mm_srai_epi32(v[1], bit);
    1464             : 
    1465           0 :     v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
    1466           0 :     x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
    1467           0 :     v[2] = _mm_add_epi32(v[2], x);
    1468           0 :     v[2] = _mm_add_epi32(v[2], rnding);
    1469           0 :     v[2] = _mm_srai_epi32(v[2], bit);
    1470             : 
    1471           0 :     v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
    1472           0 :     x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
    1473           0 :     v[3] = _mm_sub_epi32(v[3], x);
    1474           0 :     v[3] = _mm_add_epi32(v[3], rnding);
    1475           0 :     v[3] = _mm_srai_epi32(v[3], bit);
    1476             : 
    1477           0 :     v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
    1478           0 :     x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
    1479           0 :     v[4] = _mm_add_epi32(v[4], x);
    1480           0 :     v[4] = _mm_add_epi32(v[4], rnding);
    1481           0 :     v[4] = _mm_srai_epi32(v[4], bit);
    1482             : 
    1483           0 :     v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
    1484           0 :     x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
    1485           0 :     v[5] = _mm_sub_epi32(v[5], x);
    1486           0 :     v[5] = _mm_add_epi32(v[5], rnding);
    1487           0 :     v[5] = _mm_srai_epi32(v[5], bit);
    1488             : 
    1489           0 :     v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
    1490           0 :     x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
    1491           0 :     v[6] = _mm_add_epi32(v[6], x);
    1492           0 :     v[6] = _mm_add_epi32(v[6], rnding);
    1493           0 :     v[6] = _mm_srai_epi32(v[6], bit);
    1494             : 
    1495           0 :     v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
    1496           0 :     x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
    1497           0 :     v[7] = _mm_sub_epi32(v[7], x);
    1498           0 :     v[7] = _mm_add_epi32(v[7], rnding);
    1499           0 :     v[7] = _mm_srai_epi32(v[7], bit);
    1500             : 
    1501           0 :     v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
    1502           0 :     x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
    1503           0 :     v[8] = _mm_add_epi32(v[8], x);
    1504           0 :     v[8] = _mm_add_epi32(v[8], rnding);
    1505           0 :     v[8] = _mm_srai_epi32(v[8], bit);
    1506             : 
    1507           0 :     v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
    1508           0 :     x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
    1509           0 :     v[9] = _mm_sub_epi32(v[9], x);
    1510           0 :     v[9] = _mm_add_epi32(v[9], rnding);
    1511           0 :     v[9] = _mm_srai_epi32(v[9], bit);
    1512             : 
    1513           0 :     v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
    1514           0 :     x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
    1515           0 :     v[10] = _mm_add_epi32(v[10], x);
    1516           0 :     v[10] = _mm_add_epi32(v[10], rnding);
    1517           0 :     v[10] = _mm_srai_epi32(v[10], bit);
    1518             : 
    1519           0 :     v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
    1520           0 :     x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
    1521           0 :     v[11] = _mm_sub_epi32(v[11], x);
    1522           0 :     v[11] = _mm_add_epi32(v[11], rnding);
    1523           0 :     v[11] = _mm_srai_epi32(v[11], bit);
    1524             : 
    1525           0 :     v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
    1526           0 :     x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
    1527           0 :     v[12] = _mm_add_epi32(v[12], x);
    1528           0 :     v[12] = _mm_add_epi32(v[12], rnding);
    1529           0 :     v[12] = _mm_srai_epi32(v[12], bit);
    1530             : 
    1531           0 :     v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
    1532           0 :     x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
    1533           0 :     v[13] = _mm_sub_epi32(v[13], x);
    1534           0 :     v[13] = _mm_add_epi32(v[13], rnding);
    1535           0 :     v[13] = _mm_srai_epi32(v[13], bit);
    1536             : 
    1537           0 :     v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
    1538           0 :     x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
    1539           0 :     v[14] = _mm_add_epi32(v[14], x);
    1540           0 :     v[14] = _mm_add_epi32(v[14], rnding);
    1541           0 :     v[14] = _mm_srai_epi32(v[14], bit);
    1542             : 
    1543           0 :     v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
    1544           0 :     x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
    1545           0 :     v[15] = _mm_sub_epi32(v[15], x);
    1546           0 :     v[15] = _mm_add_epi32(v[15], rnding);
    1547           0 :     v[15] = _mm_srai_epi32(v[15], bit);
    1548             : 
    1549             :     // stage 3
    1550           0 :     u[0] = _mm_add_epi32(v[0], v[8]);
    1551           0 :     u[8] = _mm_sub_epi32(v[0], v[8]);
    1552           0 :     u[1] = _mm_add_epi32(v[1], v[9]);
    1553           0 :     u[9] = _mm_sub_epi32(v[1], v[9]);
    1554           0 :     u[2] = _mm_add_epi32(v[2], v[10]);
    1555           0 :     u[10] = _mm_sub_epi32(v[2], v[10]);
    1556           0 :     u[3] = _mm_add_epi32(v[3], v[11]);
    1557           0 :     u[11] = _mm_sub_epi32(v[3], v[11]);
    1558           0 :     u[4] = _mm_add_epi32(v[4], v[12]);
    1559           0 :     u[12] = _mm_sub_epi32(v[4], v[12]);
    1560           0 :     u[5] = _mm_add_epi32(v[5], v[13]);
    1561           0 :     u[13] = _mm_sub_epi32(v[5], v[13]);
    1562           0 :     u[6] = _mm_add_epi32(v[6], v[14]);
    1563           0 :     u[14] = _mm_sub_epi32(v[6], v[14]);
    1564           0 :     u[7] = _mm_add_epi32(v[7], v[15]);
    1565           0 :     u[15] = _mm_sub_epi32(v[7], v[15]);
    1566             : 
    1567             :     // stage 4
    1568           0 :     v[0] = u[0];
    1569           0 :     v[1] = u[1];
    1570           0 :     v[2] = u[2];
    1571           0 :     v[3] = u[3];
    1572           0 :     v[4] = u[4];
    1573           0 :     v[5] = u[5];
    1574           0 :     v[6] = u[6];
    1575           0 :     v[7] = u[7];
    1576             : 
    1577           0 :     v[8] = _mm_mullo_epi32(u[8], cospi8);
    1578           0 :     x = _mm_mullo_epi32(u[9], cospi56);
    1579           0 :     v[8] = _mm_add_epi32(v[8], x);
    1580           0 :     v[8] = _mm_add_epi32(v[8], rnding);
    1581           0 :     v[8] = _mm_srai_epi32(v[8], bit);
    1582             : 
    1583           0 :     v[9] = _mm_mullo_epi32(u[8], cospi56);
    1584           0 :     x = _mm_mullo_epi32(u[9], cospi8);
    1585           0 :     v[9] = _mm_sub_epi32(v[9], x);
    1586           0 :     v[9] = _mm_add_epi32(v[9], rnding);
    1587           0 :     v[9] = _mm_srai_epi32(v[9], bit);
    1588             : 
    1589           0 :     v[10] = _mm_mullo_epi32(u[10], cospi40);
    1590           0 :     x = _mm_mullo_epi32(u[11], cospi24);
    1591           0 :     v[10] = _mm_add_epi32(v[10], x);
    1592           0 :     v[10] = _mm_add_epi32(v[10], rnding);
    1593           0 :     v[10] = _mm_srai_epi32(v[10], bit);
    1594             : 
    1595           0 :     v[11] = _mm_mullo_epi32(u[10], cospi24);
    1596           0 :     x = _mm_mullo_epi32(u[11], cospi40);
    1597           0 :     v[11] = _mm_sub_epi32(v[11], x);
    1598           0 :     v[11] = _mm_add_epi32(v[11], rnding);
    1599           0 :     v[11] = _mm_srai_epi32(v[11], bit);
    1600             : 
    1601           0 :     v[12] = _mm_mullo_epi32(u[12], cospim56);
    1602           0 :     x = _mm_mullo_epi32(u[13], cospi8);
    1603           0 :     v[12] = _mm_add_epi32(v[12], x);
    1604           0 :     v[12] = _mm_add_epi32(v[12], rnding);
    1605           0 :     v[12] = _mm_srai_epi32(v[12], bit);
    1606             : 
    1607           0 :     v[13] = _mm_mullo_epi32(u[12], cospi8);
    1608           0 :     x = _mm_mullo_epi32(u[13], cospim56);
    1609           0 :     v[13] = _mm_sub_epi32(v[13], x);
    1610           0 :     v[13] = _mm_add_epi32(v[13], rnding);
    1611           0 :     v[13] = _mm_srai_epi32(v[13], bit);
    1612             : 
    1613           0 :     v[14] = _mm_mullo_epi32(u[14], cospim24);
    1614           0 :     x = _mm_mullo_epi32(u[15], cospi40);
    1615           0 :     v[14] = _mm_add_epi32(v[14], x);
    1616           0 :     v[14] = _mm_add_epi32(v[14], rnding);
    1617           0 :     v[14] = _mm_srai_epi32(v[14], bit);
    1618             : 
    1619           0 :     v[15] = _mm_mullo_epi32(u[14], cospi40);
    1620           0 :     x = _mm_mullo_epi32(u[15], cospim24);
    1621           0 :     v[15] = _mm_sub_epi32(v[15], x);
    1622           0 :     v[15] = _mm_add_epi32(v[15], rnding);
    1623           0 :     v[15] = _mm_srai_epi32(v[15], bit);
    1624             : 
    1625             :     // stage 5
    1626           0 :     u[0] = _mm_add_epi32(v[0], v[4]);
    1627           0 :     u[4] = _mm_sub_epi32(v[0], v[4]);
    1628           0 :     u[1] = _mm_add_epi32(v[1], v[5]);
    1629           0 :     u[5] = _mm_sub_epi32(v[1], v[5]);
    1630           0 :     u[2] = _mm_add_epi32(v[2], v[6]);
    1631           0 :     u[6] = _mm_sub_epi32(v[2], v[6]);
    1632           0 :     u[3] = _mm_add_epi32(v[3], v[7]);
    1633           0 :     u[7] = _mm_sub_epi32(v[3], v[7]);
    1634           0 :     u[8] = _mm_add_epi32(v[8], v[12]);
    1635           0 :     u[12] = _mm_sub_epi32(v[8], v[12]);
    1636           0 :     u[9] = _mm_add_epi32(v[9], v[13]);
    1637           0 :     u[13] = _mm_sub_epi32(v[9], v[13]);
    1638           0 :     u[10] = _mm_add_epi32(v[10], v[14]);
    1639           0 :     u[14] = _mm_sub_epi32(v[10], v[14]);
    1640           0 :     u[11] = _mm_add_epi32(v[11], v[15]);
    1641           0 :     u[15] = _mm_sub_epi32(v[11], v[15]);
    1642             : 
    1643             :     // stage 6
    1644           0 :     v[0] = u[0];
    1645           0 :     v[1] = u[1];
    1646           0 :     v[2] = u[2];
    1647           0 :     v[3] = u[3];
    1648             : 
    1649           0 :     v[4] = _mm_mullo_epi32(u[4], cospi16);
    1650           0 :     x = _mm_mullo_epi32(u[5], cospi48);
    1651           0 :     v[4] = _mm_add_epi32(v[4], x);
    1652           0 :     v[4] = _mm_add_epi32(v[4], rnding);
    1653           0 :     v[4] = _mm_srai_epi32(v[4], bit);
    1654             : 
    1655           0 :     v[5] = _mm_mullo_epi32(u[4], cospi48);
    1656           0 :     x = _mm_mullo_epi32(u[5], cospi16);
    1657           0 :     v[5] = _mm_sub_epi32(v[5], x);
    1658           0 :     v[5] = _mm_add_epi32(v[5], rnding);
    1659           0 :     v[5] = _mm_srai_epi32(v[5], bit);
    1660             : 
    1661           0 :     v[6] = _mm_mullo_epi32(u[6], cospim48);
    1662           0 :     x = _mm_mullo_epi32(u[7], cospi16);
    1663           0 :     v[6] = _mm_add_epi32(v[6], x);
    1664           0 :     v[6] = _mm_add_epi32(v[6], rnding);
    1665           0 :     v[6] = _mm_srai_epi32(v[6], bit);
    1666             : 
    1667           0 :     v[7] = _mm_mullo_epi32(u[6], cospi16);
    1668           0 :     x = _mm_mullo_epi32(u[7], cospim48);
    1669           0 :     v[7] = _mm_sub_epi32(v[7], x);
    1670           0 :     v[7] = _mm_add_epi32(v[7], rnding);
    1671           0 :     v[7] = _mm_srai_epi32(v[7], bit);
    1672             : 
    1673           0 :     v[8] = u[8];
    1674           0 :     v[9] = u[9];
    1675           0 :     v[10] = u[10];
    1676           0 :     v[11] = u[11];
    1677             : 
    1678           0 :     v[12] = _mm_mullo_epi32(u[12], cospi16);
    1679           0 :     x = _mm_mullo_epi32(u[13], cospi48);
    1680           0 :     v[12] = _mm_add_epi32(v[12], x);
    1681           0 :     v[12] = _mm_add_epi32(v[12], rnding);
    1682           0 :     v[12] = _mm_srai_epi32(v[12], bit);
    1683             : 
    1684           0 :     v[13] = _mm_mullo_epi32(u[12], cospi48);
    1685           0 :     x = _mm_mullo_epi32(u[13], cospi16);
    1686           0 :     v[13] = _mm_sub_epi32(v[13], x);
    1687           0 :     v[13] = _mm_add_epi32(v[13], rnding);
    1688           0 :     v[13] = _mm_srai_epi32(v[13], bit);
    1689             : 
    1690           0 :     v[14] = _mm_mullo_epi32(u[14], cospim48);
    1691           0 :     x = _mm_mullo_epi32(u[15], cospi16);
    1692           0 :     v[14] = _mm_add_epi32(v[14], x);
    1693           0 :     v[14] = _mm_add_epi32(v[14], rnding);
    1694           0 :     v[14] = _mm_srai_epi32(v[14], bit);
    1695             : 
    1696           0 :     v[15] = _mm_mullo_epi32(u[14], cospi16);
    1697           0 :     x = _mm_mullo_epi32(u[15], cospim48);
    1698           0 :     v[15] = _mm_sub_epi32(v[15], x);
    1699           0 :     v[15] = _mm_add_epi32(v[15], rnding);
    1700           0 :     v[15] = _mm_srai_epi32(v[15], bit);
    1701             : 
    1702             :     // stage 7
    1703           0 :     u[0] = _mm_add_epi32(v[0], v[2]);
    1704           0 :     u[2] = _mm_sub_epi32(v[0], v[2]);
    1705           0 :     u[1] = _mm_add_epi32(v[1], v[3]);
    1706           0 :     u[3] = _mm_sub_epi32(v[1], v[3]);
    1707           0 :     u[4] = _mm_add_epi32(v[4], v[6]);
    1708           0 :     u[6] = _mm_sub_epi32(v[4], v[6]);
    1709           0 :     u[5] = _mm_add_epi32(v[5], v[7]);
    1710           0 :     u[7] = _mm_sub_epi32(v[5], v[7]);
    1711           0 :     u[8] = _mm_add_epi32(v[8], v[10]);
    1712           0 :     u[10] = _mm_sub_epi32(v[8], v[10]);
    1713           0 :     u[9] = _mm_add_epi32(v[9], v[11]);
    1714           0 :     u[11] = _mm_sub_epi32(v[9], v[11]);
    1715           0 :     u[12] = _mm_add_epi32(v[12], v[14]);
    1716           0 :     u[14] = _mm_sub_epi32(v[12], v[14]);
    1717           0 :     u[13] = _mm_add_epi32(v[13], v[15]);
    1718           0 :     u[15] = _mm_sub_epi32(v[13], v[15]);
    1719             : 
    1720             :     // stage 8
    1721           0 :     v[0] = u[0];
    1722           0 :     v[1] = u[1];
    1723             : 
    1724           0 :     y = _mm_mullo_epi32(u[2], cospi32);
    1725           0 :     x = _mm_mullo_epi32(u[3], cospi32);
    1726           0 :     v[2] = _mm_add_epi32(y, x);
    1727           0 :     v[2] = _mm_add_epi32(v[2], rnding);
    1728           0 :     v[2] = _mm_srai_epi32(v[2], bit);
    1729             : 
    1730           0 :     v[3] = _mm_sub_epi32(y, x);
    1731           0 :     v[3] = _mm_add_epi32(v[3], rnding);
    1732           0 :     v[3] = _mm_srai_epi32(v[3], bit);
    1733             : 
    1734           0 :     v[4] = u[4];
    1735           0 :     v[5] = u[5];
    1736             : 
    1737           0 :     y = _mm_mullo_epi32(u[6], cospi32);
    1738           0 :     x = _mm_mullo_epi32(u[7], cospi32);
    1739           0 :     v[6] = _mm_add_epi32(y, x);
    1740           0 :     v[6] = _mm_add_epi32(v[6], rnding);
    1741           0 :     v[6] = _mm_srai_epi32(v[6], bit);
    1742             : 
    1743           0 :     v[7] = _mm_sub_epi32(y, x);
    1744           0 :     v[7] = _mm_add_epi32(v[7], rnding);
    1745           0 :     v[7] = _mm_srai_epi32(v[7], bit);
    1746             : 
    1747           0 :     v[8] = u[8];
    1748           0 :     v[9] = u[9];
    1749             : 
    1750           0 :     y = _mm_mullo_epi32(u[10], cospi32);
    1751           0 :     x = _mm_mullo_epi32(u[11], cospi32);
    1752           0 :     v[10] = _mm_add_epi32(y, x);
    1753           0 :     v[10] = _mm_add_epi32(v[10], rnding);
    1754           0 :     v[10] = _mm_srai_epi32(v[10], bit);
    1755             : 
    1756           0 :     v[11] = _mm_sub_epi32(y, x);
    1757           0 :     v[11] = _mm_add_epi32(v[11], rnding);
    1758           0 :     v[11] = _mm_srai_epi32(v[11], bit);
    1759             : 
    1760           0 :     v[12] = u[12];
    1761           0 :     v[13] = u[13];
    1762             : 
    1763           0 :     y = _mm_mullo_epi32(u[14], cospi32);
    1764           0 :     x = _mm_mullo_epi32(u[15], cospi32);
    1765           0 :     v[14] = _mm_add_epi32(y, x);
    1766           0 :     v[14] = _mm_add_epi32(v[14], rnding);
    1767           0 :     v[14] = _mm_srai_epi32(v[14], bit);
    1768             : 
    1769           0 :     v[15] = _mm_sub_epi32(y, x);
    1770           0 :     v[15] = _mm_add_epi32(v[15], rnding);
    1771           0 :     v[15] = _mm_srai_epi32(v[15], bit);
    1772             : 
    1773             :     // stage 9
    1774           0 :     out[0 * col_num + col] = v[0];
    1775           0 :     out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
    1776           0 :     out[2 * col_num + col] = v[12];
    1777           0 :     out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
    1778           0 :     out[4 * col_num + col] = v[6];
    1779           0 :     out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
    1780           0 :     out[6 * col_num + col] = v[10];
    1781           0 :     out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
    1782           0 :     out[8 * col_num + col] = v[3];
    1783           0 :     out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
    1784           0 :     out[10 * col_num + col] = v[15];
    1785           0 :     out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
    1786           0 :     out[12 * col_num + col] = v[5];
    1787           0 :     out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
    1788           0 :     out[14 * col_num + col] = v[9];
    1789           0 :     out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
    1790             :   }
    1791           0 : }
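
Stage 9 above performs no arithmetic beyond negation (written as 0 - v): the sixteen stage-8 values are stored in a fixed permuted order with alternating signs. The same mapping in scalar form, read directly off the store pattern:

    #include <stdint.h>

    /* out[i] = (-1)^i is applied to v[perm[i]], i.e. fadst16 stage 9. */
    static void fadst16_stage9_sketch(const int32_t v[16], int32_t out[16]) {
      static const int perm[16] = { 0, 8, 12, 4, 6, 14, 10, 2,
                                    3, 11, 15, 7, 5, 13, 9, 1 };
      for (int i = 0; i < 16; ++i)
        out[i] = (i & 1) ? -v[perm[i]] : v[perm[i]];
    }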
    1792             : 
    1793           0 : static void col_txfm_16x16_rounding(__m128i *in, int shift) {
    1794             :   // Note:
    1795             :   //  We split 16x16 rounding into 4 sections of 8x8 rounding,
    1796             :   //  instead of 4 columns
    1797           0 :   col_txfm_8x8_rounding(&in[0], shift);
    1798           0 :   col_txfm_8x8_rounding(&in[16], shift);
    1799           0 :   col_txfm_8x8_rounding(&in[32], shift);
    1800           0 :   col_txfm_8x8_rounding(&in[48], shift);
    1801           0 : }
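
col_txfm_16x16_rounding simply fans out to the 8x8 helper over four groups of 16 registers (64 lanes each). The assumed per-lane behaviour, consistent with the rnding/srai pattern used throughout this file and with the negated shift passed in at the call sites, is a symmetric round-then-shift:

    #include <stdint.h>

    /* Assumed lane-wise effect of col_txfm_8x8_rounding (sketch only). */
    static void round_shift_sketch(int32_t *vals, int count, int shift) {
      const int32_t rounding = 1 << (shift - 1);
      for (int i = 0; i < count; ++i) vals[i] = (vals[i] + rounding) >> shift;
    }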
    1802             : 
    1803           0 : static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
    1804           0 :   const int size_8x8 = 16 * 4;
    1805           0 :   write_buffer_8x8(&in[0], output);
    1806           0 :   output += size_8x8;
    1807           0 :   write_buffer_8x8(&in[16], output);
    1808           0 :   output += size_8x8;
    1809           0 :   write_buffer_8x8(&in[32], output);
    1810           0 :   output += size_8x8;
    1811           0 :   write_buffer_8x8(&in[48], output);
    1812           0 : }
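
write_buffer_16x16 emits the 64 registers in four chunks of 16, each chunk presumably storing 64 contiguous 32-bit coefficients (hence size_8x8 = 16 * 4). Since the registers presumably hold the 16x16 block in row-major order at this point, the net effect is a straight copy of 256 coefficients; a scalar sketch under that assumption:

    #include <stdint.h>

    /* Assumed net effect of the four write_buffer_8x8 calls above. */
    static void write_buffer_16x16_sketch(const int32_t in[256],
                                          int32_t output[256]) {
      for (int i = 0; i < 256; ++i) output[i] = in[i];
    }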
    1813             : 
    1814           0 : void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
    1815             :                                  int stride, int tx_type, int bd) {
    1816             :   __m128i in[64], out[64];
    1817           0 :   const TXFM_1D_CFG *row_cfg = NULL;
    1818           0 :   const TXFM_1D_CFG *col_cfg = NULL;
    1819             : 
    1820           0 :   switch (tx_type) {
    1821             :     case DCT_DCT:
    1822           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
    1823           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
    1824           0 :       load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
    1825           0 :       fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1826           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1827           0 :       transpose_16x16(out, in);
    1828           0 :       fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1829           0 :       transpose_16x16(out, in);
    1830           0 :       write_buffer_16x16(in, coeff);
    1831           0 :       break;
    1832             :     case ADST_DCT:
    1833           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
    1834           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
    1835           0 :       load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
    1836           0 :       fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1837           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1838           0 :       transpose_16x16(out, in);
    1839           0 :       fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1840           0 :       transpose_16x16(out, in);
    1841           0 :       write_buffer_16x16(in, coeff);
    1842           0 :       break;
    1843             :     case DCT_ADST:
    1844           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
    1845           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
    1846           0 :       load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
    1847           0 :       fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1848           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1849           0 :       transpose_16x16(out, in);
    1850           0 :       fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1851           0 :       transpose_16x16(out, in);
    1852           0 :       write_buffer_16x16(in, coeff);
    1853           0 :       break;
    1854             :     case ADST_ADST:
    1855           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
    1856           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
    1857           0 :       load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
    1858           0 :       fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1859           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1860           0 :       transpose_16x16(out, in);
    1861           0 :       fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1862           0 :       transpose_16x16(out, in);
    1863           0 :       write_buffer_16x16(in, coeff);
    1864           0 :       break;
    1865             : #if CONFIG_EXT_TX
    1866             :     case FLIPADST_DCT:
    1867           0 :       row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
    1868           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
    1869           0 :       load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
    1870           0 :       fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1871           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1872           0 :       transpose_16x16(out, in);
    1873           0 :       fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1874           0 :       transpose_16x16(out, in);
    1875           0 :       write_buffer_16x16(in, coeff);
    1876           0 :       break;
    1877             :     case DCT_FLIPADST:
    1878           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
    1879           0 :       col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
    1880           0 :       load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
    1881           0 :       fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1882           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1883           0 :       transpose_16x16(out, in);
    1884           0 :       fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1885           0 :       transpose_16x16(out, in);
    1886           0 :       write_buffer_16x16(in, coeff);
    1887           0 :       break;
    1888             :     case FLIPADST_FLIPADST:
    1889           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
    1890           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
    1891           0 :       load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]);
    1892           0 :       fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1893           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1894           0 :       transpose_16x16(out, in);
    1895           0 :       fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1896           0 :       transpose_16x16(out, in);
    1897           0 :       write_buffer_16x16(in, coeff);
    1898           0 :       break;
    1899             :     case ADST_FLIPADST:
    1900           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
    1901           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
    1902           0 :       load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
    1903           0 :       fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1904           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1905           0 :       transpose_16x16(out, in);
    1906           0 :       fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1907           0 :       transpose_16x16(out, in);
    1908           0 :       write_buffer_16x16(in, coeff);
    1909           0 :       break;
    1910             :     case FLIPADST_ADST:
    1911           0 :       row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
    1912           0 :       col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
    1913           0 :       load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
    1914           0 :       fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
    1915           0 :       col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
    1916           0 :       transpose_16x16(out, in);
    1917           0 :       fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
    1918           0 :       transpose_16x16(out, in);
    1919           0 :       write_buffer_16x16(in, coeff);
    1920           0 :       break;
    1921             : #endif  // CONFIG_EXT_TX
    1922           0 :     default: assert(0);
    1923             :   }
    1924             :   (void)bd;
    1925           0 : }
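
A usage sketch for the 16x16 entry point. The prototype matches the definition above; the tx_type value 0 is assumed to be DCT_DCT from the av1 TX_TYPE enum, and the bit depth is currently unused by the function ((void)bd):

    #include <stdint.h>

    void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
                                     int stride, int tx_type, int bd);

    /* Illustrative driver: forward-transform one 16x16 residual block. */
    static void transform_block_16x16(const int16_t *residual, int stride,
                                      int32_t coeff[16 * 16]) {
      av1_fwd_txfm2d_16x16_sse4_1(residual, coeff, stride, /*tx_type=*/0,
                                  /*bd=*/10);
    }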

Generated by: LCOV version 1.13