LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - inv_txfm_ssse3.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 899 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 27 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include <tmmintrin.h>
      12             : 
      13             : #include "./aom_dsp_rtcd.h"
      14             : #include "aom_dsp/x86/inv_txfm_sse2.h"
      15             : #include "aom_dsp/x86/txfm_common_sse2.h"
      16             : 
      17           0 : void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
      18             :                               int stride) {
      19           0 :   const __m128i zero = _mm_setzero_si128();
      20           0 :   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
      21           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
      22           0 :   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
      23           0 :   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
      24           0 :   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
      25           0 :   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
      26           0 :   const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
      27           0 :   const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
      28           0 :   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
      29           0 :   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
      30             : 
      31             :   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
      32             :   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
      33             :   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
      34             :   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
      35             :   int i;
      36             : 
      37             :   // Load input data.
      38           0 :   in0 = load_input_data(input);
      39           0 :   in1 = load_input_data(input + 8 * 1);
      40           0 :   in2 = load_input_data(input + 8 * 2);
      41           0 :   in3 = load_input_data(input + 8 * 3);
      42           0 :   in4 = load_input_data(input + 8 * 4);
      43           0 :   in5 = load_input_data(input + 8 * 5);
      44           0 :   in6 = load_input_data(input + 8 * 6);
      45           0 :   in7 = load_input_data(input + 8 * 7);
      46             : 
      47             :   // 2-D
      48           0 :   for (i = 0; i < 2; i++) {
      49             :     // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
      50           0 :     TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
      51             :                   in4, in5, in6, in7);
      52             : 
      53             :     // 4-stage 1D idct8x8
      54             :     {
      55             :       /* Stage1 */
      56             :       {
      57           0 :         const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
      58           0 :         const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
      59           0 :         const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
      60           0 :         const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
      61             : 
      62             :         {
      63           0 :           tmp0 = _mm_madd_epi16(lo_17, stg1_0);
      64           0 :           tmp1 = _mm_madd_epi16(hi_17, stg1_0);
      65           0 :           tmp2 = _mm_madd_epi16(lo_17, stg1_1);
      66           0 :           tmp3 = _mm_madd_epi16(hi_17, stg1_1);
      67           0 :           tmp4 = _mm_madd_epi16(lo_35, stg1_2);
      68           0 :           tmp5 = _mm_madd_epi16(hi_35, stg1_2);
      69           0 :           tmp6 = _mm_madd_epi16(lo_35, stg1_3);
      70           0 :           tmp7 = _mm_madd_epi16(hi_35, stg1_3);
      71             : 
      72           0 :           tmp0 = _mm_add_epi32(tmp0, rounding);
      73           0 :           tmp1 = _mm_add_epi32(tmp1, rounding);
      74           0 :           tmp2 = _mm_add_epi32(tmp2, rounding);
      75           0 :           tmp3 = _mm_add_epi32(tmp3, rounding);
      76           0 :           tmp4 = _mm_add_epi32(tmp4, rounding);
      77           0 :           tmp5 = _mm_add_epi32(tmp5, rounding);
      78           0 :           tmp6 = _mm_add_epi32(tmp6, rounding);
      79           0 :           tmp7 = _mm_add_epi32(tmp7, rounding);
      80             : 
      81           0 :           tmp0 = _mm_srai_epi32(tmp0, 14);
      82           0 :           tmp1 = _mm_srai_epi32(tmp1, 14);
      83           0 :           tmp2 = _mm_srai_epi32(tmp2, 14);
      84           0 :           tmp3 = _mm_srai_epi32(tmp3, 14);
      85           0 :           tmp4 = _mm_srai_epi32(tmp4, 14);
      86           0 :           tmp5 = _mm_srai_epi32(tmp5, 14);
      87           0 :           tmp6 = _mm_srai_epi32(tmp6, 14);
      88           0 :           tmp7 = _mm_srai_epi32(tmp7, 14);
      89             : 
      90           0 :           stp1_4 = _mm_packs_epi32(tmp0, tmp1);
      91           0 :           stp1_7 = _mm_packs_epi32(tmp2, tmp3);
      92           0 :           stp1_5 = _mm_packs_epi32(tmp4, tmp5);
      93           0 :           stp1_6 = _mm_packs_epi32(tmp6, tmp7);
      94             :         }
      95             :       }
      96             : 
      97             :       /* Stage2 */
      98             :       {
      99           0 :         const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
     100           0 :         const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
     101             : 
     102             :         {
     103           0 :           tmp0 = _mm_unpacklo_epi16(in0, in4);
     104           0 :           tmp1 = _mm_unpackhi_epi16(in0, in4);
     105             : 
     106           0 :           tmp2 = _mm_madd_epi16(tmp0, stk2_0);
     107           0 :           tmp3 = _mm_madd_epi16(tmp1, stk2_0);
     108           0 :           tmp4 = _mm_madd_epi16(tmp0, stk2_1);
     109           0 :           tmp5 = _mm_madd_epi16(tmp1, stk2_1);
     110             : 
     111           0 :           tmp2 = _mm_add_epi32(tmp2, rounding);
     112           0 :           tmp3 = _mm_add_epi32(tmp3, rounding);
     113           0 :           tmp4 = _mm_add_epi32(tmp4, rounding);
     114           0 :           tmp5 = _mm_add_epi32(tmp5, rounding);
     115             : 
     116           0 :           tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
     117           0 :           tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
     118           0 :           tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
     119           0 :           tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
     120             : 
     121           0 :           stp2_0 = _mm_packs_epi32(tmp2, tmp3);
     122           0 :           stp2_1 = _mm_packs_epi32(tmp4, tmp5);
     123             : 
     124           0 :           tmp0 = _mm_madd_epi16(lo_26, stg2_2);
     125           0 :           tmp1 = _mm_madd_epi16(hi_26, stg2_2);
     126           0 :           tmp2 = _mm_madd_epi16(lo_26, stg2_3);
     127           0 :           tmp3 = _mm_madd_epi16(hi_26, stg2_3);
     128             : 
     129           0 :           tmp0 = _mm_add_epi32(tmp0, rounding);
     130           0 :           tmp1 = _mm_add_epi32(tmp1, rounding);
     131           0 :           tmp2 = _mm_add_epi32(tmp2, rounding);
     132           0 :           tmp3 = _mm_add_epi32(tmp3, rounding);
     133             : 
     134           0 :           tmp0 = _mm_srai_epi32(tmp0, 14);
     135           0 :           tmp1 = _mm_srai_epi32(tmp1, 14);
     136           0 :           tmp2 = _mm_srai_epi32(tmp2, 14);
     137           0 :           tmp3 = _mm_srai_epi32(tmp3, 14);
     138             : 
     139           0 :           stp2_2 = _mm_packs_epi32(tmp0, tmp1);
     140           0 :           stp2_3 = _mm_packs_epi32(tmp2, tmp3);
     141             :         }
     142             : 
     143           0 :         stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
     144           0 :         stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
     145           0 :         stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
     146           0 :         stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
     147             :       }
     148             : 
     149             :       /* Stage3 */
     150             :       {
     151           0 :         stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
     152           0 :         stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
     153           0 :         stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
     154           0 :         stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
     155             : 
     156           0 :         tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
     157           0 :         tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
     158             : 
     159           0 :         tmp2 = _mm_madd_epi16(tmp0, stk2_1);
     160           0 :         tmp3 = _mm_madd_epi16(tmp1, stk2_1);
     161           0 :         tmp4 = _mm_madd_epi16(tmp0, stk2_0);
     162           0 :         tmp5 = _mm_madd_epi16(tmp1, stk2_0);
     163             : 
     164           0 :         tmp2 = _mm_add_epi32(tmp2, rounding);
     165           0 :         tmp3 = _mm_add_epi32(tmp3, rounding);
     166           0 :         tmp4 = _mm_add_epi32(tmp4, rounding);
     167           0 :         tmp5 = _mm_add_epi32(tmp5, rounding);
     168             : 
     169           0 :         tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
     170           0 :         tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
     171           0 :         tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
     172           0 :         tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
     173             : 
     174           0 :         stp1_5 = _mm_packs_epi32(tmp2, tmp3);
     175           0 :         stp1_6 = _mm_packs_epi32(tmp4, tmp5);
     176             :       }
     177             : 
     178             :       /* Stage4  */
     179           0 :       in0 = _mm_add_epi16(stp1_0, stp2_7);
     180           0 :       in1 = _mm_add_epi16(stp1_1, stp1_6);
     181           0 :       in2 = _mm_add_epi16(stp1_2, stp1_5);
     182           0 :       in3 = _mm_add_epi16(stp1_3, stp2_4);
     183           0 :       in4 = _mm_sub_epi16(stp1_3, stp2_4);
     184           0 :       in5 = _mm_sub_epi16(stp1_2, stp1_5);
     185           0 :       in6 = _mm_sub_epi16(stp1_1, stp1_6);
     186           0 :       in7 = _mm_sub_epi16(stp1_0, stp2_7);
     187             :     }
     188             :   }
     189             : 
     190             :   // Final rounding and shift
     191           0 :   in0 = _mm_adds_epi16(in0, final_rounding);
     192           0 :   in1 = _mm_adds_epi16(in1, final_rounding);
     193           0 :   in2 = _mm_adds_epi16(in2, final_rounding);
     194           0 :   in3 = _mm_adds_epi16(in3, final_rounding);
     195           0 :   in4 = _mm_adds_epi16(in4, final_rounding);
     196           0 :   in5 = _mm_adds_epi16(in5, final_rounding);
     197           0 :   in6 = _mm_adds_epi16(in6, final_rounding);
     198           0 :   in7 = _mm_adds_epi16(in7, final_rounding);
     199             : 
     200           0 :   in0 = _mm_srai_epi16(in0, 5);
     201           0 :   in1 = _mm_srai_epi16(in1, 5);
     202           0 :   in2 = _mm_srai_epi16(in2, 5);
     203           0 :   in3 = _mm_srai_epi16(in3, 5);
     204           0 :   in4 = _mm_srai_epi16(in4, 5);
     205           0 :   in5 = _mm_srai_epi16(in5, 5);
     206           0 :   in6 = _mm_srai_epi16(in6, 5);
     207           0 :   in7 = _mm_srai_epi16(in7, 5);
     208             : 
     209           0 :   RECON_AND_STORE(dest + 0 * stride, in0);
     210           0 :   RECON_AND_STORE(dest + 1 * stride, in1);
     211           0 :   RECON_AND_STORE(dest + 2 * stride, in2);
     212           0 :   RECON_AND_STORE(dest + 3 * stride, in3);
     213           0 :   RECON_AND_STORE(dest + 4 * stride, in4);
     214           0 :   RECON_AND_STORE(dest + 5 * stride, in5);
     215           0 :   RECON_AND_STORE(dest + 6 * stride, in6);
     216           0 :   RECON_AND_STORE(dest + 7 * stride, in7);
     217           0 : }
     218             : 
/*
 * 8x8 inverse DCT for the sparse case: only the first 12 coefficients in
 * scan order may be non-zero, so only the top-left 4x4 quadrant of input
 * is read (rows 0..3, and only their low 4 columns survive the first
 * transpose). The row pass works on half registers (two 8-sample rows
 * packed per __m128i); the column pass is a full-width 1D idct8.
 * Result is added into dest via RECON_AND_STORE, as in the _64 variant.
 */
void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  // Doubled constants pair with _mm_mulhrs_epi16, which computes
  // (a * b + (1 << 14)) >> 15 — equivalent to the
  // (x * cospi + DCT_CONST_ROUNDING) >> DCT_CONST_BITS butterfly when the
  // constant is pre-multiplied by 2.
  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose: after this, in0/in1 each hold two transposed rows.
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);

  // Stage1
  tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
  tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
  tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
  tmp3 = _mm_mulhrs_epi16(in1, stg1_3);

  // Pack two half-width results into one register (high 64-bit halves).
  stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
  stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);

  // Stage2
  tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
  stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);

  tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
  tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
  stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);

  tmp0 = _mm_add_epi16(stp1_4, stp1_5);
  tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

  stp2_4 = tmp0;
  // Split the subtraction result into its two 64-bit halves.
  stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
  stp2_6 = _mm_unpackhi_epi64(tmp1, zero);

  // stg3 butterfly on (stp2_6, stp2_5); stg3_1 equals stk2_0, so reuse it.
  tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
  tmp1 = _mm_madd_epi16(tmp0, stg3_0);
  tmp2 = _mm_madd_epi16(tmp0, stk2_0);  // stg3_1 = stk2_0

  tmp1 = _mm_add_epi32(tmp1, rounding);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

  stp1_5 = _mm_packs_epi32(tmp1, tmp2);

  // Stage3
  tmp2 = _mm_add_epi16(stp2_0, stp2_2);
  tmp3 = _mm_sub_epi16(stp2_0, stp2_2);

  // Recombine halves into (stp1_2 | stp1_1) and (stp1_3 | stp1_0) order.
  stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
  stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);

  // Stage4
  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

  // Transpose row-pass output into column order for the second pass.
  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  /* Stage1 */
  stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
  stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
  stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
  stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);

  /* Stage2 */
  stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
  stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);  // stg2_1 == stg2_0

  stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
  stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);

  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);

  /* Stage3 */
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

  tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
  tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);

  tmp2 = _mm_madd_epi16(tmp0, stk2_0);
  tmp3 = _mm_madd_epi16(tmp1, stk2_0);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp3 = _mm_add_epi32(tmp3, rounding);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  stp1_6 = _mm_packs_epi32(tmp2, tmp3);

  tmp2 = _mm_madd_epi16(tmp0, stk2_1);
  tmp3 = _mm_madd_epi16(tmp1, stk2_1);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp3 = _mm_add_epi32(tmp3, rounding);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  stp1_5 = _mm_packs_epi32(tmp2, tmp3);

  /* Stage4  */
  in0 = _mm_add_epi16(stp1_0, stp2_7);
  in1 = _mm_add_epi16(stp1_1, stp1_6);
  in2 = _mm_add_epi16(stp1_2, stp1_5);
  in3 = _mm_add_epi16(stp1_3, stp2_4);
  in4 = _mm_sub_epi16(stp1_3, stp2_4);
  in5 = _mm_sub_epi16(stp1_2, stp1_5);
  in6 = _mm_sub_epi16(stp1_1, stp1_6);
  in7 = _mm_sub_epi16(stp1_0, stp2_7);

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
     380             : 
     381             : // Only do addition and subtraction butterfly, size = 16, 32
     382           0 : static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
     383             :                                      int size) {
     384           0 :   int i = 0;
     385           0 :   const int num = size >> 1;
     386           0 :   const int bound = size - 1;
     387           0 :   while (i < num) {
     388           0 :     out[i] = _mm_add_epi16(in[i], in[bound - i]);
     389           0 :     out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
     390           0 :     i++;
     391             :   }
     392           0 : }
     393             : 
// Core butterfly: multiply-accumulate (x0, x1) against the interleaved
// constant pairs (co0, co1), then round and descale by DCT_CONST_BITS.
// NOTE: this macro deliberately reads `rounding` and writes `tmp0`..`tmp3`
// from the caller's scope — every caller must declare those names.
#define BUTTERFLY_PAIR(x0, x1, co0, co1)         \
  do {                                           \
    tmp0 = _mm_madd_epi16(x0, co0);              \
    tmp1 = _mm_madd_epi16(x1, co0);              \
    tmp2 = _mm_madd_epi16(x0, co1);              \
    tmp3 = _mm_madd_epi16(x1, co1);              \
    tmp0 = _mm_add_epi32(tmp0, rounding);        \
    tmp1 = _mm_add_epi32(tmp1, rounding);        \
    tmp2 = _mm_add_epi32(tmp2, rounding);        \
    tmp3 = _mm_add_epi32(tmp3, rounding);        \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  } while (0)
     409             : 
// Rotation butterfly: *y0/*y1 get the rounded, descaled dot products of
// (*x0, *x1) with the constant pairs *c0/*c1. The tmp0..tmp3 and
// `rounding` locals are required by the BUTTERFLY_PAIR macro, which
// captures them by name.
static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
                             const __m128i *c0, const __m128i *c1, __m128i *y0,
                             __m128i *y1) {
  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // Interleave inputs so _mm_madd_epi16 pairs x0[i] with x1[i].
  u0 = _mm_unpacklo_epi16(*x0, *x1);
  u1 = _mm_unpackhi_epi16(*x0, *x1);
  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
  *y0 = _mm_packs_epi32(tmp0, tmp1);
  *y1 = _mm_packs_epi32(tmp2, tmp3);
}
     422             : 
// In-place variant of butterfly(): overwrites *x0/*x1 with the rotated
// results. Same BUTTERFLY_PAIR name-capture contract as above.
static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
                                  const __m128i *c1) {
  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

  u0 = _mm_unpacklo_epi16(*x0, *x1);
  u1 = _mm_unpackhi_epi16(*x0, *x1);
  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
  *x0 = _mm_packs_epi32(tmp0, tmp1);
  *x1 = _mm_packs_epi32(tmp2, tmp3);
}
     434             : 
// Even half of the sparse 32-point idct (the "_34" variant — presumably
// only the first 34 coefficients are non-zero; TODO confirm against the
// caller). Reads only in[0], in[2], in[4], in[6] and produces the first
// 16 intermediate outputs in stp1[0..15]. Doubled "stk" constants pair
// with _mm_mulhrs_epi16; "stg" pairs feed _mm_madd_epi16 butterflies.
static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
  const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
  const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
  const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
  const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);

  const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i x0, x1, x4, x5, x6, x7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;

  // phase 1

  // 0, 15
  u2 = _mm_mulhrs_epi16(in[2], stk2_1);  // stp2_15
  u3 = _mm_mulhrs_epi16(in[6], stk2_7);  // stp2_12
  v15 = _mm_add_epi16(u2, u3);
  // in[0], in[4]
  x0 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
  x7 = _mm_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
  v0 = _mm_add_epi16(x0, x7);            // stp2_0
  stp1[0] = _mm_add_epi16(v0, v15);
  stp1[15] = _mm_sub_epi16(v0, v15);

  // in[2], in[6]
  u0 = _mm_mulhrs_epi16(in[2], stk2_0);             // stp2_8
  u1 = _mm_mulhrs_epi16(in[6], stk2_6);             // stp2_11
  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13

  v8 = _mm_add_epi16(u0, u1);
  v9 = _mm_add_epi16(u4, u6);
  v10 = _mm_sub_epi16(u4, u6);
  v11 = _mm_sub_epi16(u0, u1);
  v12 = _mm_sub_epi16(u2, u3);
  v13 = _mm_sub_epi16(u5, u7);
  v14 = _mm_add_epi16(u5, u7);

  // Stage-6 rotation of the inner pairs (in place).
  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);

  // 1, 14
  x1 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
  // stp1[2] = stp1[0], stp1[3] = stp1[1]
  x4 = _mm_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
  v1 = _mm_add_epi16(x1, x6);  // stp2_1
  v2 = _mm_add_epi16(x0, x5);  // stp2_2
  stp1[1] = _mm_add_epi16(v1, v14);
  stp1[14] = _mm_sub_epi16(v1, v14);

  stp1[2] = _mm_add_epi16(v2, v13);
  stp1[13] = _mm_sub_epi16(v2, v13);

  v3 = _mm_add_epi16(x1, x4);  // stp2_3
  v4 = _mm_sub_epi16(x1, x4);  // stp2_4

  v5 = _mm_sub_epi16(x0, x5);  // stp2_5

  v6 = _mm_sub_epi16(x1, x6);  // stp2_6
  v7 = _mm_sub_epi16(x0, x7);  // stp2_7
  // Final add/sub butterfly: combine even-half v0..v7 with odd-half v8..v15.
  stp1[3] = _mm_add_epi16(v3, v12);
  stp1[12] = _mm_sub_epi16(v3, v12);

  stp1[6] = _mm_add_epi16(v6, v9);
  stp1[9] = _mm_sub_epi16(v6, v9);

  stp1[7] = _mm_add_epi16(v7, v8);
  stp1[8] = _mm_sub_epi16(v7, v8);

  stp1[4] = _mm_add_epi16(v4, v11);
  stp1[11] = _mm_sub_epi16(v4, v11);

  stp1[5] = _mm_add_epi16(v5, v10);
  stp1[10] = _mm_sub_epi16(v5, v10);
}
     521             : 
// Odd-index half (outputs 16-31) of the 32-point idct for the "34 non-zero
// coefficients" case: only the top-left 8x8 of the input is non-zero, so
// just in[1], in[3], in[5], in[7] contribute.  Writes stp1[16]..stp1[31].
static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
  // stk1_* are doubled cosine constants: _mm_mulhrs_epi16 with a doubled
  // constant performs the multiply plus the rounding shift in one step.
  const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
  const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
  const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
  const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
  const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
  const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
  const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
  const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
  // stg3_*/stg4_*/stg6_* are (cos, sin) pairs for the butterfly rotations.
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  // Stage 1/2 collapsed: scale the four non-zero odd inputs directly.
  v16 = _mm_mulhrs_epi16(in[1], stk1_0);
  v31 = _mm_mulhrs_epi16(in[1], stk1_1);

  v19 = _mm_mulhrs_epi16(in[7], stk1_6);
  v28 = _mm_mulhrs_epi16(in[7], stk1_7);

  v20 = _mm_mulhrs_epi16(in[5], stk1_8);
  v27 = _mm_mulhrs_epi16(in[5], stk1_9);

  v23 = _mm_mulhrs_epi16(in[3], stk1_14);
  v24 = _mm_mulhrs_epi16(in[3], stk1_15);

  // Stage 3 rotations.
  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);

  // Stage 4 add/sub pass.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);
  u24 = _mm_add_epi16(v24, v27);
  u27 = _mm_sub_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u28 = _mm_sub_epi16(v31, v28);
  u31 = _mm_add_epi16(v28, v31);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);

  // Stage 4 in-place rotations on the middle terms.
  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);

  // Stage 5 add/sub pass into the output slots.
  stp1[16] = _mm_add_epi16(u16, u23);
  stp1[23] = _mm_sub_epi16(u16, u23);

  stp1[17] = _mm_add_epi16(u17, u22);
  stp1[22] = _mm_sub_epi16(u17, u22);

  stp1[18] = _mm_add_epi16(u18, u21);
  stp1[21] = _mm_sub_epi16(u18, u21);

  stp1[19] = _mm_add_epi16(u19, u20);
  stp1[20] = _mm_sub_epi16(u19, u20);

  stp1[24] = _mm_sub_epi16(u31, u24);
  stp1[31] = _mm_add_epi16(u24, u31);

  stp1[25] = _mm_sub_epi16(u30, u25);
  stp1[30] = _mm_add_epi16(u25, u30);

  stp1[26] = _mm_sub_epi16(u29, u26);
  stp1[29] = _mm_add_epi16(u26, u29);

  stp1[27] = _mm_sub_epi16(u28, u27);
  stp1[28] = _mm_add_epi16(u27, u28);

  // Stage 6: cospi_16_64 rotations on stp1[20..27].
  butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
  butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
  butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
  butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
}
     617             : 
     618             : // Only upper-left 8x8 has non-zero coeff
     619           0 : void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
     620             :                                 int stride) {
     621           0 :   const __m128i zero = _mm_setzero_si128();
     622           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
     623             :   __m128i in[32], col[32];
     624             :   __m128i stp1[32];
     625             :   int i;
     626             : 
     627             :   // Load input data. Only need to load the top left 8x8 block.
     628           0 :   in[0] = load_input_data(input);
     629           0 :   in[1] = load_input_data(input + 32);
     630           0 :   in[2] = load_input_data(input + 64);
     631           0 :   in[3] = load_input_data(input + 96);
     632           0 :   in[4] = load_input_data(input + 128);
     633           0 :   in[5] = load_input_data(input + 160);
     634           0 :   in[6] = load_input_data(input + 192);
     635           0 :   in[7] = load_input_data(input + 224);
     636             : 
     637           0 :   array_transpose_8x8(in, in);
     638           0 :   idct32_34_first_half(in, stp1);
     639           0 :   idct32_34_second_half(in, stp1);
     640             : 
     641             :   // 1_D: Store 32 intermediate results for each 8x32 block.
     642           0 :   add_sub_butterfly(stp1, col, 32);
     643           0 :   for (i = 0; i < 4; i++) {
     644             :     int j;
     645             :     // Transpose 32x8 block to 8x32 block
     646           0 :     array_transpose_8x8(col + i * 8, in);
     647           0 :     idct32_34_first_half(in, stp1);
     648           0 :     idct32_34_second_half(in, stp1);
     649             : 
     650             :     // 2_D: Calculate the results and store them to destination.
     651           0 :     add_sub_butterfly(stp1, in, 32);
     652           0 :     for (j = 0; j < 32; ++j) {
     653             :       // Final rounding and shift
     654           0 :       in[j] = _mm_adds_epi16(in[j], final_rounding);
     655           0 :       in[j] = _mm_srai_epi16(in[j], 6);
     656           0 :       RECON_AND_STORE(dest + j * stride, in[j]);
     657             :     }
     658             : 
     659           0 :     dest += 8;
     660             :   }
     661           0 : }
     662             : 
     663             : // in0[16] represents the left 8x16 block
     664             : // in1[16] represents the right 8x16 block
     665           0 : static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
     666             :                               __m128i *in1) {
     667             :   int i;
     668           0 :   for (i = 0; i < 16; i++) {
     669           0 :     in0[i] = load_input_data(input);
     670           0 :     in1[i] = load_input_data(input + 8);
     671           0 :     input += 32;
     672             :   }
     673           0 : }
     674             : 
     675           0 : static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
     676             :                                     __m128i *out1) {
     677           0 :   array_transpose_8x8(in0, out0);
     678           0 :   array_transpose_8x8(&in0[8], out1);
     679           0 :   array_transpose_8x8(in1, &out0[8]);
     680           0 :   array_transpose_8x8(&in1[8], &out1[8]);
     681           0 : }
     682             : 
     683             : // Group the coefficient calculation into smaller functions
     684             : // to prevent stack spillover:
     685             : // quarter_1: 0-7
     686             : // quarter_2: 8-15
     687             : // quarter_3_4: 16-23, 24-31
// Outputs 0-7 of the 8x32 idct (135 non-zero coefficient case).
// Only the even inputs in[0], in[4], in[8], in[12] contribute.
static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
                                      __m128i *out /*out[8]*/) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  {
    // stk4_* are doubled cosines so _mm_mulhrs_epi16 does the multiply
    // plus rounding shift in one instruction.
    const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
    const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
    const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
    u0 = _mm_mulhrs_epi16(in[0], stk4_0);
    u2 = _mm_mulhrs_epi16(in[8], stk4_2);
    u3 = _mm_mulhrs_epi16(in[8], stk4_3);
    u1 = u0;  // in[0] feeds both branches with the same cospi_16_64 scale
  }

  v0 = _mm_add_epi16(u0, u3);
  v1 = _mm_add_epi16(u1, u2);
  v2 = _mm_sub_epi16(u1, u2);
  v3 = _mm_sub_epi16(u0, u3);

  {
    const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
    const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
    const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
    const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
    u4 = _mm_mulhrs_epi16(in[4], stk3_0);
    u7 = _mm_mulhrs_epi16(in[4], stk3_1);
    u5 = _mm_mulhrs_epi16(in[12], stk3_2);
    u6 = _mm_mulhrs_epi16(in[12], stk3_3);
  }

  v4 = _mm_add_epi16(u4, u5);
  v5 = _mm_sub_epi16(u4, u5);
  v6 = _mm_sub_epi16(u7, u6);
  v7 = _mm_add_epi16(u7, u6);

  {
    // Equalize the middle pair with a cospi_16_64 rotation.
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
  }

  // Final add/sub pass for this quarter.
  out[0] = _mm_add_epi16(v0, v7);
  out[1] = _mm_add_epi16(v1, v6);
  out[2] = _mm_add_epi16(v2, v5);
  out[3] = _mm_add_epi16(v3, v4);
  out[4] = _mm_sub_epi16(v3, v4);
  out[5] = _mm_sub_epi16(v2, v5);
  out[6] = _mm_sub_epi16(v1, v6);
  out[7] = _mm_sub_epi16(v0, v7);
}
     739             : 
// Outputs 8-15 of the 8x32 idct (135 non-zero coefficient case).
// Only the even inputs in[2], in[6], in[10], in[14] contribute.
static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
                                      __m128i *out /*out[8]*/) {
  __m128i u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v8, v9, v10, v11, v12, v13, v14, v15;

  {
    // Doubled cosines for the one-step _mm_mulhrs_epi16 scaling.
    const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
    const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
    const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
    const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
    const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
    const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
    const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
    const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
    u8 = _mm_mulhrs_epi16(in[2], stk2_0);
    u15 = _mm_mulhrs_epi16(in[2], stk2_1);
    u9 = _mm_mulhrs_epi16(in[14], stk2_2);
    u14 = _mm_mulhrs_epi16(in[14], stk2_3);
    u10 = _mm_mulhrs_epi16(in[10], stk2_4);
    u13 = _mm_mulhrs_epi16(in[10], stk2_5);
    u11 = _mm_mulhrs_epi16(in[6], stk2_6);
    u12 = _mm_mulhrs_epi16(in[6], stk2_7);
  }

  // Add/sub pass.
  v8 = _mm_add_epi16(u8, u9);
  v9 = _mm_sub_epi16(u8, u9);
  v10 = _mm_sub_epi16(u11, u10);
  v11 = _mm_add_epi16(u11, u10);
  v12 = _mm_add_epi16(u12, u13);
  v13 = _mm_sub_epi16(u12, u13);
  v14 = _mm_sub_epi16(u15, u14);
  v15 = _mm_add_epi16(u15, u14);

  {
    // In-place rotations on the middle terms.
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  out[0] = _mm_add_epi16(v8, v11);
  out[1] = _mm_add_epi16(v9, v10);
  out[2] = _mm_sub_epi16(v9, v10);
  out[3] = _mm_sub_epi16(v8, v11);
  out[4] = _mm_sub_epi16(v15, v12);
  out[5] = _mm_sub_epi16(v14, v13);
  out[6] = _mm_add_epi16(v14, v13);
  out[7] = _mm_add_epi16(v15, v12);

  {
    // Final cospi_16_64 rotations on out[2..5].
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}
     797             : 
     798             : // 8x32 block even indexed 8 inputs of in[16],
     799             : // output first half 16 to out[32]
     800           0 : static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
     801             :                                     __m128i *out /*out[32]*/) {
     802             :   __m128i temp[16];
     803           0 :   idct32_8x32_135_quarter_1(in, temp);
     804           0 :   idct32_8x32_135_quarter_2(in, &temp[8]);
     805           0 :   add_sub_butterfly(temp, out, 16);
     806           0 : }
     807             : 
     808             : // 8x32 block odd indexed 8 inputs of in[16],
     809             : // output second half 16 to out[32]
// Odd-indexed inputs of in[16] produce the second 16 outputs of the 8x32
// block (written to out[0..15], which the caller maps to pixels 16-31).
static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
                                    __m128i *out /*out[32]*/) {
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  {
    // Stage 1: scale all eight odd inputs with doubled cosines via
    // _mm_mulhrs_epi16 (multiply + rounding shift in one instruction).
    const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
    const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
    const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
    const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);

    const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
    const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
    const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
    const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
    const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
    const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
    const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
    const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);

    const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
    const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
    const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
    const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
    u16 = _mm_mulhrs_epi16(in[1], stk1_0);
    u31 = _mm_mulhrs_epi16(in[1], stk1_1);
    u17 = _mm_mulhrs_epi16(in[15], stk1_2);
    u30 = _mm_mulhrs_epi16(in[15], stk1_3);

    u18 = _mm_mulhrs_epi16(in[9], stk1_4);
    u29 = _mm_mulhrs_epi16(in[9], stk1_5);
    u19 = _mm_mulhrs_epi16(in[7], stk1_6);
    u28 = _mm_mulhrs_epi16(in[7], stk1_7);

    u20 = _mm_mulhrs_epi16(in[5], stk1_8);
    u27 = _mm_mulhrs_epi16(in[5], stk1_9);
    u21 = _mm_mulhrs_epi16(in[11], stk1_10);
    u26 = _mm_mulhrs_epi16(in[11], stk1_11);

    u22 = _mm_mulhrs_epi16(in[13], stk1_12);
    u25 = _mm_mulhrs_epi16(in[13], stk1_13);
    u23 = _mm_mulhrs_epi16(in[3], stk1_14);
    u24 = _mm_mulhrs_epi16(in[3], stk1_15);
  }

  // Stage 2: add/sub pass.
  v16 = _mm_add_epi16(u16, u17);
  v17 = _mm_sub_epi16(u16, u17);
  v18 = _mm_sub_epi16(u19, u18);
  v19 = _mm_add_epi16(u19, u18);

  v20 = _mm_add_epi16(u20, u21);
  v21 = _mm_sub_epi16(u20, u21);
  v22 = _mm_sub_epi16(u23, u22);
  v23 = _mm_add_epi16(u23, u22);

  v24 = _mm_add_epi16(u24, u25);
  v25 = _mm_sub_epi16(u24, u25);
  v26 = _mm_sub_epi16(u27, u26);
  v27 = _mm_add_epi16(u27, u26);

  v28 = _mm_add_epi16(u28, u29);
  v29 = _mm_sub_epi16(u28, u29);
  v30 = _mm_sub_epi16(u31, u30);
  v31 = _mm_add_epi16(u31, u30);

  {
    // Stage 3: in-place rotations on the middle terms.
    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  // Stage 4: add/sub pass.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);

  u24 = _mm_add_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u27 = _mm_sub_epi16(v24, v27);
  u28 = _mm_sub_epi16(v31, v28);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);
  u31 = _mm_add_epi16(v28, v31);

  {
    // Stage 4 rotations on the middle terms.
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  // Stage 5: sums to the outer outputs, differences to temporaries that
  // still need the final cospi_16_64 rotation below.
  out[0] = _mm_add_epi16(u16, u23);
  out[1] = _mm_add_epi16(u17, u22);
  out[2] = _mm_add_epi16(u18, u21);
  out[3] = _mm_add_epi16(u19, u20);
  v20 = _mm_sub_epi16(u19, u20);
  v21 = _mm_sub_epi16(u18, u21);
  v22 = _mm_sub_epi16(u17, u22);
  v23 = _mm_sub_epi16(u16, u23);

  v24 = _mm_sub_epi16(u31, u24);
  v25 = _mm_sub_epi16(u30, u25);
  v26 = _mm_sub_epi16(u29, u26);
  v27 = _mm_sub_epi16(u28, u27);
  out[12] = _mm_add_epi16(u27, u28);
  out[13] = _mm_add_epi16(u26, u29);
  out[14] = _mm_add_epi16(u25, u30);
  out[15] = _mm_add_epi16(u24, u31);

  {
    // Stage 6: rotate the middle eight terms into out[4..11].
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
  }
}
     946             : 
     947             : // 8x16 block, input __m128i in[16], output __m128i in[32]
     948           0 : static void idct32_8x32_135(__m128i *in /*in[32]*/) {
     949             :   __m128i out[32];
     950           0 :   idct32_8x32_quarter_1_2(in, out);
     951           0 :   idct32_8x32_quarter_3_4(in, &out[16]);
     952           0 :   add_sub_butterfly(out, in, 32);
     953           0 : }
     954             : 
     955           0 : static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
     956           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
     957           0 :   const __m128i zero = _mm_setzero_si128();
     958           0 :   int j = 0;
     959           0 :   while (j < 32) {
     960           0 :     in[j] = _mm_adds_epi16(in[j], final_rounding);
     961           0 :     in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
     962             : 
     963           0 :     in[j] = _mm_srai_epi16(in[j], 6);
     964           0 :     in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
     965             : 
     966           0 :     RECON_AND_STORE(dst, in[j]);
     967           0 :     dst += stride;
     968           0 :     RECON_AND_STORE(dst, in[j + 1]);
     969           0 :     dst += stride;
     970           0 :     j += 2;
     971             :   }
     972           0 : }
     973             : 
     974           0 : static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
     975             :                                    int stride) {
     976           0 :   store_buffer_8x32(in0, dest, stride);
     977           0 :   store_buffer_8x32(in1, dest + 8, stride);
     978           0 : }
     979             : 
     980           0 : static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
     981           0 :   idct32_8x32_135(col0);
     982           0 :   idct32_8x32_135(col1);
     983           0 : }
     984             : 
// Selects which 16-column half of the 32x32 block is being processed.
typedef enum { left_16, right_16 } ColsIndicator;
     986             : 
     987           0 : static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
     988             :                                      ColsIndicator cols) {
     989           0 :   switch (cols) {
     990             :     case left_16: {
     991             :       int i;
     992           0 :       array_transpose_16x16(in0, in1);
     993           0 :       for (i = 0; i < 16; ++i) {
     994           0 :         store[i] = in0[16 + i];
     995           0 :         store[16 + i] = in1[16 + i];
     996             :       }
     997           0 :       break;
     998             :     }
     999             :     case right_16: {
    1000           0 :       array_transpose_16x16_2(store, &store[16], in0, in1);
    1001           0 :       break;
    1002             :     }
    1003           0 :     default: { assert(0); }
    1004             :   }
    1005           0 : }
    1006             : 
    1007             : // Only upper-left 16x16 has non-zero coeff
    1008           0 : void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
    1009             :                                  int stride) {
    1010             :   // Each array represents an 8x32 block
    1011             :   __m128i col0[32], col1[32];
    1012             :   // This array represents a 16x16 block
    1013             :   __m128i temp[32];
    1014             : 
    1015             :   // Load input data. Only need to load the top left 16x16 block.
    1016           0 :   load_buffer_16x16(input, col0, col1);
    1017             : 
    1018             :   // columns
    1019           0 :   array_transpose_16x16(col0, col1);
    1020           0 :   idct32_135(col0, col1);
    1021             : 
    1022             :   // rows
    1023           0 :   transpose_and_copy_16x16(col0, col1, temp, left_16);
    1024           0 :   idct32_135(col0, col1);
    1025           0 :   recon_and_store(col0, col1, dest, stride);
    1026             : 
    1027           0 :   transpose_and_copy_16x16(col0, col1, temp, right_16);
    1028           0 :   idct32_135(col0, col1);
    1029           0 :   recon_and_store(col0, col1, dest + 16, stride);
    1030           0 : }
    1031             : 
    1032             : // For each 8x32 block __m128i in[32],
    1033             : // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
    1034             : // output pixels: 8-15 in __m128i in[32]
static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
                                       __m128i *out /*out[16]*/) {
  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
  // Full-input variant: all 32 rows may be non-zero, so stage 2 uses
  // paired butterflies instead of the single-input _mm_mulhrs_epi16 path.

  {
    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
  }

  // Stage 3 add/sub on the outer terms.
  v8 = _mm_add_epi16(u8, u9);
  v9 = _mm_sub_epi16(u8, u9);
  v14 = _mm_sub_epi16(u15, u14);
  v15 = _mm_add_epi16(u15, u14);

  {
    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
  }

  // Stage 3 add/sub on the inner terms.
  v10 = _mm_sub_epi16(u11, u10);
  v11 = _mm_add_epi16(u11, u10);
  v12 = _mm_add_epi16(u12, u13);
  v13 = _mm_sub_epi16(u12, u13);

  {
    // Stage 4 in-place rotations on the middle terms.
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  // Stage 5 add/sub pass.
  out[0] = _mm_add_epi16(v8, v11);
  out[1] = _mm_add_epi16(v9, v10);
  out[6] = _mm_add_epi16(v14, v13);
  out[7] = _mm_add_epi16(v15, v12);

  out[2] = _mm_sub_epi16(v9, v10);
  out[3] = _mm_sub_epi16(v8, v11);
  out[4] = _mm_sub_epi16(v15, v12);
  out[5] = _mm_sub_epi16(v14, v13);

  {
    // Stage 6: cospi_16_64 rotations on out[2..5].
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}
    1093             : 
// For each 8x32 block __m128i in[32],
// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
// output pixels: 0-7 in __m128i in[32]
static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
                                       __m128i *out /*out[8]*/) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_

  // Stage 3: rotate input pairs (4, 28) and (20, 12) by the
  // cospi_4/cospi_28 and cospi_20/cospi_12 angle pairs.
  {
    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
  }

  // Stage 4 (odd half): plain add/sub butterflies on the stage-3 results.
  v4 = _mm_add_epi16(u4, u5);
  v5 = _mm_sub_epi16(u4, u5);
  v6 = _mm_sub_epi16(u7, u6);
  v7 = _mm_add_epi16(u7, u6);

  // Stage 4 (even half) and stage 5: rotate (v6, v5) by cospi_16 (note the
  // in-place overwrite of v5/v6), then rotate input pairs (0, 16) and (8, 24).
  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);

    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
  }

  // Stage 5: combine the even-half rotations.
  v0 = _mm_add_epi16(u0, u3);
  v1 = _mm_add_epi16(u1, u2);
  v2 = _mm_sub_epi16(u1, u2);
  v3 = _mm_sub_epi16(u0, u3);

  // Stage 6: final add/sub butterfly producing output pixels 0-7.
  out[0] = _mm_add_epi16(v0, v7);
  out[1] = _mm_add_epi16(v1, v6);
  out[2] = _mm_add_epi16(v2, v5);
  out[3] = _mm_add_epi16(v3, v4);
  out[4] = _mm_sub_epi16(v3, v4);
  out[5] = _mm_sub_epi16(v2, v5);
  out[6] = _mm_sub_epi16(v1, v6);
  out[7] = _mm_sub_epi16(v0, v7);
}
    1141             : 
// For each 8x32 block __m128i in[32],
// Input with odd index,
// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
// output pixels: 16-23, 24-31 in __m128i in[32]
// To avoid hiding an offset of 16 inside this function, the 16 results
// (pixels 16-31) are written to positions 0-15 of out[16].
static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
                                         __m128i *out /*out[16]*/) {
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  // Stage 1: rotate the eight odd-index input pairs by the odd cospi angles
  // (1/31, 17/15, 9/23, 25/7, 5/27, 21/11, 13/19, 29/3).
  {
    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);

    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);

    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
  }

  // Stage 2: add/sub butterflies pairing adjacent stage-1 outputs.
  v16 = _mm_add_epi16(u16, u17);
  v17 = _mm_sub_epi16(u16, u17);
  v18 = _mm_sub_epi16(u19, u18);
  v19 = _mm_add_epi16(u19, u18);

  v20 = _mm_add_epi16(u20, u21);
  v21 = _mm_sub_epi16(u20, u21);
  v22 = _mm_sub_epi16(u23, u22);
  v23 = _mm_add_epi16(u23, u22);

  v24 = _mm_add_epi16(u24, u25);
  v25 = _mm_sub_epi16(u24, u25);
  v26 = _mm_sub_epi16(u27, u26);
  v27 = _mm_add_epi16(u27, u26);

  v28 = _mm_add_epi16(u28, u29);
  v29 = _mm_sub_epi16(u28, u29);
  v30 = _mm_sub_epi16(u31, u30);
  v31 = _mm_add_epi16(u31, u30);

  // Stage 3: rotate the interior pairs in place by the cospi_4/cospi_28 and
  // cospi_20/cospi_12 angle pairs (positive and negated variants).
  {
    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  // Stage 4: add/sub butterflies across groups of four.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);

  u24 = _mm_add_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u27 = _mm_sub_epi16(v24, v27);

  u28 = _mm_sub_epi16(v31, v28);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);
  u31 = _mm_add_epi16(v28, v31);

  // Stage 5: rotate the interior quads in place by cospi_8/cospi_24.
  {
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  // Stage 6: add/sub butterflies across groups of eight; out[0..7] hold
  // pixels 16-23, out[8..15] hold pixels 24-31 (pre final rotation).
  out[0] = _mm_add_epi16(u16, u23);
  out[1] = _mm_add_epi16(u17, u22);
  out[2] = _mm_add_epi16(u18, u21);
  out[3] = _mm_add_epi16(u19, u20);
  out[4] = _mm_sub_epi16(u19, u20);
  out[5] = _mm_sub_epi16(u18, u21);
  out[6] = _mm_sub_epi16(u17, u22);
  out[7] = _mm_sub_epi16(u16, u23);

  out[8] = _mm_sub_epi16(u31, u24);
  out[9] = _mm_sub_epi16(u30, u25);
  out[10] = _mm_sub_epi16(u29, u26);
  out[11] = _mm_sub_epi16(u28, u27);
  out[12] = _mm_add_epi16(u27, u28);
  out[13] = _mm_add_epi16(u26, u29);
  out[14] = _mm_add_epi16(u25, u30);
  out[15] = _mm_add_epi16(u24, u31);

  // Stage 7: final in-place cospi_16 rotation of the middle eight outputs.
  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
  }
}
    1273             : 
    1274           0 : static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
    1275             :                                          __m128i *out /*out[32]*/) {
    1276             :   __m128i temp[16];
    1277           0 :   idct32_full_8x32_quarter_1(in, temp);
    1278           0 :   idct32_full_8x32_quarter_2(in, &temp[8]);
    1279           0 :   add_sub_butterfly(temp, out, 16);
    1280           0 : }
    1281             : 
    1282           0 : static void idct32_full_8x32(const __m128i *in /*in[32]*/,
    1283             :                              __m128i *out /*out[32]*/) {
    1284             :   __m128i temp[32];
    1285           0 :   idct32_full_8x32_quarter_1_2(in, temp);
    1286           0 :   idct32_full_8x32_quarter_3_4(in, &temp[16]);
    1287           0 :   add_sub_butterfly(temp, out, 32);
    1288           0 : }
    1289             : 
    1290           0 : static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
    1291             :   int i;
    1292           0 :   for (i = 0; i < 8; ++i) {
    1293           0 :     in[i] = load_input_data(input);
    1294           0 :     in[i + 8] = load_input_data(input + 8);
    1295           0 :     in[i + 16] = load_input_data(input + 16);
    1296           0 :     in[i + 24] = load_input_data(input + 24);
    1297           0 :     input += 32;
    1298             :   }
    1299           0 : }
    1300             : 
    1301           0 : void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
    1302             :                                   int stride) {
    1303             :   __m128i col[128], in[32];
    1304             :   int i, j;
    1305             : 
    1306             :   // rows
    1307           0 :   for (i = 0; i < 4; ++i) {
    1308           0 :     load_buffer_8x32(input, in);
    1309           0 :     input += 32 << 3;
    1310             : 
    1311             :     // Transpose 32x8 block to 8x32 block
    1312           0 :     array_transpose_8x8(in, in);
    1313           0 :     array_transpose_8x8(in + 8, in + 8);
    1314           0 :     array_transpose_8x8(in + 16, in + 16);
    1315           0 :     array_transpose_8x8(in + 24, in + 24);
    1316             : 
    1317           0 :     idct32_full_8x32(in, col + (i << 5));
    1318             :   }
    1319             : 
    1320             :   // columns
    1321           0 :   for (i = 0; i < 4; ++i) {
    1322           0 :     j = i << 3;
    1323             :     // Transpose 32x8 block to 8x32 block
    1324           0 :     array_transpose_8x8(col + j, in);
    1325           0 :     array_transpose_8x8(col + j + 32, in + 8);
    1326           0 :     array_transpose_8x8(col + j + 64, in + 16);
    1327           0 :     array_transpose_8x8(col + j + 96, in + 24);
    1328             : 
    1329           0 :     idct32_full_8x32(in, in);
    1330           0 :     store_buffer_8x32(in, dest, stride);
    1331           0 :     dest += 8;
    1332             :   }
    1333           0 : }

Generated by: LCOV version 1.13