LCOV - code coverage report
Current view: top level - third_party/aom/av1/common/x86 - idct_intrin_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 746 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 24 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "./av1_rtcd.h"
      13             : #include "aom_dsp/x86/inv_txfm_sse2.h"
      14             : #include "aom_dsp/x86/synonyms.h"
      15             : #include "aom_dsp/x86/txfm_common_sse2.h"
      16             : #include "aom_ports/mem.h"
      17             : #include "av1/common/enums.h"
      18             : 
      19             : #if CONFIG_EXT_TX
      20           0 : static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) {
                         // Reverses, in place, the four 16-bit lanes inside each 64-bit half of
                         // in[0] and in[1]. Shuffle control 0x1b (binary 00 01 10 11) selects
                         // source lanes 3,2,1,0. The low and high halves are reversed
                         // independently; they are not exchanged with each other.
                         // Presumably each 64-bit half holds one row of a 4x4 block - confirm
                         // against the callers' data layout.
      21           0 :   in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
      22           0 :   in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
      23           0 :   in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
      24           0 :   in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
      25           0 : }
      26             : 
      27           0 : static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) {
                         // Replaces each of in[0]..in[7] with mm_reverse_epi16() of itself.
                         // mm_reverse_epi16 is defined outside this listing; judging by its name
                         // it reverses the eight 16-bit lanes of a register - confirm there.
      28           0 :   in[0] = mm_reverse_epi16(in[0]);
      29           0 :   in[1] = mm_reverse_epi16(in[1]);
      30           0 :   in[2] = mm_reverse_epi16(in[2]);
      31           0 :   in[3] = mm_reverse_epi16(in[3]);
      32             : 
      33           0 :   in[4] = mm_reverse_epi16(in[4]);
      34           0 :   in[5] = mm_reverse_epi16(in[5]);
      35           0 :   in[6] = mm_reverse_epi16(in[6]);
      36           0 :   in[7] = mm_reverse_epi16(in[7]);
      37           0 : }
      38             : 
      39           0 : static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
                         // Applies fliplr_8x8 to registers in[0..7] and then to in[8..15], so
                         // all sixteen registers are lane-reversed in place. The order of the
                         // registers themselves is left unchanged.
      40           0 :   fliplr_8x8(&in[0]);
      41           0 :   fliplr_8x8(&in[8]);
      42           0 : }
      43             : 
      44             : #define FLIPLR_16x16(in0, in1) \
      45             :   do { /* in0 and in1 are assigned to below: pass pointer variables */ \
      46             :     __m128i *tmp;              \
      47             :     fliplr_16x8(in0); /* lane-reverse the 16 registers behind each pointer */ \
      48             :     fliplr_16x8(in1);          \
      49             :     tmp = (in0); /* then exchange the two pointers themselves */ \
      50             :     (in0) = (in1);             \
      51             :     (in1) = tmp;               \
      52             :   } while (0)
      53             : 
      54             : #define FLIPUD_PTR(dest, stride, size)       \
      55             :   do { /* modifies BOTH dest and stride in the caller's scope */ \
      56             :     (dest) = (dest) + ((size)-1) * (stride); /* point at row size-1 */ \
      57             :     (stride) = -(stride); /* dest + k * stride now walks back toward row 0 */ \
      58             :   } while (0)
      59             : #endif
      60             : 
      61           0 : void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
      62             :                             int tx_type) {
                         // Loads the coefficients at input and input + 8 into two registers,
                         // runs two 1-D passes (aom_idct4_sse2 / aom_iadst4_sse2) selected by
                         // tx_type, rounds with (x + 8) >> 4, and adds the result to the four
                         // 4-byte rows at dest, dest + stride, ... with unsigned saturation.
                         // A tx_type not listed in the switch reaches assert(0).
      63             :   __m128i in[2];
      64           0 :   const __m128i zero = _mm_setzero_si128();
      65           0 :   const __m128i eight = _mm_set1_epi16(8);
      66             : 
      67           0 :   in[0] = load_input_data(input);
      68           0 :   in[1] = load_input_data(input + 8);
      69             : 
                         // Each case applies two 1-D passes in sequence on the same registers.
      70           0 :   switch (tx_type) {
      71             :     case DCT_DCT:
      72           0 :       aom_idct4_sse2(in);
      73           0 :       aom_idct4_sse2(in);
      74           0 :       break;
      75             :     case ADST_DCT:
      76           0 :       aom_idct4_sse2(in);
      77           0 :       aom_iadst4_sse2(in);
      78           0 :       break;
      79             :     case DCT_ADST:
      80           0 :       aom_iadst4_sse2(in);
      81           0 :       aom_idct4_sse2(in);
      82           0 :       break;
      83             :     case ADST_ADST:
      84           0 :       aom_iadst4_sse2(in);
      85           0 :       aom_iadst4_sse2(in);
      86           0 :       break;
      87             : #if CONFIG_EXT_TX
                         // Flipped variants: FLIPUD_PTR re-points dest at row 3 and negates
                         // stride so the stores below run bottom-up; fliplr_4x4 reverses the
                         // lanes of the transformed data before it is added to dest.
      88             :     case FLIPADST_DCT:
      89           0 :       aom_idct4_sse2(in);
      90           0 :       aom_iadst4_sse2(in);
      91           0 :       FLIPUD_PTR(dest, stride, 4);
      92           0 :       break;
      93             :     case DCT_FLIPADST:
      94           0 :       aom_iadst4_sse2(in);
      95           0 :       aom_idct4_sse2(in);
      96           0 :       fliplr_4x4(in);
      97           0 :       break;
      98             :     case FLIPADST_FLIPADST:
      99           0 :       aom_iadst4_sse2(in);
     100           0 :       aom_iadst4_sse2(in);
     101           0 :       FLIPUD_PTR(dest, stride, 4);
     102           0 :       fliplr_4x4(in);
     103           0 :       break;
     104             :     case ADST_FLIPADST:
     105           0 :       aom_iadst4_sse2(in);
     106           0 :       aom_iadst4_sse2(in);
     107           0 :       fliplr_4x4(in);
     108           0 :       break;
     109             :     case FLIPADST_ADST:
     110           0 :       aom_iadst4_sse2(in);
     111           0 :       aom_iadst4_sse2(in);
     112           0 :       FLIPUD_PTR(dest, stride, 4);
     113           0 :       break;
     114             : #endif  // CONFIG_EXT_TX
     115           0 :     default: assert(0); break;
     116             :   }
     117             : 
     118             :   // Final round and shift
     119           0 :   in[0] = _mm_add_epi16(in[0], eight);
     120           0 :   in[1] = _mm_add_epi16(in[1], eight);
     121             : 
     122           0 :   in[0] = _mm_srai_epi16(in[0], 4);
     123           0 :   in[1] = _mm_srai_epi16(in[1], 4);
     124             : 
     125             :   // Reconstruction and Store
     126             :   {
                           // Each row of dest is read and written through an int-typed pointer.
                           // NOTE(review): this relies on int being 32 bits and on the
                           // unaligned, type-punned access to uint8_t data being acceptable
                           // for the target compiler - confirm.
     127           0 :     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
     128           0 :     __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     129           0 :     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
     130           0 :     __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
                           // Pair rows 0/1 and rows 2/3 into the low 64 bits, then zero-extend
                           // the bytes to 16 bits so they can be added to in[0] / in[1].
     131           0 :     d0 = _mm_unpacklo_epi32(d0, d1);
     132           0 :     d2 = _mm_unpacklo_epi32(d2, d3);
     133           0 :     d0 = _mm_unpacklo_epi8(d0, zero);
     134           0 :     d2 = _mm_unpacklo_epi8(d2, zero);
     135           0 :     d0 = _mm_add_epi16(d0, in[0]);
     136           0 :     d2 = _mm_add_epi16(d2, in[1]);
                           // Pack back to bytes with unsigned saturation; d0 then holds the four
                           // output rows as consecutive 32-bit words, peeled off one at a time.
     137           0 :     d0 = _mm_packus_epi16(d0, d2);
     138             :     // store result[0]
     139           0 :     *(int *)dest = _mm_cvtsi128_si32(d0);
     140             :     // store result[1]
     141           0 :     d0 = _mm_srli_si128(d0, 4);
     142           0 :     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
     143             :     // store result[2]
     144           0 :     d0 = _mm_srli_si128(d0, 4);
     145           0 :     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
     146             :     // store result[3]
     147           0 :     d0 = _mm_srli_si128(d0, 4);
     148           0 :     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
     149             :   }
     150           0 : }
     151             : 
     152           0 : void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
     153             :                             int tx_type) {
                         // Loads eight registers from input (8 coefficients apart), runs two
                         // 1-D passes (aom_idct8_sse2 / aom_iadst8_sse2) selected by tx_type,
                         // rounds with a saturating add of 16 followed by >> 5, and hands each
                         // register to RECON_AND_STORE for rows dest + k * stride, k = 0..7.
                         // A tx_type not listed in the switch reaches assert(0).
     154             :   __m128i in[8];
                         // zero is not referenced by name in this function; presumably the
                         // RECON_AND_STORE macro (defined elsewhere) uses it - confirm.
     155           0 :   const __m128i zero = _mm_setzero_si128();
     156           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
     157             : 
     158             :   // load input data
     159           0 :   in[0] = load_input_data(input);
     160           0 :   in[1] = load_input_data(input + 8 * 1);
     161           0 :   in[2] = load_input_data(input + 8 * 2);
     162           0 :   in[3] = load_input_data(input + 8 * 3);
     163           0 :   in[4] = load_input_data(input + 8 * 4);
     164           0 :   in[5] = load_input_data(input + 8 * 5);
     165           0 :   in[6] = load_input_data(input + 8 * 6);
     166           0 :   in[7] = load_input_data(input + 8 * 7);
     167             : 
     168           0 :   switch (tx_type) {
     169             :     case DCT_DCT:
     170           0 :       aom_idct8_sse2(in);
     171           0 :       aom_idct8_sse2(in);
     172           0 :       break;
     173             :     case ADST_DCT:
     174           0 :       aom_idct8_sse2(in);
     175           0 :       aom_iadst8_sse2(in);
     176           0 :       break;
     177             :     case DCT_ADST:
     178           0 :       aom_iadst8_sse2(in);
     179           0 :       aom_idct8_sse2(in);
     180           0 :       break;
     181             :     case ADST_ADST:
     182           0 :       aom_iadst8_sse2(in);
     183           0 :       aom_iadst8_sse2(in);
     184           0 :       break;
     185             : #if CONFIG_EXT_TX
                         // Flipped variants: FLIPUD_PTR makes the stores below run from row 7
                         // upward (dest and stride are both changed); fliplr_8x8 lane-reverses
                         // every register before reconstruction.
     186             :     case FLIPADST_DCT:
     187           0 :       aom_idct8_sse2(in);
     188           0 :       aom_iadst8_sse2(in);
     189           0 :       FLIPUD_PTR(dest, stride, 8);
     190           0 :       break;
     191             :     case DCT_FLIPADST:
     192           0 :       aom_iadst8_sse2(in);
     193           0 :       aom_idct8_sse2(in);
     194           0 :       fliplr_8x8(in);
     195           0 :       break;
     196             :     case FLIPADST_FLIPADST:
     197           0 :       aom_iadst8_sse2(in);
     198           0 :       aom_iadst8_sse2(in);
     199           0 :       FLIPUD_PTR(dest, stride, 8);
     200           0 :       fliplr_8x8(in);
     201           0 :       break;
     202             :     case ADST_FLIPADST:
     203           0 :       aom_iadst8_sse2(in);
     204           0 :       aom_iadst8_sse2(in);
     205           0 :       fliplr_8x8(in);
     206           0 :       break;
     207             :     case FLIPADST_ADST:
     208           0 :       aom_iadst8_sse2(in);
     209           0 :       aom_iadst8_sse2(in);
     210           0 :       FLIPUD_PTR(dest, stride, 8);
     211           0 :       break;
     212             : #endif  // CONFIG_EXT_TX
     213           0 :     default: assert(0); break;
     214             :   }
     215             : 
     216             :   // Final rounding and shift
                         // _mm_adds_epi16 saturates, so adding the rounding constant cannot wrap.
     217           0 :   in[0] = _mm_adds_epi16(in[0], final_rounding);
     218           0 :   in[1] = _mm_adds_epi16(in[1], final_rounding);
     219           0 :   in[2] = _mm_adds_epi16(in[2], final_rounding);
     220           0 :   in[3] = _mm_adds_epi16(in[3], final_rounding);
     221           0 :   in[4] = _mm_adds_epi16(in[4], final_rounding);
     222           0 :   in[5] = _mm_adds_epi16(in[5], final_rounding);
     223           0 :   in[6] = _mm_adds_epi16(in[6], final_rounding);
     224           0 :   in[7] = _mm_adds_epi16(in[7], final_rounding);
     225             : 
     226           0 :   in[0] = _mm_srai_epi16(in[0], 5);
     227           0 :   in[1] = _mm_srai_epi16(in[1], 5);
     228           0 :   in[2] = _mm_srai_epi16(in[2], 5);
     229           0 :   in[3] = _mm_srai_epi16(in[3], 5);
     230           0 :   in[4] = _mm_srai_epi16(in[4], 5);
     231           0 :   in[5] = _mm_srai_epi16(in[5], 5);
     232           0 :   in[6] = _mm_srai_epi16(in[6], 5);
     233           0 :   in[7] = _mm_srai_epi16(in[7], 5);
     234             : 
                         // stride may have been negated by FLIPUD_PTR above, in which case
                         // these offsets step backward from the last row.
     235           0 :   RECON_AND_STORE(dest + 0 * stride, in[0]);
     236           0 :   RECON_AND_STORE(dest + 1 * stride, in[1]);
     237           0 :   RECON_AND_STORE(dest + 2 * stride, in[2]);
     238           0 :   RECON_AND_STORE(dest + 3 * stride, in[3]);
     239           0 :   RECON_AND_STORE(dest + 4 * stride, in[4]);
     240           0 :   RECON_AND_STORE(dest + 5 * stride, in[5]);
     241           0 :   RECON_AND_STORE(dest + 6 * stride, in[6]);
     242           0 :   RECON_AND_STORE(dest + 7 * stride, in[7]);
     243           0 : }
     244             : 
     245             : #if CONFIG_EXT_TX
     246           0 : static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
                         // One 16-point identity pass over a block held as two register arrays:
                         // transposes the pair with array_transpose_16x16, then runs idtx16_8col
                         // on each array. Both helpers are defined outside this listing.
     247           0 :   array_transpose_16x16(in0, in1);
     248           0 :   idtx16_8col(in0);
     249           0 :   idtx16_8col(in1);
     250           0 : }
     251             : #endif  // CONFIG_EXT_TX
     252             : 
     253           0 : void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
     254             :                                int stride, int tx_type) {
                         // Loads the block into two 16-register arrays (from input and from
                         // input + 8), runs two 16-point passes selected by tx_type, then writes
                         // the arrays with write_buffer_8x16 at dest and dest + 8.
                         // A tx_type not listed in the switch reaches assert(0).
     255             :   __m128i in[32];
                         // in0 / in1 are plain pointer variables because FLIPLR_16x16 swaps them.
     256           0 :   __m128i *in0 = &in[0];
     257           0 :   __m128i *in1 = &in[16];
     258             : 
     259           0 :   load_buffer_8x16(input, in0);
     260           0 :   input += 8;
     261           0 :   load_buffer_8x16(input, in1);
     262             : 
     263           0 :   switch (tx_type) {
     264             :     case DCT_DCT:
     265           0 :       aom_idct16_sse2(in0, in1);
     266           0 :       aom_idct16_sse2(in0, in1);
     267           0 :       break;
     268             :     case ADST_DCT:
     269           0 :       aom_idct16_sse2(in0, in1);
     270           0 :       aom_iadst16_sse2(in0, in1);
     271           0 :       break;
     272             :     case DCT_ADST:
     273           0 :       aom_iadst16_sse2(in0, in1);
     274           0 :       aom_idct16_sse2(in0, in1);
     275           0 :       break;
     276             :     case ADST_ADST:
     277           0 :       aom_iadst16_sse2(in0, in1);
     278           0 :       aom_iadst16_sse2(in0, in1);
     279           0 :       break;
     280             : #if CONFIG_EXT_TX
                         // Flipped variants: FLIPUD_PTR redirects the final writes to start at
                         // row 15 with a negated stride; FLIPLR_16x16 lane-reverses every
                         // register and swaps in0 with in1, so the two arrays also trade places
                         // in the writes at dest and dest + 8.
     281             :     case FLIPADST_DCT:
     282           0 :       aom_idct16_sse2(in0, in1);
     283           0 :       aom_iadst16_sse2(in0, in1);
     284           0 :       FLIPUD_PTR(dest, stride, 16);
     285           0 :       break;
     286             :     case DCT_FLIPADST:
     287           0 :       aom_iadst16_sse2(in0, in1);
     288           0 :       aom_idct16_sse2(in0, in1);
     289           0 :       FLIPLR_16x16(in0, in1);
     290           0 :       break;
     291             :     case FLIPADST_FLIPADST:
     292           0 :       aom_iadst16_sse2(in0, in1);
     293           0 :       aom_iadst16_sse2(in0, in1);
     294           0 :       FLIPUD_PTR(dest, stride, 16);
     295           0 :       FLIPLR_16x16(in0, in1);
     296           0 :       break;
     297             :     case ADST_FLIPADST:
     298           0 :       aom_iadst16_sse2(in0, in1);
     299           0 :       aom_iadst16_sse2(in0, in1);
     300           0 :       FLIPLR_16x16(in0, in1);
     301           0 :       break;
     302             :     case FLIPADST_ADST:
     303           0 :       aom_iadst16_sse2(in0, in1);
     304           0 :       aom_iadst16_sse2(in0, in1);
     305           0 :       FLIPUD_PTR(dest, stride, 16);
     306           0 :       break;
                         // Identity-based variants: iidtx16_sse2 takes the place of one or both
                         // of the DCT / ADST passes.
     307             :     case IDTX:
     308           0 :       iidtx16_sse2(in0, in1);
     309           0 :       iidtx16_sse2(in0, in1);
     310           0 :       break;
     311             :     case V_DCT:
     312           0 :       iidtx16_sse2(in0, in1);
     313           0 :       aom_idct16_sse2(in0, in1);
     314           0 :       break;
     315             :     case H_DCT:
     316           0 :       aom_idct16_sse2(in0, in1);
     317           0 :       iidtx16_sse2(in0, in1);
     318           0 :       break;
     319             :     case V_ADST:
     320           0 :       iidtx16_sse2(in0, in1);
     321           0 :       aom_iadst16_sse2(in0, in1);
     322           0 :       break;
     323             :     case H_ADST:
     324           0 :       aom_iadst16_sse2(in0, in1);
     325           0 :       iidtx16_sse2(in0, in1);
     326           0 :       break;
     327             :     case V_FLIPADST:
     328           0 :       iidtx16_sse2(in0, in1);
     329           0 :       aom_iadst16_sse2(in0, in1);
     330           0 :       FLIPUD_PTR(dest, stride, 16);
     331           0 :       break;
     332             :     case H_FLIPADST:
     333           0 :       aom_iadst16_sse2(in0, in1);
     334           0 :       iidtx16_sse2(in0, in1);
     335           0 :       FLIPLR_16x16(in0, in1);
     336           0 :       break;
     337             : #endif  // CONFIG_EXT_TX
     338           0 :     default: assert(0); break;
     339             :   }
     340             : 
                         // No rounding is done here; presumably write_buffer_8x16 (defined
                         // elsewhere) rounds, shifts and reconstructs - confirm.
     341           0 :   write_buffer_8x16(dest, in0, stride);
     342           0 :   dest += 8;
     343           0 :   write_buffer_8x16(dest, in1, stride);
     344           0 : }
     345             : 
     346             : #if CONFIG_EXT_TX
     347           0 : static void iidtx8_sse2(__m128i *in) {
                         // Shifts every 16-bit lane of in[0]..in[7] left by one bit in place,
                         // i.e. doubles each lane. _mm_slli_epi16 does not saturate, so bits
                         // shifted out of a lane are discarded.
     348           0 :   in[0] = _mm_slli_epi16(in[0], 1);
     349           0 :   in[1] = _mm_slli_epi16(in[1], 1);
     350           0 :   in[2] = _mm_slli_epi16(in[2], 1);
     351           0 :   in[3] = _mm_slli_epi16(in[3], 1);
     352           0 :   in[4] = _mm_slli_epi16(in[4], 1);
     353           0 :   in[5] = _mm_slli_epi16(in[5], 1);
     354           0 :   in[6] = _mm_slli_epi16(in[6], 1);
     355           0 :   in[7] = _mm_slli_epi16(in[7], 1);
     356           0 : }
     357             : 
     358           0 : static INLINE void iidtx4_sse2(__m128i *in) {
                         // Multiplies every 16-bit lane of in[0] and in[1] by (int16_t)Sqrt2,
                         // rounds the 32-bit products by DCT_CONST_BITS and packs them back to
                         // 16 bits in place. Sqrt2 and DCT_CONST_BITS are defined elsewhere;
                         // presumably Sqrt2 is sqrt(2) in DCT_CONST_BITS fixed point - confirm.
     359           0 :   const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
     360             : 
                         // mullo / mulhi give the low and high 16 bits of each signed product.
     361           0 :   const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
     362           0 :   const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
     363           0 :   const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
     364           0 :   const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
     365             : 
                         // Interleaving low and high halves rebuilds the full 32-bit products:
                         // the "a" vectors hold lanes 0..3, the "b" vectors hold lanes 4..7.
     366           0 :   const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
     367           0 :   const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
     368           0 :   const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
     369           0 :   const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
     370             : 
                         // _mm_packs_epi32 narrows to 16 bits with signed saturation.
                         // NOTE(review): the products are signed, yet the rounding helper is
                         // named "..._unsigned"; its definition is not visible here - confirm
                         // it behaves as intended for negative inputs.
     371           0 :   in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
     372             :                           xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
     373           0 :   in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
     374             :                           xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
     375           0 : }
     376             : 
     377             : // Applies mm_reverse_epi16 in place to each of in[0]..in[7]; nothing is loaded.
     378           0 : static INLINE void flip_buffer_lr_8x8(__m128i *in) {
                         // The body is statement-for-statement the same as fliplr_8x8 defined
                         // earlier in this file.
     379           0 :   in[0] = mm_reverse_epi16(in[0]);
     380           0 :   in[1] = mm_reverse_epi16(in[1]);
     381           0 :   in[2] = mm_reverse_epi16(in[2]);
     382           0 :   in[3] = mm_reverse_epi16(in[3]);
     383           0 :   in[4] = mm_reverse_epi16(in[4]);
     384           0 :   in[5] = mm_reverse_epi16(in[5]);
     385           0 :   in[6] = mm_reverse_epi16(in[6]);
     386           0 :   in[7] = mm_reverse_epi16(in[7]);
     387           0 : }
     388             : #endif  // CONFIG_EXT_TX
     389             : 
     390           0 : void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
     391             :                               int stride, int tx_type) {
                         // Loads sixteen registers from input (8 coefficients apart), runs an
                         // 8-point pass on each group of eight registers, scales both groups
                         // with scale_sqrt2_8x8, runs one 16-point pass over all sixteen, and
                         // writes the result with write_buffer_8x16, flipped as tx_type requires.
                         // Each of the three switches reaches assert(0) for an unlisted tx_type.
     392             :   __m128i in[16];
     393             : 
     394           0 :   in[0] = load_input_data(input + 0 * 8);
     395           0 :   in[1] = load_input_data(input + 1 * 8);
     396           0 :   in[2] = load_input_data(input + 2 * 8);
     397           0 :   in[3] = load_input_data(input + 3 * 8);
     398           0 :   in[4] = load_input_data(input + 4 * 8);
     399           0 :   in[5] = load_input_data(input + 5 * 8);
     400           0 :   in[6] = load_input_data(input + 6 * 8);
     401           0 :   in[7] = load_input_data(input + 7 * 8);
     402             : 
     403           0 :   in[8] = load_input_data(input + 8 * 8);
     404           0 :   in[9] = load_input_data(input + 9 * 8);
     405           0 :   in[10] = load_input_data(input + 10 * 8);
     406           0 :   in[11] = load_input_data(input + 11 * 8);
     407           0 :   in[12] = load_input_data(input + 12 * 8);
     408           0 :   in[13] = load_input_data(input + 13 * 8);
     409           0 :   in[14] = load_input_data(input + 14 * 8);
     410           0 :   in[15] = load_input_data(input + 15 * 8);
     411             : 
     412             :   // Row transform
     413           0 :   switch (tx_type) {
     414             :     case DCT_DCT:
     415             :     case ADST_DCT:
     416             : #if CONFIG_EXT_TX
     417             :     case FLIPADST_DCT:
     418             :     case H_DCT:
     419             : #endif
     420           0 :       aom_idct8_sse2(in);
     421           0 :       array_transpose_8x8(in, in);
     422           0 :       aom_idct8_sse2(in + 8);
     423           0 :       array_transpose_8x8(in + 8, in + 8);
     424           0 :       break;
     425             :     case DCT_ADST:
     426             :     case ADST_ADST:
     427             : #if CONFIG_EXT_TX
     428             :     case DCT_FLIPADST:
     429             :     case FLIPADST_FLIPADST:
     430             :     case ADST_FLIPADST:
     431             :     case FLIPADST_ADST:
     432             :     case H_ADST:
     433             :     case H_FLIPADST:
     434             : #endif
     435           0 :       aom_iadst8_sse2(in);
     436           0 :       array_transpose_8x8(in, in);
     437           0 :       aom_iadst8_sse2(in + 8);
     438           0 :       array_transpose_8x8(in + 8, in + 8);
     439           0 :       break;
     440             : #if CONFIG_EXT_TX
                         // Unlike the two groups above, no array_transpose_8x8 follows the
                         // identity pass here.
     441             :     case V_FLIPADST:
     442             :     case V_ADST:
     443             :     case V_DCT:
     444             :     case IDTX:
     445           0 :       iidtx8_sse2(in);
     446           0 :       iidtx8_sse2(in + 8);
     447           0 :       break;
     448             : #endif
     449           0 :     default: assert(0); break;
     450             :   }
                         // Applied to both 8-register groups for every tx_type.
     451           0 :   scale_sqrt2_8x8(in);
     452           0 :   scale_sqrt2_8x8(in + 8);
     453             : 
     454             :   // Column transform
     455           0 :   switch (tx_type) {
     456             :     case DCT_DCT:
     457             :     case DCT_ADST:
     458             : #if CONFIG_EXT_TX
     459             :     case DCT_FLIPADST:
     460             :     case V_DCT:
     461             : #endif
     462           0 :       idct16_8col(in);
     463           0 :       break;
     464             :     case ADST_DCT:
     465             :     case ADST_ADST:
     466             : #if CONFIG_EXT_TX
     467             :     case FLIPADST_ADST:
     468             :     case ADST_FLIPADST:
     469             :     case FLIPADST_FLIPADST:
     470             :     case FLIPADST_DCT:
     471             :     case V_ADST:
     472             :     case V_FLIPADST:
     473             : #endif
     474           0 :       iadst16_8col(in);
     475           0 :       break;
     476             : #if CONFIG_EXT_TX
     477             :     case H_DCT:
     478             :     case H_ADST:
     479             :     case H_FLIPADST:
     480           0 :     case IDTX: idtx16_8col(in); break;
     481             : #endif
     482           0 :     default: assert(0); break;
     483             :   }
     484             : 
                         // Output: unflipped types write top-down; the flipped types below
                         // reverse the write direction, the lanes, or both.
     485           0 :   switch (tx_type) {
     486             :     case DCT_DCT:
     487             :     case ADST_DCT:
     488             : #if CONFIG_EXT_TX
     489             :     case H_DCT:
     490             : #endif
     491             :     case DCT_ADST:
     492             :     case ADST_ADST:
     493             : #if CONFIG_EXT_TX
     494             :     case H_ADST:
     495             :     case V_ADST:
     496             :     case V_DCT:
     497             :     case IDTX:
     498             : #endif
     499           0 :       write_buffer_8x16(dest, in, stride);
     500           0 :       break;
     501             : #if CONFIG_EXT_TX
                         // Start at row 15 and pass -stride, so the rows are written bottom-up.
     502             :     case FLIPADST_DCT:
     503             :     case FLIPADST_ADST:
     504           0 :     case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
                         // Lane-reverse all sixteen registers, then write top-down.
     505             :     case DCT_FLIPADST:
     506             :     case ADST_FLIPADST:
     507             :     case H_FLIPADST:
     508           0 :       flip_buffer_lr_8x8(in);
     509           0 :       flip_buffer_lr_8x8(in + 8);
     510           0 :       write_buffer_8x16(dest, in, stride);
     511           0 :       break;
                         // Both: lane-reverse the registers and write bottom-up.
     512             :     case FLIPADST_FLIPADST:
     513           0 :       flip_buffer_lr_8x8(in);
     514           0 :       flip_buffer_lr_8x8(in + 8);
     515           0 :       write_buffer_8x16(dest + stride * 15, in, -stride);
     516           0 :       break;
     517             : #endif
     518           0 :     default: assert(0); break;
     519             :   }
     520           0 : }
     521             : 
     522           0 : static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
     523             :                                            int stride) {
                         // Rounds in[0]..in[7] in place with a saturating add of 32 followed by
                         // an arithmetic >> 6, then passes each register to RECON_AND_STORE for
                         // rows dest + k * stride, k = 0..7. The caller's in[] is modified.
     524           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
                         // zero is not referenced by name in this function; presumably the
                         // RECON_AND_STORE macro (defined elsewhere) uses it - confirm.
     525           0 :   const __m128i zero = _mm_setzero_si128();
     526             :   // Final rounding and shift
     527           0 :   in[0] = _mm_adds_epi16(in[0], final_rounding);
     528           0 :   in[1] = _mm_adds_epi16(in[1], final_rounding);
     529           0 :   in[2] = _mm_adds_epi16(in[2], final_rounding);
     530           0 :   in[3] = _mm_adds_epi16(in[3], final_rounding);
     531           0 :   in[4] = _mm_adds_epi16(in[4], final_rounding);
     532           0 :   in[5] = _mm_adds_epi16(in[5], final_rounding);
     533           0 :   in[6] = _mm_adds_epi16(in[6], final_rounding);
     534           0 :   in[7] = _mm_adds_epi16(in[7], final_rounding);
     535             : 
     536           0 :   in[0] = _mm_srai_epi16(in[0], 6);
     537           0 :   in[1] = _mm_srai_epi16(in[1], 6);
     538           0 :   in[2] = _mm_srai_epi16(in[2], 6);
     539           0 :   in[3] = _mm_srai_epi16(in[3], 6);
     540           0 :   in[4] = _mm_srai_epi16(in[4], 6);
     541           0 :   in[5] = _mm_srai_epi16(in[5], 6);
     542           0 :   in[6] = _mm_srai_epi16(in[6], 6);
     543           0 :   in[7] = _mm_srai_epi16(in[7], 6);
     544             : 
     545           0 :   RECON_AND_STORE(dest + 0 * stride, in[0]);
     546           0 :   RECON_AND_STORE(dest + 1 * stride, in[1]);
     547           0 :   RECON_AND_STORE(dest + 2 * stride, in[2]);
     548           0 :   RECON_AND_STORE(dest + 3 * stride, in[3]);
     549           0 :   RECON_AND_STORE(dest + 4 * stride, in[4]);
     550           0 :   RECON_AND_STORE(dest + 5 * stride, in[5]);
     551           0 :   RECON_AND_STORE(dest + 6 * stride, in[6]);
     552           0 :   RECON_AND_STORE(dest + 7 * stride, in[7]);
     553           0 : }
     554             : 
     555           0 : void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
     556             :                               int stride, int tx_type) {
     557             :   __m128i in[16];
     558             : 
     559             :   // Transpose 16x8 input into in[]
     560           0 :   in[0] = load_input_data(input + 0 * 16);
     561           0 :   in[1] = load_input_data(input + 1 * 16);
     562           0 :   in[2] = load_input_data(input + 2 * 16);
     563           0 :   in[3] = load_input_data(input + 3 * 16);
     564           0 :   in[4] = load_input_data(input + 4 * 16);
     565           0 :   in[5] = load_input_data(input + 5 * 16);
     566           0 :   in[6] = load_input_data(input + 6 * 16);
     567           0 :   in[7] = load_input_data(input + 7 * 16);
     568           0 :   array_transpose_8x8(in, in);
     569             : 
     570           0 :   in[8] = load_input_data(input + 8 + 0 * 16);
     571           0 :   in[9] = load_input_data(input + 8 + 1 * 16);
     572           0 :   in[10] = load_input_data(input + 8 + 2 * 16);
     573           0 :   in[11] = load_input_data(input + 8 + 3 * 16);
     574           0 :   in[12] = load_input_data(input + 8 + 4 * 16);
     575           0 :   in[13] = load_input_data(input + 8 + 5 * 16);
     576           0 :   in[14] = load_input_data(input + 8 + 6 * 16);
     577           0 :   in[15] = load_input_data(input + 8 + 7 * 16);
     578           0 :   array_transpose_8x8(in + 8, in + 8);
     579             : 
     580             :   // Row transform
     581           0 :   switch (tx_type) {
     582             :     case DCT_DCT:
     583             :     case ADST_DCT:
     584             : #if CONFIG_EXT_TX
     585             :     case FLIPADST_DCT:
     586             :     case H_DCT:
     587             : #endif
     588           0 :       idct16_8col(in);
     589           0 :       break;
     590             :     case DCT_ADST:
     591             :     case ADST_ADST:
     592             : #if CONFIG_EXT_TX
     593             :     case DCT_FLIPADST:
     594             :     case FLIPADST_FLIPADST:
     595             :     case ADST_FLIPADST:
     596             :     case FLIPADST_ADST:
     597             :     case H_ADST:
     598             :     case H_FLIPADST:
     599             : #endif
     600           0 :       iadst16_8col(in);
     601           0 :       break;
     602             : #if CONFIG_EXT_TX
     603             :     case V_FLIPADST:
     604             :     case V_ADST:
     605             :     case V_DCT:
     606           0 :     case IDTX: idtx16_8col(in); break;
     607             : #endif
     608           0 :     default: assert(0); break;
     609             :   }
     610             : 
     611             :   // Scale
     612           0 :   scale_sqrt2_8x8(in);
     613           0 :   scale_sqrt2_8x8(in + 8);
     614             : 
     615             :   // Column transform
     616           0 :   switch (tx_type) {
     617             :     case DCT_DCT:
     618             :     case DCT_ADST:
     619             : #if CONFIG_EXT_TX
     620             :     case DCT_FLIPADST:
     621             :     case V_DCT:
     622             : #endif
     623           0 :       aom_idct8_sse2(in);
     624           0 :       aom_idct8_sse2(in + 8);
     625           0 :       break;
     626             :     case ADST_DCT:
     627             :     case ADST_ADST:
     628             : #if CONFIG_EXT_TX
     629             :     case FLIPADST_ADST:
     630             :     case ADST_FLIPADST:
     631             :     case FLIPADST_FLIPADST:
     632             :     case FLIPADST_DCT:
     633             :     case V_ADST:
     634             :     case V_FLIPADST:
     635             : #endif
     636           0 :       aom_iadst8_sse2(in);
     637           0 :       aom_iadst8_sse2(in + 8);
     638           0 :       break;
     639             : #if CONFIG_EXT_TX
     640             :     case H_DCT:
     641             :     case H_ADST:
     642             :     case H_FLIPADST:
     643             :     case IDTX:
     644           0 :       array_transpose_8x8(in, in);
     645           0 :       array_transpose_8x8(in + 8, in + 8);
     646           0 :       iidtx8_sse2(in);
     647           0 :       iidtx8_sse2(in + 8);
     648           0 :       break;
     649             : #endif
     650           0 :     default: assert(0); break;
     651             :   }
     652             : 
     653           0 :   switch (tx_type) {
     654             :     case DCT_DCT:
     655             :     case ADST_DCT:
     656             :     case DCT_ADST:
     657             :     case ADST_ADST:
     658             : #if CONFIG_EXT_TX
     659             :     case H_DCT:
     660             :     case H_ADST:
     661             :     case V_ADST:
     662             :     case V_DCT:
     663             :     case IDTX:
     664             : #endif
     665           0 :       write_buffer_8x8_round6(dest, in, stride);
     666           0 :       write_buffer_8x8_round6(dest + 8, in + 8, stride);
     667           0 :       break;
     668             : #if CONFIG_EXT_TX
     669             :     case FLIPADST_DCT:
     670             :     case FLIPADST_ADST:
     671             :     case V_FLIPADST:
     672           0 :       write_buffer_8x8_round6(dest + stride * 7, in, -stride);
     673           0 :       write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
     674           0 :       break;
     675             :     case DCT_FLIPADST:
     676             :     case ADST_FLIPADST:
     677             :     case H_FLIPADST:
     678           0 :       flip_buffer_lr_8x8(in);
     679           0 :       flip_buffer_lr_8x8(in + 8);
     680           0 :       write_buffer_8x8_round6(dest, in + 8, stride);
     681           0 :       write_buffer_8x8_round6(dest + 8, in, stride);
     682           0 :       break;
     683             :     case FLIPADST_FLIPADST:
     684           0 :       flip_buffer_lr_8x8(in);
     685           0 :       flip_buffer_lr_8x8(in + 8);
     686           0 :       write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
     687           0 :       write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
     688           0 :       break;
     689             : #endif
     690           0 :     default: assert(0); break;
     691             :   }
     692           0 : }
     693             : 
                       // Rounds the four vectors in[0..3] and hands each one, together with the
                       // address of one row of 'dest' (rows are 'stride' bytes apart), to
                       // RECON_AND_STORE. The rounded values are written back into 'in'.
                       // NOTE(review): 'zero' is not referenced directly in this function;
                       // presumably RECON_AND_STORE (defined outside this chunk) expands to code
                       // that uses it -- confirm before removing it.
     694           0 : static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
     695             :                                            int stride) {
     696           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
     697           0 :   const __m128i zero = _mm_setzero_si128();
     698             :   // Final rounding and shift: signed saturating add of 1 << 4 to every 16-bit lane, then an arithmetic shift right by 5
     699           0 :   in[0] = _mm_adds_epi16(in[0], final_rounding);
     700           0 :   in[1] = _mm_adds_epi16(in[1], final_rounding);
     701           0 :   in[2] = _mm_adds_epi16(in[2], final_rounding);
     702           0 :   in[3] = _mm_adds_epi16(in[3], final_rounding);
     703             : 
     704           0 :   in[0] = _mm_srai_epi16(in[0], 5);
     705           0 :   in[1] = _mm_srai_epi16(in[1], 5);
     706           0 :   in[2] = _mm_srai_epi16(in[2], 5);
     707           0 :   in[3] = _mm_srai_epi16(in[3], 5);
     708             : 
                         // One vector per destination row, rows 0..3.
     709           0 :   RECON_AND_STORE(dest + 0 * stride, in[0]);
     710           0 :   RECON_AND_STORE(dest + 1 * stride, in[1]);
     711           0 :   RECON_AND_STORE(dest + 2 * stride, in[2]);
     712           0 :   RECON_AND_STORE(dest + 3 * stride, in[3]);
     713           0 : }
     714             : 
                       // Inverse hybrid transform whose result is combined with 'dest' by
                       // write_buffer_8x4_round5() (four rows, 'stride' bytes apart).
                       //   input   - coefficients; four vectors are loaded from offsets 0, 8, 16
                       //             and 24.
                       //   dest    - destination pixels, updated in place.
                       //   stride  - distance in bytes between rows of 'dest'.
                       //   tx_type - selects the 1-D transform used in each pass and, for the
                       //             FLIPADST variants, how the output is flipped. Values not
                       //             listed in a switch hit assert(0).
                       // Stages: row transform (8-point helpers), scaling, repack, column
                       // transform (4-point helpers), repack, optional flip, rounded store.
     715           0 : void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
     716             :                             int tx_type) {
     717             :   __m128i in[8];
     718             : 
     719           0 :   in[0] = load_input_data(input + 0 * 8);
     720           0 :   in[1] = load_input_data(input + 1 * 8);
     721           0 :   in[2] = load_input_data(input + 2 * 8);
     722           0 :   in[3] = load_input_data(input + 3 * 8);
     723             : 
                         // NOTE(review): only in[0..3] have been assigned at this point, but the
                         // whole 8-element array is passed to the helpers below and in[4..7] are
                         // read by the repacking step after the switch. Confirm that the helpers
                         // write in[4..7] and that their results in the lanes used later do not
                         // depend on the initial (indeterminate) contents of in[4..7].
     724             :   // Row transform
     725           0 :   switch (tx_type) {
     726             :     case DCT_DCT:
     727             :     case ADST_DCT:
     728             : #if CONFIG_EXT_TX
     729             :     case FLIPADST_DCT:
     730             :     case H_DCT:
     731             : #endif
     732           0 :       aom_idct8_sse2(in);
     733           0 :       break;
     734             :     case DCT_ADST:
     735           0 :     case ADST_ADST: aom_iadst8_sse2(in); break;
     736             : #if CONFIG_EXT_TX
     737             :     case DCT_FLIPADST:
     738             :     case FLIPADST_FLIPADST:
     739             :     case ADST_FLIPADST:
     740             :     case FLIPADST_ADST:
     741             :     case H_ADST:
     742           0 :     case H_FLIPADST: aom_iadst8_sse2(in); break;
     743             :     case V_FLIPADST:
     744             :     case V_ADST:
     745             :     case V_DCT:
     746           0 :     case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
     747             : #endif
                               // This break sits outside the #if: it ends the IDTX group when
                               // CONFIG_EXT_TX is set and is otherwise a redundant break that follows
                               // the one on the ADST_ADST line.
     748           0 :       break;
     749           0 :     default: assert(0); break;
     750             :   }
     751             : 
                         // Scaling step between the two passes (helper defined outside this chunk;
                         // by its name, a sqrt(2) scale).
     752           0 :   scale_sqrt2_8x8(in);
     753             : 
     754             :   // Repack data. We pack into the bottom half of 'in'
     755             :   // so that the next repacking stage can pack into the
     756             :   // top half without overwriting anything
                         // Each result joins the low 64 bits of two neighbouring vectors. The
                         // assignments run from in[7] down to in[4] so that every source vector is
                         // read before it is overwritten.
     757           0 :   in[7] = _mm_unpacklo_epi64(in[6], in[7]);
     758           0 :   in[6] = _mm_unpacklo_epi64(in[4], in[5]);
     759           0 :   in[5] = _mm_unpacklo_epi64(in[2], in[3]);
     760           0 :   in[4] = _mm_unpacklo_epi64(in[0], in[1]);
     761             : 
     762             :   // Column transform
                         // Each 4-point helper is called twice, once on in[4..5] and once on
                         // in[6..7].
     763           0 :   switch (tx_type) {
     764             :     case DCT_DCT:
     765             :     case DCT_ADST:
     766             : #if CONFIG_EXT_TX
     767             :     case DCT_FLIPADST:
     768             :     case V_DCT:
     769             : #endif
     770           0 :       aom_idct4_sse2(in + 4);
     771           0 :       aom_idct4_sse2(in + 6);
     772           0 :       break;
     773             :     case ADST_DCT:
     774             :     case ADST_ADST:
     775             : #if CONFIG_EXT_TX
     776             :     case FLIPADST_ADST:
     777             :     case ADST_FLIPADST:
     778             :     case FLIPADST_FLIPADST:
     779             :     case FLIPADST_DCT:
     780             :     case V_ADST:
     781             :     case V_FLIPADST:
     782             : #endif
     783           0 :       aom_iadst4_sse2(in + 4);
     784           0 :       aom_iadst4_sse2(in + 6);
     785           0 :       break;
     786             : #if CONFIG_EXT_TX
     787             :     case H_DCT:
     788             :     case H_ADST:
     789             :     case H_FLIPADST:
     790             :     case IDTX:
     791           0 :       iidtx4_sse2(in + 4);
     792           0 :       array_transpose_4x4(in + 4);
     793           0 :       iidtx4_sse2(in + 6);
     794           0 :       array_transpose_4x4(in + 6);
     795           0 :       break;
     796             : #endif
     797           0 :     default: assert(0); break;
     798             :   }
     799             : 
     800             :   // Repack data
                         // Combines matching 64-bit halves of the two column-transform results:
                         // in[0]/in[1] take the low/high halves of in[4] and in[6], in[2]/in[3]
                         // take the low/high halves of in[5] and in[7].
     801           0 :   in[0] = _mm_unpacklo_epi64(in[4], in[6]);
     802           0 :   in[1] = _mm_unpackhi_epi64(in[4], in[6]);
     803           0 :   in[2] = _mm_unpacklo_epi64(in[5], in[7]);
     804           0 :   in[3] = _mm_unpackhi_epi64(in[5], in[7]);
     805             : 
                         // Output flipping. FLIPUD_PTR and mm_reverse_epi16 are defined outside
                         // this chunk; presumably the former adjusts 'dest'/'stride' so rows are
                         // written bottom-up and the latter reverses the 16-bit lanes of a vector
                         // -- confirm against their definitions.
     806           0 :   switch (tx_type) {
     807             :     case DCT_DCT:
     808             :     case ADST_DCT:
     809             :     case DCT_ADST:
     810             :     case ADST_ADST:
     811             : #if CONFIG_EXT_TX
     812             :     case H_DCT:
     813             :     case H_ADST:
     814             :     case V_ADST:
     815             :     case V_DCT:
     816           0 :     case IDTX: break;
     817             :     case FLIPADST_DCT:
     818             :     case FLIPADST_ADST:
     819           0 :     case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
     820             :     case DCT_FLIPADST:
     821             :     case ADST_FLIPADST:
     822             :     case H_FLIPADST:
     823           0 :       in[0] = mm_reverse_epi16(in[0]);
     824           0 :       in[1] = mm_reverse_epi16(in[1]);
     825           0 :       in[2] = mm_reverse_epi16(in[2]);
     826           0 :       in[3] = mm_reverse_epi16(in[3]);
     827           0 :       break;
     828             :     case FLIPADST_FLIPADST:
     829           0 :       in[0] = mm_reverse_epi16(in[0]);
     830           0 :       in[1] = mm_reverse_epi16(in[1]);
     831           0 :       in[2] = mm_reverse_epi16(in[2]);
     832           0 :       in[3] = mm_reverse_epi16(in[3]);
     833           0 :       FLIPUD_PTR(dest, stride, 4);
     834             : #endif
                               // Outside the #if: ends FLIPADST_FLIPADST when CONFIG_EXT_TX is set,
                               // and ends the DCT/ADST group (which then has no other statement)
                               // when it is not.
     835           0 :       break;
     836           0 :     default: assert(0); break;
     837             :   }
     838           0 :   write_buffer_8x4_round5(dest, in, stride);
     839           0 : }
     840             : 
                       // Rounds the four vectors in[0..3], adds them to eight rows of 'dest' and
                       // stores the result back. Each access to 'dest' transfers one 'int' per
                       // row, and each vector of 'in' supplies the values for two consecutive
                       // rows (in[0] -> rows 0 and 1, ..., in[3] -> rows 6 and 7). The rounded
                       // values are written back into 'in'.
                       // NOTE(review): 'dest' is read and written through int pointers, which
                       // relies on the platform tolerating that aliasing/alignment -- confirm
                       // this matches the project's conventions.
     841           0 : static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
     842             :                                            int stride) {
     843           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
     844           0 :   const __m128i zero = _mm_setzero_si128();
     845             :   // Final rounding and shift: signed saturating add of 1 << 4 to every 16-bit lane, then an arithmetic shift right by 5
     846           0 :   in[0] = _mm_adds_epi16(in[0], final_rounding);
     847           0 :   in[1] = _mm_adds_epi16(in[1], final_rounding);
     848           0 :   in[2] = _mm_adds_epi16(in[2], final_rounding);
     849           0 :   in[3] = _mm_adds_epi16(in[3], final_rounding);
     850             : 
     851           0 :   in[0] = _mm_srai_epi16(in[0], 5);
     852           0 :   in[1] = _mm_srai_epi16(in[1], 5);
     853           0 :   in[2] = _mm_srai_epi16(in[2], 5);
     854           0 :   in[3] = _mm_srai_epi16(in[3], 5);
     855             : 
     856             :   // Reconstruction and Store
     857             :   {
                           // Load one int from each of the eight destination rows into the low
                           // 32 bits of its own vector.
     858           0 :     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
     859           0 :     __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     860           0 :     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
     861           0 :     __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
     862           0 :     __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
     863           0 :     __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
     864           0 :     __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
     865           0 :     __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));
     866             : 
                           // Put two rows side by side in the low 64 bits of one vector, then
                           // widen those bytes to 16-bit lanes by interleaving with zero.
     867           0 :     d0 = _mm_unpacklo_epi32(d0, d1);
     868           0 :     d2 = _mm_unpacklo_epi32(d2, d3);
     869           0 :     d4 = _mm_unpacklo_epi32(d4, d5);
     870           0 :     d6 = _mm_unpacklo_epi32(d6, d7);
     871           0 :     d0 = _mm_unpacklo_epi8(d0, zero);
     872           0 :     d2 = _mm_unpacklo_epi8(d2, zero);
     873           0 :     d4 = _mm_unpacklo_epi8(d4, zero);
     874           0 :     d6 = _mm_unpacklo_epi8(d6, zero);
                           // Add the rounded values to the widened destination pixels.
     875           0 :     d0 = _mm_add_epi16(d0, in[0]);
     876           0 :     d2 = _mm_add_epi16(d2, in[1]);
     877           0 :     d4 = _mm_add_epi16(d4, in[2]);
     878           0 :     d6 = _mm_add_epi16(d6, in[3]);
     879             : 
                           // Narrow back to bytes with unsigned saturation (rows 0..3 end up in
                           // one vector), then store 32 bits per row, shifting the vector right
                           // by 4 bytes between stores.
     880           0 :     d0 = _mm_packus_epi16(d0, d2);
     881           0 :     *(int *)dest = _mm_cvtsi128_si32(d0);
     882           0 :     d0 = _mm_srli_si128(d0, 4);
     883           0 :     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
     884           0 :     d0 = _mm_srli_si128(d0, 4);
     885           0 :     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
     886           0 :     d0 = _mm_srli_si128(d0, 4);
     887           0 :     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
                           // Same again for rows 4..7; d0 is reused as the packed vector.
     888           0 :     d0 = _mm_packus_epi16(d4, d6);
     889           0 :     *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
     890           0 :     d0 = _mm_srli_si128(d0, 4);
     891           0 :     *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
     892           0 :     d0 = _mm_srli_si128(d0, 4);
     893           0 :     *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
     894           0 :     d0 = _mm_srli_si128(d0, 4);
     895           0 :     *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
     896             :   }
     897           0 : }
     898             : 
                       // Inverse hybrid transform whose result is combined with 'dest' by
                       // write_buffer_4x8_round5() (eight rows, 'stride' bytes apart).
                       //   input   - coefficients; four vectors are loaded from offsets 0, 8, 16
                       //             and 24.
                       //   dest    - destination pixels, updated in place.
                       //   stride  - distance in bytes between rows of 'dest'.
                       //   tx_type - selects the 1-D transform used in each pass and, for the
                       //             FLIPADST variants, how the output is flipped. Values not
                       //             listed in a switch hit assert(0).
                       // Stages: row transform (4-point helpers), scaling, repack, column
                       // transform (8-point helpers), optional flip, repack, rounded store.
     899           0 : void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
     900             :                             int tx_type) {
     901             :   __m128i in[8];
     902             : 
     903             :   // Load rows, packed two per element of 'in'.
     904             :   // We pack into the bottom half of 'in' so that the
     905             :   // later repacking stage can pack into the
     906             :   // top half without overwriting anything
     907           0 :   in[4] = load_input_data(input + 0 * 8);
     908           0 :   in[5] = load_input_data(input + 1 * 8);
     909           0 :   in[6] = load_input_data(input + 2 * 8);
     910           0 :   in[7] = load_input_data(input + 3 * 8);
     911             : 
     912             :   // Row transform
                         // Each 4-point helper is called twice, once on in[4..5] and once on
                         // in[6..7].
     913           0 :   switch (tx_type) {
     914             :     case DCT_DCT:
     915             :     case ADST_DCT:
     916             : #if CONFIG_EXT_TX
     917             :     case FLIPADST_DCT:
     918             :     case H_DCT:
     919             : #endif
     920           0 :       aom_idct4_sse2(in + 4);
     921           0 :       aom_idct4_sse2(in + 6);
     922           0 :       break;
     923             :     case DCT_ADST:
     924             :     case ADST_ADST:
     925             : #if CONFIG_EXT_TX
     926             :     case DCT_FLIPADST:
     927             :     case FLIPADST_FLIPADST:
     928             :     case ADST_FLIPADST:
     929             :     case FLIPADST_ADST:
     930             :     case H_ADST:
     931             :     case H_FLIPADST:
     932             : #endif
     933           0 :       aom_iadst4_sse2(in + 4);
     934           0 :       aom_iadst4_sse2(in + 6);
     935           0 :       break;
     936             : #if CONFIG_EXT_TX
     937             :     case V_FLIPADST:
     938             :     case V_ADST:
     939             :     case V_DCT:
     940             :     case IDTX:
     941           0 :       iidtx4_sse2(in + 4);
     942           0 :       array_transpose_4x4(in + 4);
     943           0 :       iidtx4_sse2(in + 6);
     944           0 :       array_transpose_4x4(in + 6);
     945           0 :       break;
     946             : #endif
     947           0 :     default: assert(0); break;
     948             :   }
     949             : 
                         // Scaling step between the two passes, applied to in[4..7] (helper defined
                         // outside this chunk; by its name, a sqrt(2) scale).
     950           0 :   scale_sqrt2_8x4(in + 4);
     951             : 
     952             :   // Repack data
                         // Combines matching 64-bit halves of the two row-transform results:
                         // in[0]/in[1] take the low/high halves of in[4] and in[6], in[2]/in[3]
                         // take the low/high halves of in[5] and in[7].
     953           0 :   in[0] = _mm_unpacklo_epi64(in[4], in[6]);
     954           0 :   in[1] = _mm_unpackhi_epi64(in[4], in[6]);
     955           0 :   in[2] = _mm_unpacklo_epi64(in[5], in[7]);
     956           0 :   in[3] = _mm_unpackhi_epi64(in[5], in[7]);
     957             : 
     958             :   // Column transform
     959           0 :   switch (tx_type) {
     960             :     case DCT_DCT:
     961             :     case DCT_ADST:
     962             : #if CONFIG_EXT_TX
     963             :     case DCT_FLIPADST:
     964             :     case V_DCT:
     965             : #endif
     966           0 :       aom_idct8_sse2(in);
     967           0 :       break;
     968             :     case ADST_DCT:
     969             :     case ADST_ADST:
     970             : #if CONFIG_EXT_TX
     971             :     case FLIPADST_ADST:
     972             :     case ADST_FLIPADST:
     973             :     case FLIPADST_FLIPADST:
     974             :     case FLIPADST_DCT:
     975             :     case V_ADST:
     976             :     case V_FLIPADST:
     977             : #endif
     978           0 :       aom_iadst8_sse2(in);
     979           0 :       break;
     980             : #if CONFIG_EXT_TX
     981             :     case H_DCT:
     982             :     case H_ADST:
     983             :     case H_FLIPADST:
     984             :     case IDTX:
     985           0 :       iidtx8_sse2(in);
     986           0 :       array_transpose_8x8(in, in);
     987           0 :       break;
     988             : #endif
     989           0 :     default: assert(0); break;
     990             :   }
     991             : 
                         // Output flipping. The shuffle immediate 0x1b reverses the order of the
                         // four low 16-bit lanes of a vector and leaves the upper 64 bits as they
                         // are. FLIPUD_PTR is defined outside this chunk; presumably it adjusts
                         // 'dest'/'stride' so rows are written bottom-up -- confirm.
     992           0 :   switch (tx_type) {
     993             :     case DCT_DCT:
     994             :     case ADST_DCT:
     995             :     case DCT_ADST:
     996             :     case ADST_ADST:
     997             : #if CONFIG_EXT_TX
     998             :     case H_DCT:
     999             :     case H_ADST:
    1000             :     case V_ADST:
    1001             :     case V_DCT:
    1002             :     case IDTX:
    1003             : #endif
    1004           0 :       break;
    1005             : #if CONFIG_EXT_TX
    1006             :     case FLIPADST_DCT:
    1007             :     case FLIPADST_ADST:
    1008           0 :     case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
    1009             :     case DCT_FLIPADST:
    1010             :     case ADST_FLIPADST:
    1011             :     case H_FLIPADST:
    1012           0 :       in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
    1013           0 :       in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
    1014           0 :       in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
    1015           0 :       in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
    1016           0 :       in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
    1017           0 :       in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
    1018           0 :       in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
    1019           0 :       in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
    1020           0 :       break;
    1021             :     case FLIPADST_FLIPADST:
    1022           0 :       in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
    1023           0 :       in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
    1024           0 :       in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
    1025           0 :       in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
    1026           0 :       in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
    1027           0 :       in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
    1028           0 :       in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
    1029           0 :       in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
    1030           0 :       FLIPUD_PTR(dest, stride, 8);
    1031           0 :       break;
    1032             : #endif
    1033           0 :     default: assert(0); break;
    1034             :   }
                         // Final repack: join the low 64 bits of each pair of vectors so that
                         // in[0..3] carry two output rows each, the layout that
                         // write_buffer_4x8_round5() consumes. The assignments run upwards, so
                         // each source pair is read before either member is overwritten.
    1035           0 :   in[0] = _mm_unpacklo_epi64(in[0], in[1]);
    1036           0 :   in[1] = _mm_unpacklo_epi64(in[2], in[3]);
    1037           0 :   in[2] = _mm_unpacklo_epi64(in[4], in[5]);
    1038           0 :   in[3] = _mm_unpacklo_epi64(in[6], in[7]);
    1039           0 :   write_buffer_4x8_round5(dest, in, stride);
    1040           0 : }
    1041             : 
    1042             : // Note: The 16-column 32-element transforms take input in the form of four
    1043             : // 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
    1044             : // of the overall 16x32 input buffer.
                       // idct32_16col: transposes the top pair (tl, tr) and the bottom pair
                       // (bl, br) with array_transpose_16x16(), then calls idct32_8col() once on
                       // the left quadrants (tl, bl) and once on the right quadrants (tr, br).
                       // All four arrays are modified in place.
    1045           0 : static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
    1046             :                                 __m128i *br) {
    1047           0 :   array_transpose_16x16(tl, tr);
    1048           0 :   array_transpose_16x16(bl, br);
    1049           0 :   idct32_8col(tl, bl);
    1050           0 :   idct32_8col(tr, br);
    1051           0 : }
    1052             : 
                       // 32-element column transform used for the ADST-type column cases of
                       // av1_iht16x32_512_add_sse2(). Operates in place on the four quadrant
                       // arrays (16 vectors each):
                       //   - new top half    = old bottom half with every 16-bit lane shifted
                       //                       left by 2, then transposed;
                       //   - new bottom half = old top half, scaled with scale_sqrt2_8x16() and
                       //                       passed through aom_idct16_sse2().
    1053           0 : static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
    1054             :                                       __m128i *br) {
    1055             :   __m128i tmpl[16], tmpr[16];
    1056             :   int i;
    1057             : 
    1058             :   // Copy the top half of the input to temporary storage
                         // (tl/tr are overwritten by the next loop, before they are consumed)
    1059           0 :   for (i = 0; i < 16; ++i) {
    1060           0 :     tmpl[i] = tl[i];
    1061           0 :     tmpr[i] = tr[i];
    1062             :   }
    1063             : 
    1064             :   // Generate the top half of the output
                         // (bottom-half input with each 16-bit lane shifted left by 2, i.e. times 4)
    1065           0 :   for (i = 0; i < 16; ++i) {
    1066           0 :     tl[i] = _mm_slli_epi16(bl[i], 2);
    1067           0 :     tr[i] = _mm_slli_epi16(br[i], 2);
    1068             :   }
    1069           0 :   array_transpose_16x16(tl, tr);
    1070             : 
    1071             :   // Copy the temporary storage back to the bottom half of the input
    1072           0 :   for (i = 0; i < 16; ++i) {
    1073           0 :     bl[i] = tmpl[i];
    1074           0 :     br[i] = tmpr[i];
    1075             :   }
    1076             : 
    1077             :   // Generate the bottom half of the output
    1078           0 :   scale_sqrt2_8x16(bl);
    1079           0 :   scale_sqrt2_8x16(br);
    1080           0 :   aom_idct16_sse2(bl, br);  // Includes a transposition
    1081           0 : }
    1082             : 
    1083             : #if CONFIG_EXT_TX
                       // Column stage used for the H_DCT, H_ADST, H_FLIPADST and IDTX cases of
                       // av1_iht16x32_512_add_sse2(): transposes the top pair (tl, tr) and the
                       // bottom pair (bl, br), then shifts every 16-bit lane of all four quadrant
                       // arrays left by 2 (a multiply by 4; _mm_slli_epi16 does not saturate).
                       // All four arrays are modified in place.
    1084           0 : static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
    1085             :                                  __m128i *br) {
    1086             :   int i;
    1087           0 :   array_transpose_16x16(tl, tr);
    1088           0 :   array_transpose_16x16(bl, br);
    1089           0 :   for (i = 0; i < 16; ++i) {
    1090           0 :     tl[i] = _mm_slli_epi16(tl[i], 2);
    1091           0 :     tr[i] = _mm_slli_epi16(tr[i], 2);
    1092           0 :     bl[i] = _mm_slli_epi16(bl[i], 2);
    1093           0 :     br[i] = _mm_slli_epi16(br[i], 2);
    1094             :   }
    1095           0 : }
    1096             : #endif  // CONFIG_EXT_TX
    1097             : 
                       // Rounds all four quadrant arrays (16 vectors each) and hands every vector
                       // to RECON_AND_STORE with its destination address:
                       //   intl[i] -> row i,      byte offset 0     intr[i] -> row i,      offset 8
                       //   inbl[i] -> row i + 16, byte offset 0     inbr[i] -> row i + 16, offset 8
                       // Rows of 'dest' are 'stride' bytes apart. The rounded values are written
                       // back into the input arrays.
                       // NOTE(review): 'zero' is not referenced directly in this function;
                       // presumably RECON_AND_STORE (defined outside this chunk) expands to code
                       // that uses it -- confirm before removing it.
    1098           0 : static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
    1099             :                                              __m128i *intr, __m128i *inbl,
    1100             :                                              __m128i *inbr, int stride) {
    1101           0 :   const __m128i zero = _mm_setzero_si128();
    1102           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    1103             :   int i;
    1104             : 
    1105           0 :   for (i = 0; i < 16; ++i) {
                           // Rounding: signed saturating add of 1 << 5, then arithmetic >> 6.
    1106           0 :     intl[i] = _mm_adds_epi16(intl[i], final_rounding);
    1107           0 :     intr[i] = _mm_adds_epi16(intr[i], final_rounding);
    1108           0 :     inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
    1109           0 :     inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
    1110           0 :     intl[i] = _mm_srai_epi16(intl[i], 6);
    1111           0 :     intr[i] = _mm_srai_epi16(intr[i], 6);
    1112           0 :     inbl[i] = _mm_srai_epi16(inbl[i], 6);
    1113           0 :     inbr[i] = _mm_srai_epi16(inbr[i], 6);
    1114           0 :     RECON_AND_STORE(dest + i * stride + 0, intl[i]);
    1115           0 :     RECON_AND_STORE(dest + i * stride + 8, intr[i]);
    1116           0 :     RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
    1117           0 :     RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
    1118             :   }
    1119           0 : }
    1120             : 
                       // Inverse hybrid transform whose result is combined with 'dest' by
                       // write_buffer_16x32_round6() (32 rows of 16 pixels, 'stride' bytes apart).
                       //   input   - coefficients, read as 32 rows of 16 values.
                       //   dest    - destination pixels, updated in place.
                       //   stride  - distance in bytes between rows of 'dest'.
                       //   tx_type - selects the 1-D transform used in each pass and, for the
                       //             FLIPADST variants, how the output is flipped. Values not
                       //             listed in a switch hit assert(0).
                       // The data is held as four quadrant arrays: intl/intr are the left/right
                       // halves of rows 0..15, inbl/inbr the left/right halves of rows 16..31.
                       // Stages: row transform (16-point helpers on the top and bottom halves),
                       // scaling, column transform (32-element helpers), optional flip, rounded
                       // store.
    1121           0 : void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
    1122             :                                int stride, int tx_type) {
    1123             :   __m128i intl[16], intr[16], inbl[16], inbr[16];
    1124             : 
    1125             :   int i;
                         // Row r starts at input + r * 16; offsets +0 and +8 select the left and
                         // right halves of the row.
    1126           0 :   for (i = 0; i < 16; ++i) {
    1127           0 :     intl[i] = load_input_data(input + i * 16 + 0);
    1128           0 :     intr[i] = load_input_data(input + i * 16 + 8);
    1129           0 :     inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
    1130           0 :     inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
    1131             :   }
    1132             : 
    1133             :   // Row transform
                         // Each helper is called once on the top pair and once on the bottom pair.
    1134           0 :   switch (tx_type) {
    1135             :     case DCT_DCT:
    1136             :     case ADST_DCT:
    1137             : #if CONFIG_EXT_TX
    1138             :     case FLIPADST_DCT:
    1139             :     case H_DCT:
    1140             : #endif
    1141           0 :       aom_idct16_sse2(intl, intr);
    1142           0 :       aom_idct16_sse2(inbl, inbr);
    1143           0 :       break;
    1144             :     case DCT_ADST:
    1145             :     case ADST_ADST:
    1146             : #if CONFIG_EXT_TX
    1147             :     case DCT_FLIPADST:
    1148             :     case FLIPADST_FLIPADST:
    1149             :     case ADST_FLIPADST:
    1150             :     case FLIPADST_ADST:
    1151             :     case H_ADST:
    1152             :     case H_FLIPADST:
    1153             : #endif
    1154           0 :       aom_iadst16_sse2(intl, intr);
    1155           0 :       aom_iadst16_sse2(inbl, inbr);
    1156           0 :       break;
    1157             : #if CONFIG_EXT_TX
    1158             :     case V_FLIPADST:
    1159             :     case V_ADST:
    1160             :     case V_DCT:
    1161             :     case IDTX:
    1162           0 :       iidtx16_sse2(intl, intr);
    1163           0 :       iidtx16_sse2(inbl, inbr);
    1164           0 :       break;
    1165             : #endif
    1166           0 :     default: assert(0); break;
    1167             :   }
    1168             : 
                         // Scaling step between the two passes, applied to every quadrant (helper
                         // defined outside this chunk; by its name, a sqrt(2) scale).
    1169           0 :   scale_sqrt2_8x16(intl);
    1170           0 :   scale_sqrt2_8x16(intr);
    1171           0 :   scale_sqrt2_8x16(inbl);
    1172           0 :   scale_sqrt2_8x16(inbr);
    1173             : 
    1174             :   // Column transform
    1175           0 :   switch (tx_type) {
    1176             :     case DCT_DCT:
    1177             :     case DCT_ADST:
    1178             : #if CONFIG_EXT_TX
    1179             :     case DCT_FLIPADST:
    1180             :     case V_DCT:
    1181             : #endif
    1182           0 :       idct32_16col(intl, intr, inbl, inbr);
    1183           0 :       break;
    1184             :     case ADST_DCT:
    1185             :     case ADST_ADST:
    1186             : #if CONFIG_EXT_TX
    1187             :     case FLIPADST_ADST:
    1188             :     case ADST_FLIPADST:
    1189             :     case FLIPADST_FLIPADST:
    1190             :     case FLIPADST_DCT:
    1191             :     case V_ADST:
    1192             :     case V_FLIPADST:
    1193             : #endif
    1194           0 :       ihalfright32_16col(intl, intr, inbl, inbr);
    1195           0 :       break;
    1196             : #if CONFIG_EXT_TX
    1197             :     case H_DCT:
    1198             :     case H_ADST:
    1199             :     case H_FLIPADST:
    1200           0 :     case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
    1201             : #endif
    1202           0 :     default: assert(0); break;
    1203             :   }
    1204             : 
                         // Output flipping. For the left-right cases the left and right quadrant
                         // vectors of each row are swapped and each is passed through
                         // mm_reverse_epi16(). FLIPUD_PTR and mm_reverse_epi16 are defined outside
                         // this chunk; presumably the former adjusts 'dest'/'stride' so rows are
                         // written bottom-up and the latter reverses the 16-bit lanes of a vector
                         // -- confirm against their definitions.
    1205           0 :   switch (tx_type) {
    1206             :     case DCT_DCT:
    1207             :     case ADST_DCT:
    1208             :     case DCT_ADST:
    1209             :     case ADST_ADST:
    1210             : #if CONFIG_EXT_TX
    1211             :     case H_DCT:
    1212             :     case H_ADST:
    1213             :     case V_ADST:
    1214             :     case V_DCT:
    1215             :     case IDTX:
    1216             : #endif
    1217           0 :       break;
    1218             : #if CONFIG_EXT_TX
    1219             :     case FLIPADST_DCT:
    1220             :     case FLIPADST_ADST:
    1221           0 :     case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
    1222             :     case DCT_FLIPADST:
    1223             :     case ADST_FLIPADST:
    1224             :     case H_FLIPADST:
    1225           0 :       for (i = 0; i < 16; ++i) {
    1226           0 :         __m128i tmp = intl[i];
    1227           0 :         intl[i] = mm_reverse_epi16(intr[i]);
    1228           0 :         intr[i] = mm_reverse_epi16(tmp);
    1229           0 :         tmp = inbl[i];
    1230           0 :         inbl[i] = mm_reverse_epi16(inbr[i]);
    1231           0 :         inbr[i] = mm_reverse_epi16(tmp);
    1232             :       }
    1233           0 :       break;
    1234             :     case FLIPADST_FLIPADST:
    1235           0 :       for (i = 0; i < 16; ++i) {
    1236           0 :         __m128i tmp = intl[i];
    1237           0 :         intl[i] = mm_reverse_epi16(intr[i]);
    1238           0 :         intr[i] = mm_reverse_epi16(tmp);
    1239           0 :         tmp = inbl[i];
    1240           0 :         inbl[i] = mm_reverse_epi16(inbr[i]);
    1241           0 :         inbr[i] = mm_reverse_epi16(tmp);
    1242             :       }
    1243           0 :       FLIPUD_PTR(dest, stride, 32);
    1244           0 :       break;
    1245             : #endif
    1246           0 :     default: assert(0); break;
    1247             :   }
    1248           0 :   write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
    1249           0 : }
    1250             : 
                       // Rounds the four input arrays (16 vectors each) and hands every vector to
                       // RECON_AND_STORE with its destination address: for row i, in0[i], in1[i],
                       // in2[i] and in3[i] go to byte offsets 0, 8, 16 and 24 of that row. Rows of
                       // 'dest' are 'stride' bytes apart. The rounded values are written back
                       // into the input arrays.
                       // NOTE(review): 'zero' is not referenced directly in this function;
                       // presumably RECON_AND_STORE (defined outside this chunk) expands to code
                       // that uses it -- confirm before removing it.
    1251           0 : static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
    1252             :                                              __m128i *in1, __m128i *in2,
    1253             :                                              __m128i *in3, int stride) {
    1254           0 :   const __m128i zero = _mm_setzero_si128();
    1255           0 :   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    1256             :   int i;
    1257             : 
    1258           0 :   for (i = 0; i < 16; ++i) {
                           // Rounding: signed saturating add of 1 << 5, then arithmetic >> 6.
    1259           0 :     in0[i] = _mm_adds_epi16(in0[i], final_rounding);
    1260           0 :     in1[i] = _mm_adds_epi16(in1[i], final_rounding);
    1261           0 :     in2[i] = _mm_adds_epi16(in2[i], final_rounding);
    1262           0 :     in3[i] = _mm_adds_epi16(in3[i], final_rounding);
    1263           0 :     in0[i] = _mm_srai_epi16(in0[i], 6);
    1264           0 :     in1[i] = _mm_srai_epi16(in1[i], 6);
    1265           0 :     in2[i] = _mm_srai_epi16(in2[i], 6);
    1266           0 :     in3[i] = _mm_srai_epi16(in3[i], 6);
    1267           0 :     RECON_AND_STORE(dest + i * stride + 0, in0[i]);
    1268           0 :     RECON_AND_STORE(dest + i * stride + 8, in1[i]);
    1269           0 :     RECON_AND_STORE(dest + i * stride + 16, in2[i]);
    1270           0 :     RECON_AND_STORE(dest + i * stride + 24, in3[i]);
    1271             :   }
    1272           0 : }
    1273             : 
    1274           0 : void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
    1275             :                                int stride, int tx_type) {
    1276             :   __m128i in0[16], in1[16], in2[16], in3[16];
    1277             :   int i;
    1278             : 
    1279           0 :   for (i = 0; i < 16; ++i) {
    1280           0 :     in0[i] = load_input_data(input + i * 32 + 0);
    1281           0 :     in1[i] = load_input_data(input + i * 32 + 8);
    1282           0 :     in2[i] = load_input_data(input + i * 32 + 16);
    1283           0 :     in3[i] = load_input_data(input + i * 32 + 24);
    1284             :   }
    1285             : 
    1286             :   // Row transform
    1287           0 :   switch (tx_type) {
    1288             :     case DCT_DCT:
    1289             :     case ADST_DCT:
    1290             : #if CONFIG_EXT_TX
    1291             :     case FLIPADST_DCT:
    1292             :     case H_DCT:
    1293             : #endif
    1294           0 :       idct32_16col(in0, in1, in2, in3);
    1295           0 :       break;
    1296             :     case DCT_ADST:
    1297             :     case ADST_ADST:
    1298             : #if CONFIG_EXT_TX
    1299             :     case DCT_FLIPADST:
    1300             :     case FLIPADST_FLIPADST:
    1301             :     case ADST_FLIPADST:
    1302             :     case FLIPADST_ADST:
    1303             :     case H_ADST:
    1304             :     case H_FLIPADST:
    1305             : #endif
    1306           0 :       ihalfright32_16col(in0, in1, in2, in3);
    1307           0 :       break;
    1308             : #if CONFIG_EXT_TX
    1309             :     case V_FLIPADST:
    1310             :     case V_ADST:
    1311             :     case V_DCT:
    1312           0 :     case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
    1313             : #endif
    1314           0 :     default: assert(0); break;
    1315             :   }
    1316             : 
    1317           0 :   scale_sqrt2_8x16(in0);
    1318           0 :   scale_sqrt2_8x16(in1);
    1319           0 :   scale_sqrt2_8x16(in2);
    1320           0 :   scale_sqrt2_8x16(in3);
    1321             : 
    1322             :   // Column transform
    1323           0 :   switch (tx_type) {
    1324             :     case DCT_DCT:
    1325             :     case DCT_ADST:
    1326             : #if CONFIG_EXT_TX
    1327             :     case DCT_FLIPADST:
    1328             :     case V_DCT:
    1329             : #endif
    1330           0 :       aom_idct16_sse2(in0, in1);
    1331           0 :       aom_idct16_sse2(in2, in3);
    1332           0 :       break;
    1333             :     case ADST_DCT:
    1334             :     case ADST_ADST:
    1335             : #if CONFIG_EXT_TX
    1336             :     case FLIPADST_ADST:
    1337             :     case ADST_FLIPADST:
    1338             :     case FLIPADST_FLIPADST:
    1339             :     case FLIPADST_DCT:
    1340             :     case V_ADST:
    1341             :     case V_FLIPADST:
    1342             : #endif
    1343           0 :       aom_iadst16_sse2(in0, in1);
    1344           0 :       aom_iadst16_sse2(in2, in3);
    1345           0 :       break;
    1346             : #if CONFIG_EXT_TX
    1347             :     case H_DCT:
    1348             :     case H_ADST:
    1349             :     case H_FLIPADST:
    1350             :     case IDTX:
    1351           0 :       iidtx16_sse2(in0, in1);
    1352           0 :       iidtx16_sse2(in2, in3);
    1353           0 :       break;
    1354             : #endif
    1355           0 :     default: assert(0); break;
    1356             :   }
    1357             : 
    1358           0 :   switch (tx_type) {
    1359             :     case DCT_DCT:
    1360             :     case ADST_DCT:
    1361             :     case DCT_ADST:
    1362             :     case ADST_ADST:
    1363             : #if CONFIG_EXT_TX
    1364             :     case H_DCT:
    1365             :     case H_ADST:
    1366             :     case V_ADST:
    1367             :     case V_DCT:
    1368             :     case IDTX:
    1369             : #endif
    1370           0 :       break;
    1371             : #if CONFIG_EXT_TX
    1372             :     case FLIPADST_DCT:
    1373             :     case FLIPADST_ADST:
    1374           0 :     case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
    1375             :     case DCT_FLIPADST:
    1376             :     case ADST_FLIPADST:
    1377             :     case H_FLIPADST:
    1378           0 :       for (i = 0; i < 16; ++i) {
    1379           0 :         __m128i tmp1 = in0[i];
    1380           0 :         __m128i tmp2 = in1[i];
    1381           0 :         in0[i] = mm_reverse_epi16(in3[i]);
    1382           0 :         in1[i] = mm_reverse_epi16(in2[i]);
    1383           0 :         in2[i] = mm_reverse_epi16(tmp2);
    1384           0 :         in3[i] = mm_reverse_epi16(tmp1);
    1385             :       }
    1386           0 :       break;
    1387             :     case FLIPADST_FLIPADST:
    1388           0 :       for (i = 0; i < 16; ++i) {
    1389           0 :         __m128i tmp1 = in0[i];
    1390           0 :         __m128i tmp2 = in1[i];
    1391           0 :         in0[i] = mm_reverse_epi16(in3[i]);
    1392           0 :         in1[i] = mm_reverse_epi16(in2[i]);
    1393           0 :         in2[i] = mm_reverse_epi16(tmp2);
    1394           0 :         in3[i] = mm_reverse_epi16(tmp1);
    1395             :       }
    1396           0 :       FLIPUD_PTR(dest, stride, 16);
    1397           0 :       break;
    1398             : #endif
    1399           0 :     default: assert(0); break;
    1400             :   }
    1401           0 :   write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
    1402           0 : }

Generated by: LCOV version 1.13