LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - txfm_common_sse2.h
Test: output.info
Date: 2017-07-14 16:53:18
Coverage: Lines: 0 / 239 (0.0 %)    Functions: 0 / 5 (0.0 %)
Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_
      13             : #define AOM_DSP_X86_TXFM_COMMON_SSE2_H_
      14             : 
      15             : #include <emmintrin.h>
      16             : #include "aom/aom_integer.h"
      17             : #include "aom_dsp/x86/synonyms.h"
      18             : 
      19             : #define pair_set_epi16(a, b)                                            \
      20             :   _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
      21             :                 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
      22             : 
      23             : #define dual_set_epi16(a, b)                                            \
      24             :   _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
      25             :                 (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
      26             : 
      27             : #define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
      28             :   _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
      29             :                  (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
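// Lane-order summary for the three helpers above (a sketch for reference;
// assumes only <emmintrin.h> semantics): _mm_set_epi16 takes its arguments
// from the highest lane down, while _mm_setr_epi16 takes them from the
// lowest lane up, so counting from lane 0 (the lowest 16 bits):
//
//   pair_set_epi16(a, b)         -> { a, b, a, b, a, b, a, b }
//   dual_set_epi16(a, b)         -> { a, a, a, a, b, b, b, b }
//   octa_set_epi16(a, b, ..., h) -> { a, b, c, d, e, f, g, h }
//
// For example, the butterfly constants fed to _mm_madd_epi16 by the DCT
// code in this directory are typically built as
//   const __m128i k = pair_set_epi16(cospi_16_64, -cospi_16_64);
// so that each 32-bit madd result is a * cospi - b * cospi for an
// interleaved (a, b) input pair.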
      30             : 
      31             : // Reverse the 8 16-bit words in a __m128i.
      32           0 : static INLINE __m128i mm_reverse_epi16(const __m128i x) {
      33           0 :   const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
      34           0 :   const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
      35           0 :   return _mm_shuffle_epi32(b, 0x4e);
      36             : }
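// How the shuffles above achieve the reversal (a sketch for reference):
// 0x1b encodes the index pattern (3, 2, 1, 0), so the shufflelo/shufflehi
// pair reverses the four 16-bit words inside each 64-bit half, and 0x4e
// encodes (2, 3, 0, 1) in 32-bit lanes, which swaps the two halves.
// For example:
//
//   const __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
//   // mm_reverse_epi16(v) has the same lanes as
//   // _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0)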
      37             : 
      38             : #if CONFIG_EXT_TX
      39             : // Identity transform (both forward and inverse).
      40           0 : static INLINE void idtx16_8col(__m128i *in) {
      41           0 :   const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
      42           0 :   const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
      43           0 :   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
      44             : 
      45             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
      46             :   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
      47             :   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
      48             :   __m128i y0, y1, y2, y3, y4, y5, y6, y7;
      49             : 
      50           0 :   in[0] = _mm_slli_epi16(in[0], 1);
      51           0 :   in[1] = _mm_slli_epi16(in[1], 1);
      52           0 :   in[2] = _mm_slli_epi16(in[2], 1);
      53           0 :   in[3] = _mm_slli_epi16(in[3], 1);
      54           0 :   in[4] = _mm_slli_epi16(in[4], 1);
      55           0 :   in[5] = _mm_slli_epi16(in[5], 1);
      56           0 :   in[6] = _mm_slli_epi16(in[6], 1);
      57           0 :   in[7] = _mm_slli_epi16(in[7], 1);
      58           0 :   in[8] = _mm_slli_epi16(in[8], 1);
      59           0 :   in[9] = _mm_slli_epi16(in[9], 1);
      60           0 :   in[10] = _mm_slli_epi16(in[10], 1);
      61           0 :   in[11] = _mm_slli_epi16(in[11], 1);
      62           0 :   in[12] = _mm_slli_epi16(in[12], 1);
      63           0 :   in[13] = _mm_slli_epi16(in[13], 1);
      64           0 :   in[14] = _mm_slli_epi16(in[14], 1);
      65           0 :   in[15] = _mm_slli_epi16(in[15], 1);
      66             : 
      67           0 :   v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
      68           0 :   v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
      69           0 :   v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
      70           0 :   v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
      71           0 :   v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
      72           0 :   v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
      73           0 :   v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
      74           0 :   v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
      75             : 
      76           0 :   u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
      77           0 :   u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
      78           0 :   u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
      79           0 :   u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
      80           0 :   u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
      81           0 :   u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
      82           0 :   u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
      83           0 :   u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
      84             : 
      85           0 :   x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
      86           0 :   x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
      87           0 :   x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
      88           0 :   x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
      89           0 :   x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
      90           0 :   x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
      91           0 :   x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
      92           0 :   x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
      93             : 
      94           0 :   y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
      95           0 :   y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
      96           0 :   y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
      97           0 :   y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
      98           0 :   y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
      99           0 :   y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
     100           0 :   y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
     101           0 :   y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
     102             : 
     103           0 :   v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
     104           0 :   v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
     105           0 :   v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
     106           0 :   v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
     107           0 :   v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
     108           0 :   v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
     109           0 :   v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
     110           0 :   v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
     111             : 
     112           0 :   x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
     113           0 :   x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
     114           0 :   x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
     115           0 :   x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
     116           0 :   x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
     117           0 :   x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
     118           0 :   x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
     119           0 :   x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
     120             : 
     121           0 :   u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
     122           0 :   u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
     123           0 :   u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
     124           0 :   u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
     125           0 :   u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
     126           0 :   u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
     127           0 :   u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
     128           0 :   u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
     129             : 
     130           0 :   y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
     131           0 :   y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
     132           0 :   y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
     133           0 :   y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
     134           0 :   y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
     135           0 :   y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
     136           0 :   y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
     137           0 :   y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
     138             : 
     139           0 :   v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
     140           0 :   v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
     141           0 :   v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
     142           0 :   v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
     143           0 :   v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
     144           0 :   v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
     145           0 :   v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
     146           0 :   v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
     147             : 
     148           0 :   x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
     149           0 :   x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
     150           0 :   x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
     151           0 :   x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
     152           0 :   x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
     153           0 :   x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
     154           0 :   x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
     155           0 :   x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
     156             : 
     157           0 :   u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
     158           0 :   u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     159           0 :   u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
     160           0 :   u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     161           0 :   u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     162           0 :   u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     163           0 :   u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     164           0 :   u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
     165             : 
     166           0 :   y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
     167           0 :   y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
     168           0 :   y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
     169           0 :   y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
     170           0 :   y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
     171           0 :   y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
     172           0 :   y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
     173           0 :   y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
     174             : 
     175           0 :   v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
     176           0 :   v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     177           0 :   v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
     178           0 :   v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
     179           0 :   v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     180           0 :   v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     181           0 :   v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     182           0 :   v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     183             : 
     184           0 :   x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
     185           0 :   x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
     186           0 :   x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
     187           0 :   x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
     188           0 :   x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
     189           0 :   x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
     190           0 :   x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
     191           0 :   x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
     192             : 
     193           0 :   u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
     194           0 :   u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
     195           0 :   u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
     196           0 :   u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
     197           0 :   u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
     198           0 :   u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
     199           0 :   u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
     200           0 :   u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
     201             : 
     202           0 :   y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
     203           0 :   y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
     204           0 :   y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
     205           0 :   y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
     206           0 :   y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
     207           0 :   y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
     208           0 :   y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
     209           0 :   y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
     210             : 
     211           0 :   in[0] = _mm_packs_epi32(v0, x0);
     212           0 :   in[1] = _mm_packs_epi32(v1, x1);
     213           0 :   in[2] = _mm_packs_epi32(v2, x2);
     214           0 :   in[3] = _mm_packs_epi32(v3, x3);
     215           0 :   in[4] = _mm_packs_epi32(v4, x4);
     216           0 :   in[5] = _mm_packs_epi32(v5, x5);
     217           0 :   in[6] = _mm_packs_epi32(v6, x6);
     218           0 :   in[7] = _mm_packs_epi32(v7, x7);
     219             : 
     220           0 :   in[8] = _mm_packs_epi32(u0, y0);
     221           0 :   in[9] = _mm_packs_epi32(u1, y1);
     222           0 :   in[10] = _mm_packs_epi32(u2, y2);
     223           0 :   in[11] = _mm_packs_epi32(u3, y3);
     224           0 :   in[12] = _mm_packs_epi32(u4, y4);
     225           0 :   in[13] = _mm_packs_epi32(u5, y5);
     226           0 :   in[14] = _mm_packs_epi32(u6, y6);
     227           0 :   in[15] = _mm_packs_epi32(u7, y7);
     228           0 : }
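// What idtx16_8col() computes per 16-bit coefficient (a scalar sketch;
// assumes DCT_CONST_BITS == 14 and DCT_CONST_ROUNDING == 1 << 13 as in
// aom_dsp/txfm_common.h, with Sqrt2 the fixed-point sqrt(2) constant
// defined elsewhere in the tree):
//
//   int32_t t = (int32_t)(int16_t)(2 * x) * Sqrt2;   // <<1, then madd
//   t = (t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;  // round and shift
//   if (t > INT16_MAX) t = INT16_MAX;                // _mm_packs_epi32
//   if (t < INT16_MIN) t = INT16_MIN;                //   saturates
//   out = (int16_t)t;
//
// The unpack-with-zero steps widen each 16-bit word into its own 32-bit
// lane, so _mm_madd_epi16 against the all-Sqrt2 vector yields the single
// product x * Sqrt2 rather than a sum of two products.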
     229             : #endif  // CONFIG_EXT_TX
     230             : 
     231           0 : static INLINE void scale_sqrt2_8x4(__m128i *in) {
     232             :   // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
     233             :   // consecutive elements.
     234           0 :   const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
     235             : 
     236           0 :   const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
     237           0 :   const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
     238           0 :   const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
     239           0 :   const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
     240           0 :   const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
     241           0 :   const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
     242           0 :   const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
     243           0 :   const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
     244             : 
     245           0 :   const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
     246           0 :   const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
     247           0 :   const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
     248           0 :   const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
     249           0 :   const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
     250           0 :   const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
     251           0 :   const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
     252           0 :   const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
     253             : 
     254           0 :   in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
     255             :                           xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
     256           0 :   in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
     257             :                           xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
     258           0 :   in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
     259             :                           xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
     260           0 :   in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
     261             :                           xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
     262           0 : }
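// Per-element sketch of the scaling above (assumes DCT_CONST_BITS == 14;
// Sqrt2 is the same fixed-point sqrt(2) constant as in idtx16_8col):
//
//   int32_t prod = (int32_t)x * (int16_t)Sqrt2;
//   int32_t r    = (prod + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
//   // then r is packed back to int16_t with saturation (_mm_packs_epi32)
//
// SSE2 has no widening 16x16 -> 32-bit multiply, so the code reconstructs
// the full signed products from _mm_mullo_epi16 (low halves) and
// _mm_mulhi_epi16 (high halves) using _mm_unpacklo/hi_epi16, rounds in
// 32 bits via xx_roundn_epi32_unsigned(), and packs back to 16 bits.
// scale_sqrt2_8x8() and scale_sqrt2_8x16() below apply the same operation
// to 8 and 16 rows of 8 coefficients, respectively.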
     263             : 
     264           0 : static INLINE void scale_sqrt2_8x8(__m128i *in) {
     265             :   // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
     266             :   // for each element.
     267           0 :   const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
     268             : 
     269           0 :   const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
     270           0 :   const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
     271           0 :   const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
     272           0 :   const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
     273           0 :   const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
     274           0 :   const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
     275           0 :   const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
     276           0 :   const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
     277           0 :   const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
     278           0 :   const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
     279           0 :   const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
     280           0 :   const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
     281           0 :   const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
     282           0 :   const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
     283           0 :   const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
     284           0 :   const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
     285             : 
     286           0 :   const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
     287           0 :   const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
     288           0 :   const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
     289           0 :   const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
     290           0 :   const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
     291           0 :   const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
     292           0 :   const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
     293           0 :   const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
     294           0 :   const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
     295           0 :   const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
     296           0 :   const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
     297           0 :   const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
     298           0 :   const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
     299           0 :   const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
     300           0 :   const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
     301           0 :   const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
     302             : 
     303           0 :   in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
     304             :                           xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
     305           0 :   in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
     306             :                           xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
     307           0 :   in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
     308             :                           xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
     309           0 :   in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
     310             :                           xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
     311           0 :   in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
     312             :                           xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
     313           0 :   in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
     314             :                           xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
     315           0 :   in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
     316             :                           xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
     317           0 :   in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
     318             :                           xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
     319           0 : }
     320             : 
     321           0 : static INLINE void scale_sqrt2_8x16(__m128i *in) {
     322           0 :   scale_sqrt2_8x8(in);
     323           0 :   scale_sqrt2_8x8(in + 8);
     324           0 : }
     325             : 
     326             : #endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_

Generated by: LCOV version 1.13