LCOV - code coverage report
Current view: top level - third_party/aom/av1/common/x86 - av1_convolve_ssse3.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 596 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 33 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <tmmintrin.h>
      14             : 
      15             : #include "./aom_config.h"
      16             : #include "./av1_rtcd.h"
      17             : #include "av1/common/filter.h"
      18             : 
      19             : #define WIDTH_BOUND (16)
      20             : #define HEIGHT_BOUND (16)
      21             : 
      22             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      23             : DECLARE_ALIGNED(16, static int8_t,
      24             :                 sub_pel_filters_12sharp_signal_dir[15][2][16]);
      25             : 
      26             : DECLARE_ALIGNED(16, static int8_t,
      27             :                 sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
      28             : #endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      29             : 
      30             : #if USE_TEMPORALFILTER_12TAP
      31             : DECLARE_ALIGNED(16, static int8_t,
      32             :                 sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
      33             : 
      34             : DECLARE_ALIGNED(16, static int8_t,
      35             :                 sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
      36             : #endif
      37             : 
      38             : typedef int8_t (*SubpelFilterCoeffs)[16];
      39             : 
      40             : static INLINE SubpelFilterCoeffs
      41           0 : get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
      42             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      43             :   if (p.interp_filter == MULTITAP_SHARP) {
      44             :     return &sub_pel_filters_12sharp_signal_dir[index][0];
      45             :   }
      46             : #endif
      47             : #if USE_TEMPORALFILTER_12TAP
      48           0 :   if (p.interp_filter == TEMPORALFILTER_12TAP) {
      49           0 :     return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
      50             :   }
      51             : #endif
      52             :   (void)p;
      53             :   (void)index;
      54           0 :   return NULL;
      55             : }
      56             : 
      57             : static INLINE SubpelFilterCoeffs
      58           0 : get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
      59             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      60             :   if (p.interp_filter == MULTITAP_SHARP) {
      61             :     return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
      62             :   }
      63             : #endif
      64             : #if USE_TEMPORALFILTER_12TAP
      65           0 :   if (p.interp_filter == TEMPORALFILTER_12TAP) {
      66           0 :     return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
      67             :   }
      68             : #endif
      69             :   (void)p;
      70             :   (void)index;
      71           0 :   return NULL;
      72             : }
      73             : 
      74           0 : static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
      75             :   __m128i t0, t1;
      76             : 
      77           0 :   t0 = _mm_unpacklo_epi16(in[0], in[1]);
      78           0 :   t1 = _mm_unpacklo_epi16(in[2], in[3]);
      79             : 
      80           0 :   out[0] = _mm_unpacklo_epi32(t0, t1);
      81           0 :   out[1] = _mm_srli_si128(out[0], 8);
      82           0 :   out[2] = _mm_unpackhi_epi32(t0, t1);
      83           0 :   out[3] = _mm_srli_si128(out[2], 8);
      84             : 
      85           0 :   t0 = _mm_unpackhi_epi16(in[0], in[1]);
      86           0 :   t1 = _mm_unpackhi_epi16(in[2], in[3]);
      87             : 
      88           0 :   out[4] = _mm_unpacklo_epi32(t0, t1);
      89           0 :   out[5] = _mm_srli_si128(out[4], 8);
      90             :   // Note: We ignore out[6] and out[7] because
      91             :   // they're zero vectors.
      92           0 : }
      93             : 
      94             : typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
      95             : 
      96           0 : static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
      97           0 :   const __m128i zero = _mm_setzero_si128();
      98           0 :   const __m128i one = _mm_set1_epi16(1);
      99           0 :   __m128i y = _mm_loadl_epi64((__m128i const *)src);
     100           0 :   y = _mm_unpacklo_epi8(y, zero);
     101           0 :   y = _mm_add_epi16(*x, y);
     102           0 :   y = _mm_add_epi16(y, one);
     103           0 :   y = _mm_srai_epi16(y, 1);
     104           0 :   y = _mm_packus_epi16(y, y);
     105           0 :   return y;
     106             : }
     107             : 
     108           0 : static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
     109             :   uint32_t temp;
     110           0 :   __m128i u = _mm_packus_epi16(*x, *x);
     111           0 :   temp = _mm_cvtsi128_si32(u);
     112           0 :   *(uint16_t *)dst = (uint16_t)temp;
     113           0 : }
     114             : 
     115           0 : static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
     116             :   uint32_t temp;
     117           0 :   __m128i y = accumulate_store(x, dst);
     118           0 :   temp = _mm_cvtsi128_si32(y);
     119           0 :   *(uint16_t *)dst = (uint16_t)temp;
     120           0 : }
     121             : 
     122             : static store_pixel_t store2pixelTab[2] = { store_2_pixel_only,
     123             :                                            accumulate_store_2_pixel };
     124             : 
     125           0 : static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
     126           0 :   __m128i u = _mm_packus_epi16(*x, *x);
     127           0 :   *(int *)dst = _mm_cvtsi128_si32(u);
     128           0 : }
     129             : 
     130           0 : static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
     131           0 :   __m128i y = accumulate_store(x, dst);
     132           0 :   *(int *)dst = _mm_cvtsi128_si32(y);
     133           0 : }
     134             : 
     135             : static store_pixel_t store4pixelTab[2] = { store_4_pixel_only,
     136             :                                            accumulate_store_4_pixel };
     137             : 
     138           0 : static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
     139             :                            store_pixel_t store_func, uint8_t *dst) {
     140             :   __m128i sumPairRow[4];
     141             :   __m128i sumPairCol[8];
     142             :   __m128i pixel;
     143           0 :   const __m128i k_256 = _mm_set1_epi16(1 << 8);
     144           0 :   const __m128i zero = _mm_setzero_si128();
     145             : 
     146           0 :   assert(tapsNum == 10 || tapsNum == 12);
     147           0 :   if (10 == tapsNum) {
     148           0 :     src -= 1;
     149             :   }
     150             : 
     151           0 :   pixel = _mm_loadu_si128((__m128i const *)src);
     152           0 :   sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
     153           0 :   sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
     154           0 :   sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);
     155             : 
     156           0 :   pixel = _mm_loadu_si128((__m128i const *)(src + 1));
     157           0 :   sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
     158           0 :   sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
     159           0 :   sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);
     160             : 
     161           0 :   transpose_4x8(sumPairRow, sumPairCol);
     162             : 
     163           0 :   sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
     164           0 :   sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);
     165             : 
     166           0 :   sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
     167           0 :   sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);
     168             : 
     169           0 :   sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
     170           0 :   sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
     171           0 :   sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
     172             : 
     173           0 :   sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
     174           0 :   sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
     175           0 :   sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
     176             : 
     177           0 :   store_func(&sumPairRow[1], dst);
     178           0 : }
     179             : 
     180           0 : static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
     181             :                            store_pixel_t store, uint8_t *buf) {
     182           0 :   horiz_w4_ssse3(src, f, tapsNum, store, buf);
     183           0 :   src += 4;
     184           0 :   buf += 4;
     185           0 :   horiz_w4_ssse3(src, f, tapsNum, store, buf);
     186           0 : }
     187             : 
     188           0 : static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
     189             :                             store_pixel_t store, uint8_t *buf) {
     190           0 :   horiz_w8_ssse3(src, f, tapsNum, store, buf);
     191           0 :   src += 8;
     192           0 :   buf += 8;
     193           0 :   horiz_w8_ssse3(src, f, tapsNum, store, buf);
     194           0 : }
     195             : 
     196           0 : static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
     197             :                             store_pixel_t store, uint8_t *buf) {
     198           0 :   horiz_w16_ssse3(src, f, tapsNum, store, buf);
     199           0 :   src += 16;
     200           0 :   buf += 16;
     201           0 :   horiz_w16_ssse3(src, f, tapsNum, store, buf);
     202           0 : }
     203             : 
     204           0 : static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
     205             :                             store_pixel_t store, uint8_t *buf) {
     206           0 :   horiz_w32_ssse3(src, f, tapsNum, store, buf);
     207           0 :   src += 32;
     208           0 :   buf += 32;
     209           0 :   horiz_w32_ssse3(src, f, tapsNum, store, buf);
     210           0 : }
     211             : 
     212           0 : static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
     213             :                              store_pixel_t store, uint8_t *buf) {
     214           0 :   horiz_w64_ssse3(src, f, tapsNum, store, buf);
     215           0 :   src += 64;
     216           0 :   buf += 64;
     217           0 :   horiz_w64_ssse3(src, f, tapsNum, store, buf);
     218           0 : }
     219             : 
     220             : static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t,
     221             :                            uint8_t *) = {
     222             :   horiz_w4_ssse3,  horiz_w8_ssse3,  horiz_w16_ssse3,
     223             :   horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3,
     224             : };
     225             : 
     226           0 : static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
     227             :                                int width, store_pixel_t store, uint8_t *dst) {
     228           0 :   switch (width) {
     229             :     // Note:
     230             :     // For width=2 and 4, store function must be different
     231             :     case 2:
     232           0 :     case 4: horizTab[0](src, f, tapsNum, store, dst); break;
     233           0 :     case 8: horizTab[1](src, f, tapsNum, store, dst); break;
     234           0 :     case 16: horizTab[2](src, f, tapsNum, store, dst); break;
     235           0 :     case 32: horizTab[3](src, f, tapsNum, store, dst); break;
     236           0 :     case 64: horizTab[4](src, f, tapsNum, store, dst); break;
     237           0 :     case 128: horizTab[5](src, f, tapsNum, store, dst); break;
     238           0 :     default: assert(0);
     239             :   }
     240           0 : }
     241             : 
     242             : // Vertical 8-pixel parallel
     243             : typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
     244             :                                    uint8_t *dst, int dst_stride);
     245             : 
     246           0 : static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
     247             :                                               int src_stride, uint8_t *dst,
     248             :                                               int dst_stride) {
     249           0 :   const __m128i k_256 = _mm_set1_epi16(1 << 8);
     250             :   __m128i v0, v1, v2, v3;
     251             : 
     252           0 :   __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
     253           0 :   __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
     254           0 :   __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
     255           0 :   __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
     256           0 :   __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
     257           0 :   __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
     258           0 :   __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
     259           0 :   __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
     260             : 
     261           0 :   u0 = _mm_mulhrs_epi16(u0, k_256);
     262           0 :   u1 = _mm_mulhrs_epi16(u1, k_256);
     263           0 :   u2 = _mm_mulhrs_epi16(u2, k_256);
     264           0 :   u3 = _mm_mulhrs_epi16(u3, k_256);
     265           0 :   u4 = _mm_mulhrs_epi16(u4, k_256);
     266           0 :   u5 = _mm_mulhrs_epi16(u5, k_256);
     267           0 :   u6 = _mm_mulhrs_epi16(u6, k_256);
     268           0 :   u7 = _mm_mulhrs_epi16(u7, k_256);
     269             : 
     270           0 :   v0 = _mm_packus_epi16(u0, u1);
     271           0 :   v1 = _mm_packus_epi16(u2, u3);
     272           0 :   v2 = _mm_packus_epi16(u4, u5);
     273           0 :   v3 = _mm_packus_epi16(u6, u7);
     274             : 
     275           0 :   u0 = _mm_unpacklo_epi8(v0, v1);
     276           0 :   u1 = _mm_unpackhi_epi8(v0, v1);
     277           0 :   u2 = _mm_unpacklo_epi8(v2, v3);
     278           0 :   u3 = _mm_unpackhi_epi8(v2, v3);
     279             : 
     280           0 :   u4 = _mm_unpacklo_epi8(u0, u1);
     281           0 :   u5 = _mm_unpacklo_epi8(u2, u3);
     282           0 :   u6 = _mm_unpackhi_epi8(u0, u1);
     283           0 :   u7 = _mm_unpackhi_epi8(u2, u3);
     284             : 
     285           0 :   u0 = _mm_unpacklo_epi32(u4, u5);
     286           0 :   u1 = _mm_unpackhi_epi32(u4, u5);
     287           0 :   u2 = _mm_unpacklo_epi32(u6, u7);
     288           0 :   u3 = _mm_unpackhi_epi32(u6, u7);
     289             : 
     290           0 :   u4 = _mm_srli_si128(u0, 8);
     291           0 :   u5 = _mm_srli_si128(u1, 8);
     292           0 :   u6 = _mm_srli_si128(u2, 8);
     293           0 :   u7 = _mm_srli_si128(u3, 8);
     294             : 
     295             :   _mm_storel_epi64((__m128i *)dst, u0);
     296           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
     297           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
     298           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
     299           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
     300           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
     301           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
     302           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
     303           0 : }
     304             : 
     305           0 : static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
     306             :                                               int src_stride, uint8_t *dst,
     307             :                                               int dst_stride) {
     308           0 :   const __m128i k_256 = _mm_set1_epi16(1 << 8);
     309           0 :   const __m128i zero = _mm_setzero_si128();
     310           0 :   const __m128i one = _mm_set1_epi16(1);
     311             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
     312             : 
     313           0 :   __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
     314           0 :   __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
     315           0 :   __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
     316           0 :   __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
     317           0 :   __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
     318           0 :   __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
     319           0 :   __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
     320           0 :   __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
     321             : 
     322           0 :   u0 = _mm_mulhrs_epi16(u0, k_256);
     323           0 :   u1 = _mm_mulhrs_epi16(u1, k_256);
     324           0 :   u2 = _mm_mulhrs_epi16(u2, k_256);
     325           0 :   u3 = _mm_mulhrs_epi16(u3, k_256);
     326           0 :   u4 = _mm_mulhrs_epi16(u4, k_256);
     327           0 :   u5 = _mm_mulhrs_epi16(u5, k_256);
     328           0 :   u6 = _mm_mulhrs_epi16(u6, k_256);
     329           0 :   u7 = _mm_mulhrs_epi16(u7, k_256);
     330             : 
     331           0 :   v0 = _mm_packus_epi16(u0, u1);
     332           0 :   v1 = _mm_packus_epi16(u2, u3);
     333           0 :   v2 = _mm_packus_epi16(u4, u5);
     334           0 :   v3 = _mm_packus_epi16(u6, u7);
     335             : 
     336           0 :   u0 = _mm_unpacklo_epi8(v0, v1);
     337           0 :   u1 = _mm_unpackhi_epi8(v0, v1);
     338           0 :   u2 = _mm_unpacklo_epi8(v2, v3);
     339           0 :   u3 = _mm_unpackhi_epi8(v2, v3);
     340             : 
     341           0 :   u4 = _mm_unpacklo_epi8(u0, u1);
     342           0 :   u5 = _mm_unpacklo_epi8(u2, u3);
     343           0 :   u6 = _mm_unpackhi_epi8(u0, u1);
     344           0 :   u7 = _mm_unpackhi_epi8(u2, u3);
     345             : 
     346           0 :   u0 = _mm_unpacklo_epi32(u4, u5);
     347           0 :   u1 = _mm_unpackhi_epi32(u4, u5);
     348           0 :   u2 = _mm_unpacklo_epi32(u6, u7);
     349           0 :   u3 = _mm_unpackhi_epi32(u6, u7);
     350             : 
     351           0 :   u4 = _mm_srli_si128(u0, 8);
     352           0 :   u5 = _mm_srli_si128(u1, 8);
     353           0 :   u6 = _mm_srli_si128(u2, 8);
     354           0 :   u7 = _mm_srli_si128(u3, 8);
     355             : 
     356           0 :   v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
     357           0 :   v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
     358           0 :   v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
     359           0 :   v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
     360           0 :   v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
     361           0 :   v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
     362           0 :   v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
     363           0 :   v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
     364             : 
     365           0 :   u0 = _mm_unpacklo_epi8(u0, zero);
     366           0 :   u1 = _mm_unpacklo_epi8(u1, zero);
     367           0 :   u2 = _mm_unpacklo_epi8(u2, zero);
     368           0 :   u3 = _mm_unpacklo_epi8(u3, zero);
     369           0 :   u4 = _mm_unpacklo_epi8(u4, zero);
     370           0 :   u5 = _mm_unpacklo_epi8(u5, zero);
     371           0 :   u6 = _mm_unpacklo_epi8(u6, zero);
     372           0 :   u7 = _mm_unpacklo_epi8(u7, zero);
     373             : 
     374           0 :   v0 = _mm_unpacklo_epi8(v0, zero);
     375           0 :   v1 = _mm_unpacklo_epi8(v1, zero);
     376           0 :   v2 = _mm_unpacklo_epi8(v2, zero);
     377           0 :   v3 = _mm_unpacklo_epi8(v3, zero);
     378           0 :   v4 = _mm_unpacklo_epi8(v4, zero);
     379           0 :   v5 = _mm_unpacklo_epi8(v5, zero);
     380           0 :   v6 = _mm_unpacklo_epi8(v6, zero);
     381           0 :   v7 = _mm_unpacklo_epi8(v7, zero);
     382             : 
     383           0 :   v0 = _mm_adds_epi16(u0, v0);
     384           0 :   v1 = _mm_adds_epi16(u4, v1);
     385           0 :   v2 = _mm_adds_epi16(u1, v2);
     386           0 :   v3 = _mm_adds_epi16(u5, v3);
     387           0 :   v4 = _mm_adds_epi16(u2, v4);
     388           0 :   v5 = _mm_adds_epi16(u6, v5);
     389           0 :   v6 = _mm_adds_epi16(u3, v6);
     390           0 :   v7 = _mm_adds_epi16(u7, v7);
     391             : 
     392           0 :   v0 = _mm_adds_epi16(v0, one);
     393           0 :   v1 = _mm_adds_epi16(v1, one);
     394           0 :   v2 = _mm_adds_epi16(v2, one);
     395           0 :   v3 = _mm_adds_epi16(v3, one);
     396           0 :   v4 = _mm_adds_epi16(v4, one);
     397           0 :   v5 = _mm_adds_epi16(v5, one);
     398           0 :   v6 = _mm_adds_epi16(v6, one);
     399           0 :   v7 = _mm_adds_epi16(v7, one);
     400             : 
     401           0 :   v0 = _mm_srai_epi16(v0, 1);
     402           0 :   v1 = _mm_srai_epi16(v1, 1);
     403           0 :   v2 = _mm_srai_epi16(v2, 1);
     404           0 :   v3 = _mm_srai_epi16(v3, 1);
     405           0 :   v4 = _mm_srai_epi16(v4, 1);
     406           0 :   v5 = _mm_srai_epi16(v5, 1);
     407           0 :   v6 = _mm_srai_epi16(v6, 1);
     408           0 :   v7 = _mm_srai_epi16(v7, 1);
     409             : 
     410           0 :   u0 = _mm_packus_epi16(v0, v1);
     411           0 :   u1 = _mm_packus_epi16(v2, v3);
     412           0 :   u2 = _mm_packus_epi16(v4, v5);
     413           0 :   u3 = _mm_packus_epi16(v6, v7);
     414             : 
     415           0 :   u4 = _mm_srli_si128(u0, 8);
     416           0 :   u5 = _mm_srli_si128(u1, 8);
     417           0 :   u6 = _mm_srli_si128(u2, 8);
     418           0 :   u7 = _mm_srli_si128(u3, 8);
     419             : 
     420             :   _mm_storel_epi64((__m128i *)dst, u0);
     421           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
     422           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
     423           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
     424           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
     425           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
     426           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
     427           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
     428           0 : }
     429             : 
     430             : static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst,
     431             :                                              transpose8x8_accumu_to_dst };
     432             : 
     433           0 : static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
     434             :   __m128i t0, t1, t2, t3, u0, u1;
     435             : 
     436           0 :   t0 = _mm_unpacklo_epi16(in[0], in[1]);
     437           0 :   t1 = _mm_unpacklo_epi16(in[2], in[3]);
     438           0 :   t2 = _mm_unpacklo_epi16(in[4], in[5]);
     439           0 :   t3 = _mm_unpacklo_epi16(in[6], in[7]);
     440             : 
     441           0 :   u0 = _mm_unpacklo_epi32(t0, t1);
     442           0 :   u1 = _mm_unpacklo_epi32(t2, t3);
     443             : 
     444           0 :   out[0] = _mm_unpacklo_epi64(u0, u1);
     445           0 :   out[1] = _mm_unpackhi_epi64(u0, u1);
     446             : 
     447           0 :   u0 = _mm_unpackhi_epi32(t0, t1);
     448           0 :   u1 = _mm_unpackhi_epi32(t2, t3);
     449             : 
     450           0 :   out[2] = _mm_unpacklo_epi64(u0, u1);
     451           0 :   out[3] = _mm_unpackhi_epi64(u0, u1);
     452             : 
     453           0 :   t0 = _mm_unpackhi_epi16(in[0], in[1]);
     454           0 :   t1 = _mm_unpackhi_epi16(in[2], in[3]);
     455           0 :   t2 = _mm_unpackhi_epi16(in[4], in[5]);
     456           0 :   t3 = _mm_unpackhi_epi16(in[6], in[7]);
     457             : 
     458           0 :   u0 = _mm_unpacklo_epi32(t0, t1);
     459           0 :   u1 = _mm_unpacklo_epi32(t2, t3);
     460             : 
     461           0 :   out[4] = _mm_unpacklo_epi64(u0, u1);
     462           0 :   out[5] = _mm_unpackhi_epi64(u0, u1);
     463             : 
     464             :   // Ignore out[6] and out[7]
     465             :   // they're zero vectors.
     466           0 : }
     467             : 
     468           0 : static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     469             :                                    __m128i *f, int tapsNum, uint16_t *buf) {
     470             :   __m128i s[8], t[6];
     471             :   __m128i min_x2x3, max_x2x3;
     472             :   __m128i temp;
     473             : 
     474           0 :   assert(tapsNum == 10 || tapsNum == 12);
     475           0 :   if (tapsNum == 10) {
     476           0 :     src_ptr -= 1;
     477             :   }
     478           0 :   s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
     479           0 :   s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
     480           0 :   s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
     481           0 :   s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
     482           0 :   s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
     483           0 :   s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
     484           0 :   s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
     485           0 :   s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
     486             : 
     487             :   // TRANSPOSE...
     488             :   // Vecotor represents column pixel pairs instead of a row
     489           0 :   transpose_8x16(s, t);
     490             : 
     491             :   // multiply 2 adjacent elements with the filter and add the result
     492           0 :   s[0] = _mm_maddubs_epi16(t[0], f[0]);
     493           0 :   s[1] = _mm_maddubs_epi16(t[1], f[1]);
     494           0 :   s[2] = _mm_maddubs_epi16(t[2], f[2]);
     495           0 :   s[3] = _mm_maddubs_epi16(t[3], f[3]);
     496           0 :   s[4] = _mm_maddubs_epi16(t[4], f[4]);
     497           0 :   s[5] = _mm_maddubs_epi16(t[5], f[5]);
     498             : 
     499             :   // add and saturate the results together
     500           0 :   min_x2x3 = _mm_min_epi16(s[2], s[3]);
     501           0 :   max_x2x3 = _mm_max_epi16(s[2], s[3]);
     502           0 :   temp = _mm_adds_epi16(s[0], s[1]);
     503           0 :   temp = _mm_adds_epi16(temp, s[5]);
     504           0 :   temp = _mm_adds_epi16(temp, s[4]);
     505             : 
     506           0 :   temp = _mm_adds_epi16(temp, min_x2x3);
     507           0 :   temp = _mm_adds_epi16(temp, max_x2x3);
     508             : 
     509             :   _mm_storeu_si128((__m128i *)buf, temp);
     510           0 : }
     511             : 
     512             : // Vertical 4-pixel parallel
     513           0 : static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
     514             :                                               int src_stride, uint8_t *dst,
     515             :                                               int dst_stride) {
     516           0 :   const __m128i k_256 = _mm_set1_epi16(1 << 8);
     517             :   __m128i v0, v1, v2, v3;
     518             : 
     519             :   // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
     520           0 :   __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
     521           0 :   __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
     522           0 :   __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
     523           0 :   __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
     524             : 
     525           0 :   v0 = _mm_unpacklo_epi16(u0, u1);
     526           0 :   v1 = _mm_unpacklo_epi16(u2, u3);
     527             : 
     528           0 :   v2 = _mm_unpacklo_epi32(v0, v1);
     529           0 :   v3 = _mm_unpackhi_epi32(v0, v1);
     530             : 
     531           0 :   u0 = _mm_mulhrs_epi16(v2, k_256);
     532           0 :   u1 = _mm_mulhrs_epi16(v3, k_256);
     533             : 
     534           0 :   u0 = _mm_packus_epi16(u0, u1);
     535           0 :   u1 = _mm_srli_si128(u0, 4);
     536           0 :   u2 = _mm_srli_si128(u0, 8);
     537           0 :   u3 = _mm_srli_si128(u0, 12);
     538             : 
     539           0 :   *(int *)(dst) = _mm_cvtsi128_si32(u0);
     540           0 :   *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
     541           0 :   *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
     542           0 :   *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
     543           0 : }
     544             : // Transpose a 4x4 block of 16-bit sums, round to pixels and average into dst.
     545           0 : static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
     546             :                                               int src_stride, uint8_t *dst,
     547             :                                               int dst_stride) {
     548           0 :   const __m128i k_256 = _mm_set1_epi16(1 << 8);
     549           0 :   const __m128i zero = _mm_setzero_si128();
     550           0 :   const __m128i one = _mm_set1_epi16(1);
     551             :   // src_stride counts uint16_t elements; dst_stride counts bytes.
     552             :   __m128i v0, v1, v2, v3;
     553             :   // Load the first four 16-bit values of each of four source rows.
     554           0 :   __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
     555           0 :   __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
     556           0 :   __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
     557           0 :   __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
     558             :   // Interleave rows 0/1 and rows 2/3 at 16-bit granularity.
     559           0 :   v0 = _mm_unpacklo_epi16(u0, u1);
     560           0 :   v1 = _mm_unpacklo_epi16(u2, u3);
     561             :   // v2 = source columns 0 and 1, v3 = source columns 2 and 3 (4 values each).
     562           0 :   v2 = _mm_unpacklo_epi32(v0, v1);
     563           0 :   v3 = _mm_unpackhi_epi32(v0, v1);
     564             :   // mulhrs by 256 computes (x + 64) >> 7 on each signed 16-bit lane.
     565           0 :   u0 = _mm_mulhrs_epi16(v2, k_256);
     566           0 :   u1 = _mm_mulhrs_epi16(v3, k_256);
     567             :   // Saturate to 8 bits, then widen back to 16 bits for the averaging below.
     568           0 :   u2 = _mm_packus_epi16(u0, u1);
     569           0 :   u0 = _mm_unpacklo_epi8(u2, zero);
     570           0 :   u1 = _mm_unpackhi_epi8(u2, zero);
     571             : 
     572             :   // Load 8 bytes from each dst row; only the first 4 pixels of each are used.
     573           0 :   v0 = _mm_loadl_epi64((__m128i const *)(dst));
     574           0 :   v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
     575           0 :   v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
     576           0 :   v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
     577             :   // Widen the dst pixels to 16 bits.
     578           0 :   v0 = _mm_unpacklo_epi8(v0, zero);
     579           0 :   v1 = _mm_unpacklo_epi8(v1, zero);
     580           0 :   v2 = _mm_unpacklo_epi8(v2, zero);
     581           0 :   v3 = _mm_unpacklo_epi8(v3, zero);
     582             :   // Pack dst rows 0/1 into v0 and rows 2/3 into v1 (4 pixels per row).
     583           0 :   v0 = _mm_unpacklo_epi64(v0, v1);
     584           0 :   v1 = _mm_unpacklo_epi64(v2, v3);
     585             :   // Rounded average with the existing pixels: (new + old + 1) >> 1.
     586           0 :   u0 = _mm_adds_epi16(u0, v0);
     587           0 :   u1 = _mm_adds_epi16(u1, v1);
     588             : 
     589           0 :   u0 = _mm_adds_epi16(u0, one);
     590           0 :   u1 = _mm_adds_epi16(u1, one);
     591             : 
     592           0 :   u0 = _mm_srai_epi16(u0, 1);
     593           0 :   u1 = _mm_srai_epi16(u1, 1);
     594             : 
     595             :   // Saturate back to bytes; each 4-byte group is one output row.
     596           0 :   u0 = _mm_packus_epi16(u0, u1);
     597           0 :   u1 = _mm_srli_si128(u0, 4);
     598           0 :   u2 = _mm_srli_si128(u0, 8);
     599           0 :   u3 = _mm_srli_si128(u0, 12);
     600             :   // NOTE(review): 4-byte stores go through an int* cast; alignment is not checked.
     601           0 :   *(int *)(dst) = _mm_cvtsi128_si32(u0);
     602           0 :   *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
     603           0 :   *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
     604           0 :   *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
     605           0 : }
     606             : // 4x4 transpose-and-store variants; indexed with conv_params->ref by the caller.
     607             : static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
     608             :                                              transpose4x4_accumu_to_dst };
     609             : // Filters one output column for 4 consecutive rows; stores 4 unrounded sums in buf.
     610           0 : static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     611             :                                    __m128i *f, int tapsNum, uint16_t *buf) {
     612             :   __m128i A, B, C, D;
     613             :   __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
     614             :   __m128i x0, x1, x2, x3, x4, x5;
     615             :   __m128i min_x2x3, max_x2x3, temp;
     616             :   // A 10-tap kernel starts one pixel earlier (presumably to match a zero-padded 12-slot layout).
     617           0 :   assert(tapsNum == 10 || tapsNum == 12);
     618           0 :   if (tapsNum == 10) {
     619           0 :     src_ptr -= 1;
     620             :   }
     621           0 :   A = _mm_loadu_si128((const __m128i *)src_ptr);
     622           0 :   B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
     623           0 :   C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
     624           0 :   D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
     625             :   // A..D each hold 16 consecutive pixels of one row (unaligned loads).
     626             :   // TRANSPOSE...
     627             :   // Vector represents column pixel pairs instead of a row
     628             :   // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
     629           0 :   tr0_0 = _mm_unpacklo_epi16(A, B);
     630             :   // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
     631           0 :   tr0_1 = _mm_unpacklo_epi16(C, D);
     632             :   // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
     633           0 :   s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
     634             :   // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
     635           0 :   s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
     636             :   // 02 03 12 13 22 23 32 33
     637           0 :   s3s2 = _mm_srli_si128(s1s0, 8);
     638             :   // 06 07 16 17 26 27 36 37
     639           0 :   s7s6 = _mm_srli_si128(s5s4, 8);
     640             :   // Same interleave on the upper 8 bytes: pixel pairs 8/9 (s9s8) and 10/11 (sbsa).
     641           0 :   tr0_0 = _mm_unpackhi_epi16(A, B);
     642           0 :   tr0_1 = _mm_unpackhi_epi16(C, D);
     643           0 :   s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
     644           0 :   sbsa = _mm_srli_si128(s9s8, 8);
     645             : 
     646             :   // multiply 2 adjacent elements with the filter and add the result
     647           0 :   x0 = _mm_maddubs_epi16(s1s0, f[0]);
     648           0 :   x1 = _mm_maddubs_epi16(s3s2, f[1]);
     649           0 :   x2 = _mm_maddubs_epi16(s5s4, f[2]);
     650           0 :   x3 = _mm_maddubs_epi16(s7s6, f[3]);
     651           0 :   x4 = _mm_maddubs_epi16(s9s8, f[4]);
     652           0 :   x5 = _mm_maddubs_epi16(sbsa, f[5]);
     653             :   // add and saturate the results together
     654           0 :   min_x2x3 = _mm_min_epi16(x2, x3);
     655           0 :   max_x2x3 = _mm_max_epi16(x2, x3);
     656           0 :   temp = _mm_adds_epi16(x0, x1);
     657           0 :   temp = _mm_adds_epi16(temp, x5);
     658           0 :   temp = _mm_adds_epi16(temp, x4);
     659             :   // The two middle terms are added after the outer ones, smaller first.
     660           0 :   temp = _mm_adds_epi16(temp, min_x2x3);
     661           0 :   temp = _mm_adds_epi16(temp, max_x2x3);
     662             :   _mm_storel_epi64((__m128i *)buf, temp);  // low 4 lanes = rows 0..3
     663           0 : }
     664             : 
     665             : // Horizontal convolution (SSSE3). Falls back to av1_convolve_horiz_c unless:
     666             : //  (1) subpel_x_q4 != 0 and x_step_q4 == 16 (the filter is fixed for the call),
     667             : //  (2) both coefficient lookups succeed (the SIMD kernels expect 10/12 taps).
     668             : // conv_params->ref indexes the store and transpose function tables.
     669             : 
     670           0 : void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
     671             :                               int dst_stride, int w, int h,
     672             :                               const InterpFilterParams filter_params,
     673             :                               const int subpel_x_q4, int x_step_q4,
     674             :                               ConvolveParams *conv_params) {
     675             :   DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);  // scratch tile of filter sums
     676             :   __m128i verf[6];
     677             :   __m128i horf[2];
     678             :   SubpelFilterCoeffs hCoeffs, vCoeffs;
     679             :   const uint8_t *src_ptr;
     680           0 :   store_pixel_t store2p = store2pixelTab[conv_params->ref];
     681           0 :   store_pixel_t store4p = store4pixelTab[conv_params->ref];
     682           0 :   transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref];
     683           0 :   transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref];
     684             : 
     685           0 :   const int tapsNum = filter_params.taps;
     686             :   int block_height, block_residu;
     687             :   int i, col, count;
     688             :   (void)x_step_q4;
     689             :   // subpel_x_q4 == 0 or x_step_q4 != 16: use the C implementation.
     690           0 :   if (0 == subpel_x_q4 || 16 != x_step_q4) {
     691           0 :     av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
     692             :                          subpel_x_q4, x_step_q4, conv_params);
     693           0 :     return;
     694             :   }
     695             :   // Look up both coefficient layouts with index subpel_x_q4 - 1.
     696           0 :   hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
     697           0 :   vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
     698             :   // A null lookup result also falls back to the C implementation.
     699           0 :   if (!hCoeffs || !vCoeffs) {
     700           0 :     av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
     701             :                          subpel_x_q4, x_step_q4, conv_params);
     702           0 :     return;
     703             :   }
     704             :   // verf: six 16-byte vectors read from vCoeffs; horf: two read from hCoeffs.
     705           0 :   verf[0] = *((const __m128i *)(vCoeffs));
     706           0 :   verf[1] = *((const __m128i *)(vCoeffs + 1));
     707           0 :   verf[2] = *((const __m128i *)(vCoeffs + 2));
     708           0 :   verf[3] = *((const __m128i *)(vCoeffs + 3));
     709           0 :   verf[4] = *((const __m128i *)(vCoeffs + 4));
     710           0 :   verf[5] = *((const __m128i *)(vCoeffs + 5));
     711             : 
     712           0 :   horf[0] = *((const __m128i *)(hCoeffs));
     713           0 :   horf[1] = *((const __m128i *)(hCoeffs + 1));
     714             : 
     715           0 :   count = 0;
     716             : 
     717             :   // Step src back by tapsNum / 2 - 1 pixels (tapsNum is the filter size).
     718           0 :   src -= (tapsNum >> 1) - 1;
     719           0 :   src_ptr = src;
     720           0 :   if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
     721             :     // 8-pixels parallel: tiles of 8 rows x 8 columns go through temp.
     722           0 :     block_height = h >> 3;
     723           0 :     block_residu = h & 7;
     724             :     // Each inner call fills temp + i * 8 for source position src_ptr, then steps one pixel.
     725             :     do {
     726           0 :       for (col = 0; col < w; col += 8) {
     727           0 :         for (i = 0; i < 8; ++i) {
     728           0 :           filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
     729             :                                  temp + (i * 8));
     730           0 :           src_ptr += 1;
     731             :         }
     732           0 :         transpose_8x8(temp, 8, dst + col, dst_stride);
     733             :       }
     734           0 :       count++;
     735           0 :       src_ptr = src + count * src_stride * 8;
     736           0 :       dst += dst_stride * 8;
     737           0 :     } while (count < block_height);
     738             :     // Remaining h & 7 rows are filtered one row at a time.
     739           0 :     for (i = 0; i < block_residu; ++i) {
     740           0 :       filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
     741           0 :       src_ptr += src_stride;
     742           0 :       dst += dst_stride;
     743             :     }
     744             :   } else {
     745           0 :     if (w > 2) {
     746             :       // 4-pixels parallel: tiles of 4 rows x 4 columns go through temp.
     747           0 :       block_height = h >> 2;
     748           0 :       block_residu = h & 3;
     749             :       // NOTE(review): do/while runs one 4-row tile even if block_height == 0 (h < 4) - confirm h >= 4 here.
     750             :       do {
     751           0 :         for (col = 0; col < w; col += 4) {
     752           0 :           for (i = 0; i < 4; ++i) {
     753           0 :             filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
     754             :                                    temp + (i * 4));
     755           0 :             src_ptr += 1;
     756             :           }
     757           0 :           transpose_4x4(temp, 4, dst + col, dst_stride);
     758             :         }
     759           0 :         count++;
     760           0 :         src_ptr = src + count * src_stride * 4;
     761           0 :         dst += dst_stride * 4;
     762           0 :       } while (count < block_height);
     763             :       // Remaining h & 3 rows are filtered one row at a time.
     764           0 :       for (i = 0; i < block_residu; ++i) {
     765           0 :         filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
     766           0 :         src_ptr += src_stride;
     767           0 :         dst += dst_stride;
     768             :       }
     769             :     } else {  // w <= 2: every row is filtered individually with the 2-pixel store
     770           0 :       for (i = 0; i < h; i++) {
     771           0 :         filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
     772           0 :         src_ptr += src_stride;
     773           0 :         dst += dst_stride;
     774             :       }
     775             :     }
     776             :   }
     777             : }
     778             : 
     779             : // Vertical convolution filtering: store helper that writes 8 saturated pixels.
     780           0 : static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
     781           0 :   __m128i u = _mm_packus_epi16(*x, *x);  // 16-bit lanes -> bytes, unsigned saturation
     782             :   _mm_storel_epi64((__m128i *)dst, u);  // write the low 8 bytes
     783           0 : }
     784             : // Store helper: writes the low 8 bytes of accumulate_store(x, dst) to dst.
     785           0 : static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
     786           0 :   __m128i y = accumulate_store(x, dst);  // defined outside this excerpt; presumably blends *x with dst - TODO confirm
     787             :   _mm_storel_epi64((__m128i *)dst, y);
     788           0 : }
     789             : // 8-pixel store variants; av1_convolve_vert_ssse3 indexes this with conv_params->ref.
     790             : static store_pixel_t store8pixelTab[2] = { store_8_pixel_only,
     791             :                                            accumulate_store_8_pixel };
     792             : // Vertically filters 8 adjacent columns; returns 8 rounded results clamped to [0, 255] in 16-bit lanes.
     793           0 : static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
     794             :                                  int tapsNum, __m128i *f) {
     795             :   __m128i s[12];
     796           0 :   const __m128i k_256 = _mm_set1_epi16(1 << 8);
     797           0 :   const __m128i zero = _mm_setzero_si128();
     798             :   __m128i min_x2x3, max_x2x3, sum;
     799           0 :   int i = 0;
     800           0 :   int r = 0;
     801             :   // 10 taps: slot 0 is zero and src rows 0..10 fill slots 1..11; otherwise rows 0..11 fill slots 0..11.
     802           0 :   if (10 == tapsNum) {
     803           0 :     i += 1;
     804           0 :     s[0] = zero;
     805             :   }
     806           0 :   while (i < 12) {
     807           0 :     s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
     808           0 :     i += 1;
     809           0 :     r += 1;
     810             :   }
     811             :   // Interleave the low 8 bytes of each pair of slots (two rows side by side per column).
     812           0 :   s[0] = _mm_unpacklo_epi8(s[0], s[1]);
     813           0 :   s[2] = _mm_unpacklo_epi8(s[2], s[3]);
     814           0 :   s[4] = _mm_unpacklo_epi8(s[4], s[5]);
     815           0 :   s[6] = _mm_unpacklo_epi8(s[6], s[7]);
     816           0 :   s[8] = _mm_unpacklo_epi8(s[8], s[9]);
     817           0 :   s[10] = _mm_unpacklo_epi8(s[10], s[11]);
     818             :   // Multiply each byte pair by the matching bytes of f[k] and add the two products.
     819           0 :   s[0] = _mm_maddubs_epi16(s[0], f[0]);
     820           0 :   s[2] = _mm_maddubs_epi16(s[2], f[1]);
     821           0 :   s[4] = _mm_maddubs_epi16(s[4], f[2]);
     822           0 :   s[6] = _mm_maddubs_epi16(s[6], f[3]);
     823           0 :   s[8] = _mm_maddubs_epi16(s[8], f[4]);
     824           0 :   s[10] = _mm_maddubs_epi16(s[10], f[5]);
     825             :   // Saturating sum; the two middle terms are added last, smaller first.
     826           0 :   min_x2x3 = _mm_min_epi16(s[4], s[6]);
     827           0 :   max_x2x3 = _mm_max_epi16(s[4], s[6]);
     828           0 :   sum = _mm_adds_epi16(s[0], s[2]);
     829           0 :   sum = _mm_adds_epi16(sum, s[10]);
     830           0 :   sum = _mm_adds_epi16(sum, s[8]);
     831             : 
     832           0 :   sum = _mm_adds_epi16(sum, min_x2x3);
     833           0 :   sum = _mm_adds_epi16(sum, max_x2x3);
     834             :   // (sum + 64) >> 7 via mulhrs, then clamp to [0, 255] while keeping 16-bit lanes.
     835           0 :   sum = _mm_mulhrs_epi16(sum, k_256);
     836           0 :   sum = _mm_packus_epi16(sum, sum);
     837           0 :   sum = _mm_unpacklo_epi8(sum, zero);
     838           0 :   return sum;
     839             : }
     840             : // Filters 8 adjacent columns of one output row and hands the result to store_func.
     841           0 : static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
     842             :                                              __m128i *f, int tapsNum,
     843             :                                              store_pixel_t store_func,
     844             :                                              uint8_t *dst) {
     845           0 :   __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
     846           0 :   store_func(&sum, dst);
     847           0 : }
     848             : // Filters h rows of one column group, advancing src and dst by one row per iteration.
     849           0 : static void filter_vert_compute_small(const uint8_t *src, int src_stride,
     850             :                                       __m128i *f, int tapsNum,
     851             :                                       store_pixel_t store_func, int h,
     852             :                                       uint8_t *dst, int dst_stride) {
     853           0 :   int rowIndex = 0;
     854             :   do {  // the body runs at least once, even if h <= 0
     855           0 :     filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
     856             :                                      dst);
     857           0 :     rowIndex++;
     858           0 :     src += src_stride;
     859           0 :     dst += dst_stride;
     860           0 :   } while (rowIndex < h);
     861           0 : }
     862             : // Filters a w x h block row by row, 8 columns per call.
     863           0 : static void filter_vert_compute_large(const uint8_t *src, int src_stride,
     864             :                                       __m128i *f, int tapsNum,
     865             :                                       store_pixel_t store_func, int w, int h,
     866             :                                       uint8_t *dst, int dst_stride) {
     867             :   int col;
     868           0 :   int rowIndex = 0;
     869           0 :   const uint8_t *src_ptr = src;
     870           0 :   uint8_t *dst_ptr = dst;
     871             :   // col advances by 8, so the last call of a row covers columns up to the next multiple of 8.
     872             :   do {
     873           0 :     for (col = 0; col < w; col += 8) {
     874           0 :       filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
     875             :                                        store_func, dst_ptr);
     876           0 :       src_ptr += 8;
     877           0 :       dst_ptr += 8;
     878             :     }
     879           0 :     rowIndex++;
     880           0 :     src_ptr = src + rowIndex * src_stride;
     881           0 :     dst_ptr = dst + rowIndex * dst_stride;
     882           0 :   } while (rowIndex < h);
     883           0 : }
     884             : // Vertical convolution (SSSE3); falls back to av1_convolve_vert_c when no SIMD path applies.
     885           0 : void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
     886             :                              int dst_stride, int w, int h,
     887             :                              const InterpFilterParams filter_params,
     888             :                              const int subpel_y_q4, int y_step_q4,
     889             :                              ConvolveParams *conv_params) {
     890             :   __m128i verf[6];
     891             :   SubpelFilterCoeffs vCoeffs;
     892             :   const uint8_t *src_ptr;
     893           0 :   uint8_t *dst_ptr = dst;
     894           0 :   store_pixel_t store2p = store2pixelTab[conv_params->ref];
     895           0 :   store_pixel_t store4p = store4pixelTab[conv_params->ref];
     896           0 :   store_pixel_t store8p = store8pixelTab[conv_params->ref];
     897           0 :   const int tapsNum = filter_params.taps;
     898             :   // subpel_y_q4 == 0 or y_step_q4 != 16: use the C implementation.
     899           0 :   if (0 == subpel_y_q4 || 16 != y_step_q4) {
     900           0 :     av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
     901             :                         subpel_y_q4, y_step_q4, conv_params);
     902           0 :     return;
     903             :   }
     904             :   // Look up the coefficient layout with index subpel_y_q4 - 1.
     905           0 :   vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
     906             :   // A null lookup result also falls back to the C implementation.
     907           0 :   if (!vCoeffs) {
     908           0 :     av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
     909             :                         subpel_y_q4, y_step_q4, conv_params);
     910           0 :     return;
     911             :   }
     912             :   // Copy six 16-byte coefficient vectors out of vCoeffs.
     913           0 :   verf[0] = *((const __m128i *)(vCoeffs));
     914           0 :   verf[1] = *((const __m128i *)(vCoeffs + 1));
     915           0 :   verf[2] = *((const __m128i *)(vCoeffs + 2));
     916           0 :   verf[3] = *((const __m128i *)(vCoeffs + 3));
     917           0 :   verf[4] = *((const __m128i *)(vCoeffs + 4));
     918           0 :   verf[5] = *((const __m128i *)(vCoeffs + 5));
     919             :   // Step src up by tapsNum / 2 - 1 rows.
     920           0 :   src -= src_stride * ((tapsNum >> 1) - 1);
     921           0 :   src_ptr = src;
     922             :   // Pick the store width from w; only w > 4, w == 4 and w == 2 are handled.
     923           0 :   if (w > 4) {
     924           0 :     filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h,
     925             :                               dst_ptr, dst_stride);
     926           0 :   } else if (4 == w) {
     927           0 :     filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h,
     928             :                               dst_ptr, dst_stride);
     929           0 :   } else if (2 == w) {
     930           0 :     filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h,
     931             :                               dst_ptr, dst_stride);
     932             :   } else {
     933           0 :     assert(0);
     934             :   }
     935             : }
     936             : // Builds two 16-byte int8 kernels per shift: taps at byte `offset` and at `offset + 2`, zero elsewhere.
     937           0 : static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps,
     938             :                                    int8_t (*simd_horiz_filter)[2][16]) {
     939             :   int shift;
     940           0 :   int offset = (12 - taps) / 2;
     941             :   const int16_t *filter_row;
     942           0 :   for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {  // shift 0 gets no entry; rows are stored at shift - 1
     943             :     int i;
     944           0 :     filter_row = filter_ptr + shift * taps;
     945           0 :     for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
     946             : 
     947           0 :     for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0;
     948             :     // Copy the taps into both kernels, narrowing each int16 coefficient to int8.
     949           0 :     for (i = 0; i < taps; ++i) {
     950           0 :       simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i];
     951           0 :       simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i];
     952             :     }
     953             :     // Zero the bytes that follow the taps in each kernel.
     954           0 :     for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
     955             : 
     956           0 :     for (i = offset + 2 + taps; i < 16; ++i)
     957           0 :       simd_horiz_filter[shift - 1][1][i] = 0;
     958             :   }
     959           0 : }
     960             : // Builds six 16-byte vectors per shift; vector i repeats the tap pair (2i - offset, 2i + 1 - offset).
     961           0 : static void init_simd_vert_filter(const int16_t *filter_ptr, int taps,
     962             :                                   int8_t (*simd_vert_filter)[6][16]) {
     963             :   int shift;
     964           0 :   int offset = (12 - taps) / 2;
     965             :   const int16_t *filter_row;
     966           0 :   for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {  // rows are stored at shift - 1
     967             :     int i;
     968           0 :     filter_row = filter_ptr + shift * taps;
     969           0 :     for (i = 0; i < 6; ++i) {
     970             :       int j;
     971           0 :       for (j = 0; j < 16; ++j) {
     972           0 :         int c = i * 2 + (j % 2) - offset;  // tap index: even bytes take the first tap of the pair
     973           0 :         if (c >= 0 && c < taps)
     974           0 :           simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c];
     975             :         else  // slots outside the real taps are zero padding
     976           0 :           simd_vert_filter[shift - 1][i][j] = 0;
     977             :       }
     978             :     }
     979             :   }
     980           0 : }
     981             : // Associates an interpolation filter id with its two SIMD coefficient tables.
     982             : typedef struct SimdFilter {
     983             :   InterpFilter interp_filter;
     984             :   int8_t (*simd_horiz_filter)[2][16];  // per shift: two 16-byte kernels (init_simd_horiz_filter)
     985             :   int8_t (*simd_vert_filter)[6][16];   // per shift: six 16-byte tap-pair vectors (init_simd_vert_filter)
     986             : } SimdFilter;
     987             : // Filters whose tables av1_lowbd_convolve_init_ssse3 fills when the extra-filter build flags are set.
     988             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
     989             : #define MULTITAP_FILTER_NUM 1
     990             : SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
     991             :   { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
     992             :     &sub_pel_filters_12sharp_ver_signal_dir[0] },
     993             : };
     994             : #endif
     995             : // Table entry for the 12-tap temporal filter; filled by av1_lowbd_convolve_init_ssse3.
     996             : #if USE_TEMPORALFILTER_12TAP
     997             : SimdFilter temporal_simd_filter = {
     998             :   TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0],
     999             :   &sub_pel_filters_temporalfilter_12_ver_signal_dir[0]
    1000             : };
    1001             : #endif
    1002             : // Fills the int8 SIMD coefficient tables (both layouts) for each filter enabled at build time.
    1003           0 : void av1_lowbd_convolve_init_ssse3(void) {
    1004             : #if USE_TEMPORALFILTER_12TAP
    1005             :   {
    1006           0 :     InterpFilterParams filter_params =
    1007           0 :         av1_get_interp_filter_params(temporal_simd_filter.interp_filter);
    1008           0 :     int taps = filter_params.taps;
    1009           0 :     const int16_t *filter_ptr = filter_params.filter_ptr;
    1010           0 :     init_simd_horiz_filter(filter_ptr, taps,
    1011             :                            temporal_simd_filter.simd_horiz_filter);
    1012           0 :     init_simd_vert_filter(filter_ptr, taps,
    1013             :                           temporal_simd_filter.simd_vert_filter);
    1014             :   }
    1015             : #endif
    1016             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
    1017             :   {
    1018             :     int i;
    1019             :     for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {  // one pass per simd_filters entry
    1020             :       InterpFilter interp_filter = simd_filters[i].interp_filter;
    1021             :       InterpFilterParams filter_params =
    1022             :           av1_get_interp_filter_params(interp_filter);
    1023             :       int taps = filter_params.taps;
    1024             :       const int16_t *filter_ptr = filter_params.filter_ptr;
    1025             :       init_simd_horiz_filter(filter_ptr, taps,
    1026             :                              simd_filters[i].simd_horiz_filter);
    1027             :       init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter);
    1028             :     }
    1029             :   }
    1030             : #endif
    1031           0 :   return;
    1032             : }

Generated by: LCOV version 1.13