LCOV - code coverage report
Current view: top level - third_party/aom/av1/common/x86 - av1_highbd_convolve_sse4.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 306 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 20 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <smmintrin.h>
      14             : 
      15             : #include "./av1_rtcd.h"
      16             : #include "av1/common/filter.h"
      17             : 
      18             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      19             : DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
      20             : #endif
      21             : 
      22             : #if USE_TEMPORALFILTER_12TAP
      23             : DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
      24             : #endif
      25             : 
      26             : typedef int16_t (*HbdSubpelFilterCoeffs)[8];
      27             : 
      28             : typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
      29             :                               int src_stride, uint16_t *dst, int dst_stride,
      30             :                               int bd);
      31             : 
      32             : static INLINE HbdSubpelFilterCoeffs
      33           0 : hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
      34             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      35             :   if (p.interp_filter == MULTITAP_SHARP) {
      36             :     return &subpel_filters_sharp[index][0];
      37             :   }
      38             : #endif
      39             : #if USE_TEMPORALFILTER_12TAP
      40           0 :   if (p.interp_filter == TEMPORALFILTER_12TAP) {
      41           0 :     return &subpel_temporalfilter[index][0];
      42             :   }
      43             : #endif
      44             :   (void)p;
      45             :   (void)index;
      46           0 :   return NULL;
      47             : }
      48             : 
      49           0 : static void init_simd_filter(const int16_t *filter_ptr, int taps,
      50             :                              int16_t (*simd_filter)[6][8]) {
      51             :   int shift;
      52           0 :   int offset = (12 - taps) / 2;
      53           0 :   for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
      54           0 :     const int16_t *filter_row = filter_ptr + shift * taps;
      55             :     int i, j;
      56           0 :     for (i = 0; i < 12; ++i) {
      57           0 :       for (j = 0; j < 4; ++j) {
      58           0 :         int r = i / 2;
      59           0 :         int c = j * 2 + (i % 2);
      60           0 :         if (i - offset >= 0 && i - offset < taps)
      61           0 :           simd_filter[shift - 1][r][c] = filter_row[i - offset];
      62             :         else
      63           0 :           simd_filter[shift - 1][r][c] = 0;
      64             :       }
      65             :     }
      66             :   }
      67           0 : }
      68             : 
      69           0 : void av1_highbd_convolve_init_sse4_1(void) {
      70             : #if USE_TEMPORALFILTER_12TAP
      71             :   {
      72           0 :     InterpFilterParams filter_params =
      73             :         av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
      74           0 :     int taps = filter_params.taps;
      75           0 :     const int16_t *filter_ptr = filter_params.filter_ptr;
      76           0 :     init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
      77             :   }
      78             : #endif
      79             : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
      80             :   {
      81             :     InterpFilterParams filter_params =
      82             :         av1_get_interp_filter_params(MULTITAP_SHARP);
      83             :     int taps = filter_params.taps;
      84             :     const int16_t *filter_ptr = filter_params.filter_ptr;
      85             :     init_simd_filter(filter_ptr, taps, subpel_filters_sharp);
      86             :   }
      87             : #endif
      88           0 : }
      89             : 
      90             : // pixelsNum 0: write all 4 pixels
      91             : //           1/2/3: residual pixels 1/2/3
      92           0 : static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
      93             :                        int dst_stride) {
      94           0 :   if (2 == width) {
      95           0 :     if (0 == pixelsNum) {
      96           0 :       *(int *)dst = _mm_cvtsi128_si32(u[0]);
      97           0 :       *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
      98           0 :       *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
      99           0 :       *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
     100           0 :     } else if (1 == pixelsNum) {
     101           0 :       *(int *)dst = _mm_cvtsi128_si32(u[0]);
     102           0 :     } else if (2 == pixelsNum) {
     103           0 :       *(int *)dst = _mm_cvtsi128_si32(u[0]);
     104           0 :       *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
     105           0 :     } else if (3 == pixelsNum) {
     106           0 :       *(int *)dst = _mm_cvtsi128_si32(u[0]);
     107           0 :       *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
     108           0 :       *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
     109             :     }
     110             :   } else {
     111           0 :     if (0 == pixelsNum) {
     112           0 :       _mm_storel_epi64((__m128i *)dst, u[0]);
     113           0 :       _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
     114           0 :       _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
     115           0 :       _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
     116           0 :     } else if (1 == pixelsNum) {
     117           0 :       _mm_storel_epi64((__m128i *)dst, u[0]);
     118           0 :     } else if (2 == pixelsNum) {
     119           0 :       _mm_storel_epi64((__m128i *)dst, u[0]);
     120           0 :       _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
     121           0 :     } else if (3 == pixelsNum) {
     122           0 :       _mm_storel_epi64((__m128i *)dst, u[0]);
     123           0 :       _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
     124           0 :       _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
     125             :     }
     126             :   }
     127           0 : }
     128             : 
     129             : // 16-bit pixels clip with bd (10/12)
     130           0 : static void highbd_clip(__m128i *p, int numVecs, int bd) {
     131           0 :   const __m128i zero = _mm_setzero_si128();
     132           0 :   const __m128i one = _mm_set1_epi16(1);
     133           0 :   const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
     134             :   __m128i clamped, mask;
     135             :   int i;
     136             : 
     137           0 :   for (i = 0; i < numVecs; i++) {
     138           0 :     mask = _mm_cmpgt_epi16(p[i], max);
     139           0 :     clamped = _mm_andnot_si128(mask, p[i]);
     140           0 :     mask = _mm_and_si128(mask, max);
     141           0 :     clamped = _mm_or_si128(mask, clamped);
     142           0 :     mask = _mm_cmpgt_epi16(clamped, zero);
     143           0 :     p[i] = _mm_and_si128(clamped, mask);
     144             :   }
     145           0 : }
     146             : 
     147           0 : static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
     148             :   __m128i v0, v1;
     149           0 :   __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
     150             : 
     151           0 :   u[0] = _mm_loadu_si128((__m128i const *)src);
     152           0 :   u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
     153           0 :   u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
     154           0 :   u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
     155             : 
     156           0 :   u[0] = _mm_add_epi32(u[0], rnd);
     157           0 :   u[1] = _mm_add_epi32(u[1], rnd);
     158           0 :   u[2] = _mm_add_epi32(u[2], rnd);
     159           0 :   u[3] = _mm_add_epi32(u[3], rnd);
     160             : 
     161           0 :   u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
     162           0 :   u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
     163           0 :   u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
     164           0 :   u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
     165             : 
     166           0 :   u[0] = _mm_packus_epi32(u[0], u[1]);
     167           0 :   u[1] = _mm_packus_epi32(u[2], u[3]);
     168             : 
     169           0 :   highbd_clip(u, 2, bd);
     170             : 
     171           0 :   v0 = _mm_unpacklo_epi16(u[0], u[1]);
     172           0 :   v1 = _mm_unpackhi_epi16(u[0], u[1]);
     173             : 
     174           0 :   u[0] = _mm_unpacklo_epi16(v0, v1);
     175           0 :   u[2] = _mm_unpackhi_epi16(v0, v1);
     176             : 
     177           0 :   u[1] = _mm_srli_si128(u[0], 8);
     178           0 :   u[3] = _mm_srli_si128(u[2], 8);
     179           0 : }
     180             : 
     181             : // pixelsNum = 0     : all 4 rows of pixels will be saved.
     182             : // pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved.
     183           0 : void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
     184             :                     uint16_t *dst, int dst_stride, int bd) {
     185             :   __m128i u[4];
     186           0 :   transClipPixel(src, src_stride, u, bd);
     187           0 :   writePixel(u, width, pixelsNum, dst, dst_stride);
     188           0 : }
     189             : 
     190           0 : void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
     191             :                           int src_stride, uint16_t *dst, int dst_stride,
     192             :                           int bd) {
     193             :   __m128i u[4], v[4];
     194           0 :   const __m128i ones = _mm_set1_epi16(1);
     195             : 
     196           0 :   transClipPixel(src, src_stride, u, bd);
     197             : 
     198           0 :   v[0] = _mm_loadl_epi64((__m128i const *)dst);
     199           0 :   v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
     200           0 :   v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
     201           0 :   v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
     202             : 
     203           0 :   u[0] = _mm_add_epi16(u[0], v[0]);
     204           0 :   u[1] = _mm_add_epi16(u[1], v[1]);
     205           0 :   u[2] = _mm_add_epi16(u[2], v[2]);
     206           0 :   u[3] = _mm_add_epi16(u[3], v[3]);
     207             : 
     208           0 :   u[0] = _mm_add_epi16(u[0], ones);
     209           0 :   u[1] = _mm_add_epi16(u[1], ones);
     210           0 :   u[2] = _mm_add_epi16(u[2], ones);
     211           0 :   u[3] = _mm_add_epi16(u[3], ones);
     212             : 
     213           0 :   u[0] = _mm_srai_epi16(u[0], 1);
     214           0 :   u[1] = _mm_srai_epi16(u[1], 1);
     215           0 :   u[2] = _mm_srai_epi16(u[2], 1);
     216           0 :   u[3] = _mm_srai_epi16(u[3], 1);
     217             : 
     218           0 :   writePixel(u, width, pixelsNum, dst, dst_stride);
     219           0 : }
     220             : 
     221             : static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 };
     222             : 
     223           0 : static INLINE void transpose_pair(__m128i *in, __m128i *out) {
     224             :   __m128i x0, x1;
     225             : 
     226           0 :   x0 = _mm_unpacklo_epi32(in[0], in[1]);
     227           0 :   x1 = _mm_unpacklo_epi32(in[2], in[3]);
     228             : 
     229           0 :   out[0] = _mm_unpacklo_epi64(x0, x1);
     230           0 :   out[1] = _mm_unpackhi_epi64(x0, x1);
     231             : 
     232           0 :   x0 = _mm_unpackhi_epi32(in[0], in[1]);
     233           0 :   x1 = _mm_unpackhi_epi32(in[2], in[3]);
     234             : 
     235           0 :   out[2] = _mm_unpacklo_epi64(x0, x1);
     236           0 :   out[3] = _mm_unpackhi_epi64(x0, x1);
     237             : 
     238           0 :   x0 = _mm_unpacklo_epi32(in[4], in[5]);
     239           0 :   x1 = _mm_unpacklo_epi32(in[6], in[7]);
     240             : 
     241           0 :   out[4] = _mm_unpacklo_epi64(x0, x1);
     242           0 :   out[5] = _mm_unpackhi_epi64(x0, x1);
     243           0 : }
     244             : 
     245           0 : static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
     246             :                                 int tapsNum, uint32_t *buf) {
     247             :   __m128i u[8], v[6];
     248             : 
     249           0 :   assert(tapsNum == 10 || tapsNum == 12);
     250           0 :   if (tapsNum == 10) {
     251           0 :     src -= 1;
     252             :   }
     253             : 
     254           0 :   u[0] = _mm_loadu_si128((__m128i const *)src);
     255           0 :   u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
     256           0 :   u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
     257           0 :   u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
     258             : 
     259           0 :   u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
     260           0 :   u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
     261           0 :   u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
     262           0 :   u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
     263             : 
     264           0 :   transpose_pair(u, v);
     265             : 
     266           0 :   u[0] = _mm_madd_epi16(v[0], f[0]);
     267           0 :   u[1] = _mm_madd_epi16(v[1], f[1]);
     268           0 :   u[2] = _mm_madd_epi16(v[2], f[2]);
     269           0 :   u[3] = _mm_madd_epi16(v[3], f[3]);
     270           0 :   u[4] = _mm_madd_epi16(v[4], f[4]);
     271           0 :   u[5] = _mm_madd_epi16(v[5], f[5]);
     272             : 
     273           0 :   u[6] = _mm_min_epi32(u[2], u[3]);
     274           0 :   u[7] = _mm_max_epi32(u[2], u[3]);
     275             : 
     276           0 :   u[0] = _mm_add_epi32(u[0], u[1]);
     277           0 :   u[0] = _mm_add_epi32(u[0], u[5]);
     278           0 :   u[0] = _mm_add_epi32(u[0], u[4]);
     279           0 :   u[0] = _mm_add_epi32(u[0], u[6]);
     280           0 :   u[0] = _mm_add_epi32(u[0], u[7]);
     281             : 
     282           0 :   _mm_storeu_si128((__m128i *)buf, u[0]);
     283           0 : }
     284             : 
     285           0 : void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
     286             :                                       uint16_t *dst, int dst_stride, int w,
     287             :                                       int h,
     288             :                                       const InterpFilterParams filter_params,
     289             :                                       const int subpel_x_q4, int x_step_q4,
     290             :                                       int avg, int bd) {
     291             :   DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
     292             :   __m128i verf[6];
     293             :   HbdSubpelFilterCoeffs vCoeffs;
     294             :   const uint16_t *srcPtr;
     295           0 :   const int tapsNum = filter_params.taps;
     296             :   int i, col, count, blkResidu, blkHeight;
     297           0 :   TransposeSave transSave = transSaveTab[avg];
     298             :   (void)x_step_q4;
     299             : 
     300           0 :   if (0 == subpel_x_q4 || 16 != x_step_q4) {
     301           0 :     av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
     302             :                                 filter_params, subpel_x_q4, x_step_q4, avg, bd);
     303           0 :     return;
     304             :   }
     305             : 
     306           0 :   vCoeffs =
     307           0 :       hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
     308           0 :   if (!vCoeffs) {
     309           0 :     av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
     310             :                                 filter_params, subpel_x_q4, x_step_q4, avg, bd);
     311           0 :     return;
     312             :   }
     313             : 
     314           0 :   verf[0] = *((const __m128i *)(vCoeffs));
     315           0 :   verf[1] = *((const __m128i *)(vCoeffs + 1));
     316           0 :   verf[2] = *((const __m128i *)(vCoeffs + 2));
     317           0 :   verf[3] = *((const __m128i *)(vCoeffs + 3));
     318           0 :   verf[4] = *((const __m128i *)(vCoeffs + 4));
     319           0 :   verf[5] = *((const __m128i *)(vCoeffs + 5));
     320             : 
     321           0 :   src -= (tapsNum >> 1) - 1;
     322           0 :   srcPtr = src;
     323             : 
     324           0 :   count = 0;
     325           0 :   blkHeight = h >> 2;
     326           0 :   blkResidu = h & 3;
     327             : 
     328           0 :   while (blkHeight != 0) {
     329           0 :     for (col = 0; col < w; col += 4) {
     330           0 :       for (i = 0; i < 4; ++i) {
     331           0 :         highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
     332           0 :         srcPtr += 1;
     333             :       }
     334           0 :       transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
     335             :     }
     336           0 :     count++;
     337           0 :     srcPtr = src + count * src_stride * 4;
     338           0 :     dst += dst_stride * 4;
     339           0 :     blkHeight--;
     340             :   }
     341             : 
     342           0 :   if (blkResidu == 0) return;
     343             : 
     344           0 :   for (col = 0; col < w; col += 4) {
     345           0 :     for (i = 0; i < 4; ++i) {
     346           0 :       highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
     347           0 :       srcPtr += 1;
     348             :     }
     349           0 :     transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
     350             :   }
     351             : }
     352             : 
     353             : // Vertical convolutional filter
     354             : 
     355             : typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
     356             : 
     357           0 : static void highbdRndingPacks(__m128i *u) {
     358           0 :   __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
     359           0 :   u[0] = _mm_add_epi32(u[0], rnd);
     360           0 :   u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
     361           0 :   u[0] = _mm_packus_epi32(u[0], u[0]);
     362           0 : }
     363             : 
     364           0 : static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
     365           0 :   highbdRndingPacks(u);
     366           0 :   highbd_clip(u, 1, bd);
     367           0 :   *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
     368           0 : }
     369             : 
     370           0 : static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
     371           0 :   __m128i v = _mm_loadl_epi64((__m128i const *)dst);
     372           0 :   const __m128i ones = _mm_set1_epi16(1);
     373             : 
     374           0 :   highbdRndingPacks(u);
     375           0 :   highbd_clip(u, 1, bd);
     376             : 
     377           0 :   v = _mm_add_epi16(v, u[0]);
     378           0 :   v = _mm_add_epi16(v, ones);
     379           0 :   v = _mm_srai_epi16(v, 1);
     380           0 :   *(uint32_t *)dst = _mm_cvtsi128_si32(v);
     381           0 : }
     382             : 
     383             : WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
     384             : 
     385           0 : static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
     386           0 :   highbdRndingPacks(u);
     387           0 :   highbd_clip(u, 1, bd);
     388           0 :   _mm_storel_epi64((__m128i *)dst, u[0]);
     389           0 : }
     390             : 
     391           0 : static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
     392           0 :   __m128i v = _mm_loadl_epi64((__m128i const *)dst);
     393           0 :   const __m128i ones = _mm_set1_epi16(1);
     394             : 
     395           0 :   highbdRndingPacks(u);
     396           0 :   highbd_clip(u, 1, bd);
     397             : 
     398           0 :   v = _mm_add_epi16(v, u[0]);
     399           0 :   v = _mm_add_epi16(v, ones);
     400           0 :   v = _mm_srai_epi16(v, 1);
     401             :   _mm_storel_epi64((__m128i *)dst, v);
     402           0 : }
     403             : 
     404             : WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
     405             : 
     406           0 : static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
     407             :                                        const __m128i *f, int taps,
     408             :                                        uint16_t *dst, WritePixels saveFunc,
     409             :                                        int bd) {
     410             :   __m128i s[12];
     411           0 :   __m128i zero = _mm_setzero_si128();
     412           0 :   int i = 0;
     413           0 :   int r = 0;
     414             : 
     415             :   // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
     416           0 :   assert(taps == 10 || taps == 12);
     417           0 :   if (10 == taps) {
     418           0 :     i += 1;
     419           0 :     s[0] = zero;
     420             :   }
     421           0 :   while (i < 12) {
     422           0 :     s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
     423           0 :     i += 1;
     424           0 :     r += 1;
     425             :   }
     426             : 
     427           0 :   s[0] = _mm_unpacklo_epi16(s[0], s[1]);
     428           0 :   s[2] = _mm_unpacklo_epi16(s[2], s[3]);
     429           0 :   s[4] = _mm_unpacklo_epi16(s[4], s[5]);
     430           0 :   s[6] = _mm_unpacklo_epi16(s[6], s[7]);
     431           0 :   s[8] = _mm_unpacklo_epi16(s[8], s[9]);
     432           0 :   s[10] = _mm_unpacklo_epi16(s[10], s[11]);
     433             : 
     434           0 :   s[0] = _mm_madd_epi16(s[0], f[0]);
     435           0 :   s[2] = _mm_madd_epi16(s[2], f[1]);
     436           0 :   s[4] = _mm_madd_epi16(s[4], f[2]);
     437           0 :   s[6] = _mm_madd_epi16(s[6], f[3]);
     438           0 :   s[8] = _mm_madd_epi16(s[8], f[4]);
     439           0 :   s[10] = _mm_madd_epi16(s[10], f[5]);
     440             : 
     441           0 :   s[1] = _mm_min_epi32(s[4], s[6]);
     442           0 :   s[3] = _mm_max_epi32(s[4], s[6]);
     443             : 
     444           0 :   s[0] = _mm_add_epi32(s[0], s[2]);
     445           0 :   s[0] = _mm_add_epi32(s[0], s[10]);
     446           0 :   s[0] = _mm_add_epi32(s[0], s[8]);
     447           0 :   s[0] = _mm_add_epi32(s[0], s[1]);
     448           0 :   s[0] = _mm_add_epi32(s[0], s[3]);
     449             : 
     450           0 :   saveFunc(s, bd, dst);
     451           0 : }
     452             : 
     453           0 : static void highbd_filter_vert_compute_large(const uint16_t *src,
     454             :                                              int src_stride, const __m128i *f,
     455             :                                              int taps, int w, int h,
     456             :                                              uint16_t *dst, int dst_stride,
     457             :                                              int avg, int bd) {
     458             :   int col;
     459           0 :   int rowIndex = 0;
     460           0 :   const uint16_t *src_ptr = src;
     461           0 :   uint16_t *dst_ptr = dst;
     462           0 :   const int step = 4;
     463           0 :   WritePixels write4pixels = write4pixelsTab[avg];
     464             : 
     465             :   do {
     466           0 :     for (col = 0; col < w; col += step) {
     467           0 :       filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
     468             :                                  write4pixels, bd);
     469           0 :       src_ptr += step;
     470           0 :       dst_ptr += step;
     471             :     }
     472           0 :     rowIndex++;
     473           0 :     src_ptr = src + rowIndex * src_stride;
     474           0 :     dst_ptr = dst + rowIndex * dst_stride;
     475           0 :   } while (rowIndex < h);
     476           0 : }
     477             : 
     478           0 : static void highbd_filter_vert_compute_small(const uint16_t *src,
     479             :                                              int src_stride, const __m128i *f,
     480             :                                              int taps, int w, int h,
     481             :                                              uint16_t *dst, int dst_stride,
     482             :                                              int avg, int bd) {
     483           0 :   int rowIndex = 0;
     484           0 :   WritePixels write2pixels = write2pixelsTab[avg];
     485             :   (void)w;
     486             : 
     487             :   do {
     488           0 :     filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
     489           0 :     rowIndex++;
     490           0 :     src += src_stride;
     491           0 :     dst += dst_stride;
     492           0 :   } while (rowIndex < h);
     493           0 : }
     494             : 
     495           0 : void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
     496             :                                      uint16_t *dst, int dst_stride, int w,
     497             :                                      int h,
     498             :                                      const InterpFilterParams filter_params,
     499             :                                      const int subpel_y_q4, int y_step_q4,
     500             :                                      int avg, int bd) {
     501             :   __m128i verf[6];
     502             :   HbdSubpelFilterCoeffs vCoeffs;
     503           0 :   const int tapsNum = filter_params.taps;
     504             : 
     505           0 :   if (0 == subpel_y_q4 || 16 != y_step_q4) {
     506           0 :     av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
     507             :                                filter_params, subpel_y_q4, y_step_q4, avg, bd);
     508           0 :     return;
     509             :   }
     510             : 
     511           0 :   vCoeffs =
     512           0 :       hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
     513           0 :   if (!vCoeffs) {
     514           0 :     av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
     515             :                                filter_params, subpel_y_q4, y_step_q4, avg, bd);
     516           0 :     return;
     517             :   }
     518             : 
     519           0 :   verf[0] = *((const __m128i *)(vCoeffs));
     520           0 :   verf[1] = *((const __m128i *)(vCoeffs + 1));
     521           0 :   verf[2] = *((const __m128i *)(vCoeffs + 2));
     522           0 :   verf[3] = *((const __m128i *)(vCoeffs + 3));
     523           0 :   verf[4] = *((const __m128i *)(vCoeffs + 4));
     524           0 :   verf[5] = *((const __m128i *)(vCoeffs + 5));
     525             : 
     526           0 :   src -= src_stride * ((tapsNum >> 1) - 1);
     527             : 
     528           0 :   if (w > 2) {
     529           0 :     highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
     530             :                                      dst_stride, avg, bd);
     531             :   } else {
     532           0 :     highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
     533             :                                      dst_stride, avg, bd);
     534             :   }
     535             : }

Generated by: LCOV version 1.13