LCOV - code coverage report
Current view: top level - media/libvpx/libvpx/vpx_dsp/x86 - variance_impl_avx2.c (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18
Coverage: Lines: 0 / 364 hit (0.0 %), Functions: 0 / 4 hit (0.0 %)
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include <immintrin.h>  // AVX2
      12             : 
      13             : #include "./vpx_dsp_rtcd.h"
      14             : #include "vpx_ports/mem.h"
      15             : 
      16             : /* clang-format off */
      17             : DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
      18             :   16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
      19             :   16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
      20             :   14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
      21             :   14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
      22             :   12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
      23             :   12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
      24             :   10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
      25             :   10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
      26             :   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
      27             :   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
      28             :   6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
      29             :   6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
      30             :   4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
      31             :   4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
      32             :   2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
      33             :   2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
      34             : };
      35             : /* clang-format on */
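
A note on layout: each pair of rows above (64 bytes) holds one bilinear weight pair (w0, w1) with w0 + w1 == 16, repeated as interleaved bytes so that a single aligned 256-bit load yields the second operand for _mm256_maddubs_epi16 in FILTER_SRC below. The sub-pixel code indexes the table with offset << 5 (offset * 32 bytes). A minimal scalar sketch of that lookup; the helper name is illustrative and not part of libvpx:

    /* Illustrative sketch: read back the scalar weight pair selected by an
     * offset, using the same (offset << 5) indexing as the code below. */
    static void get_bilinear_pair(int offset, int *w0, int *w1) {
      const uint8_t *p = bilinear_filters_avx2 + (offset << 5);
      *w0 = p[0];  /* even byte of every interleaved pair */
      *w1 = p[1];  /* odd byte; the pair always sums to 16 */
    }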
      36             : 
      37           0 : void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
      38             :                           const unsigned char *ref_ptr, int recon_stride,
      39             :                           unsigned int *SSE, int *Sum) {
      40             :   __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
      41             :   __m256i ref_expand_high, madd_low, madd_high;
      42             :   unsigned int i, src_2strides, ref_2strides;
      43           0 :   __m256i zero_reg = _mm256_set1_epi16(0);
      44           0 :   __m256i sum_ref_src = _mm256_set1_epi16(0);
      45           0 :   __m256i madd_ref_src = _mm256_set1_epi16(0);
      46             : 
      47             :   // processing two strides in a 256-bit register, reducing the number
      48             :   // of loop iterations by half (compared to the sse2 code)
      49           0 :   src_2strides = source_stride << 1;
      50           0 :   ref_2strides = recon_stride << 1;
      51           0 :   for (i = 0; i < 8; i++) {
      52           0 :     src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
      53           0 :     src = _mm256_inserti128_si256(
      54             :         src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
      55             : 
      56           0 :     ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
      57           0 :     ref = _mm256_inserti128_si256(
      58             :         ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
      59             : 
      60             :     // expanding each byte lane to 16 bits
      61           0 :     src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
      62           0 :     src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
      63             : 
      64           0 :     ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
      65           0 :     ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
      66             : 
      67             :     // src-ref
      68           0 :     src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
      69           0 :     src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
      70             : 
      71             :     // madd low (src - ref)
      72           0 :     madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
      73             : 
      74             :     // add high to low
      75           0 :     src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
      76             : 
      77             :     // madd high (src - ref)
      78           0 :     madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
      79             : 
      80           0 :     sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
      81             : 
      82             :     // add high to low
      83           0 :     madd_ref_src =
      84           0 :         _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
      85             : 
      86           0 :     src_ptr += src_2strides;
      87           0 :     ref_ptr += ref_2strides;
      88             :   }
      89             : 
      90             :   {
      91             :     __m128i sum_res, madd_res;
      92             :     __m128i expand_sum_low, expand_sum_high, expand_sum;
      93             :     __m128i expand_madd_low, expand_madd_high, expand_madd;
      94             :     __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
      95             : 
      96             :     // extract the low lane and add it to the high lane
      97           0 :     sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
      98           0 :                             _mm256_extractf128_si256(sum_ref_src, 1));
      99             : 
     100           0 :     madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
     101           0 :                              _mm256_extractf128_si256(madd_ref_src, 1));
     102             : 
     103             :     // padding each 2 bytes with another 2 zeroed bytes
     104           0 :     expand_sum_low =
     105           0 :         _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
     106           0 :     expand_sum_high =
     107           0 :         _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
     108             : 
      109             :     // arithmetic right shift by 16 to sign-extend the 16-bit sums
     110           0 :     expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
     111           0 :     expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
     112             : 
     113           0 :     expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
     114             : 
     115             :     // expand each 32 bits of the madd result to 64 bits
     116           0 :     expand_madd_low =
     117           0 :         _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
     118           0 :     expand_madd_high =
     119           0 :         _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
     120             : 
     121           0 :     expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
     122             : 
     123           0 :     ex_expand_sum_low =
     124           0 :         _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
     125           0 :     ex_expand_sum_high =
     126           0 :         _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
     127             : 
     128           0 :     ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
     129             : 
      130             :     // shift right by 8 bytes
     131           0 :     madd_res = _mm_srli_si128(expand_madd, 8);
     132           0 :     sum_res = _mm_srli_si128(ex_expand_sum, 8);
     133             : 
     134           0 :     madd_res = _mm_add_epi32(madd_res, expand_madd);
     135           0 :     sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
     136             : 
     137           0 :     *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
     138             : 
     139           0 :     *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
     140             :   }
     141           0 : }
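
For reference, the routine above produces the block's raw statistics: *Sum is the sum of (src - ref) differences and *SSE the sum of squared differences over the 16x16 block. A scalar sketch of the same computation (illustrative, not the libvpx C fallback):

    /* Scalar reference sketch: same Sum/SSE outputs as vpx_get16x16var_avx2. */
    static void get16x16var_sketch(const unsigned char *src_ptr, int source_stride,
                                   const unsigned char *ref_ptr, int recon_stride,
                                   unsigned int *SSE, int *Sum) {
      int r, c, sum = 0;
      unsigned int sse = 0;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c) {
          const int diff = src_ptr[c] - ref_ptr[c];
          sum += diff;
          sse += diff * diff;
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
      }
      *SSE = sse;
      *Sum = sum;
    }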
     142             : 
     143           0 : void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
     144             :                           const unsigned char *ref_ptr, int recon_stride,
     145             :                           unsigned int *SSE, int *Sum) {
     146             :   __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
     147             :   __m256i ref_expand_high, madd_low, madd_high;
     148             :   unsigned int i;
     149           0 :   __m256i zero_reg = _mm256_set1_epi16(0);
     150           0 :   __m256i sum_ref_src = _mm256_set1_epi16(0);
     151           0 :   __m256i madd_ref_src = _mm256_set1_epi16(0);
     152             : 
     153             :   // processing 32 elements in parallel
     154           0 :   for (i = 0; i < 16; i++) {
     155           0 :     src = _mm256_loadu_si256((__m256i const *)(src_ptr));
     156             : 
     157           0 :     ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
     158             : 
      159             :     // expanding each byte lane to 16 bits
     160           0 :     src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
     161           0 :     src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
     162             : 
     163           0 :     ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
     164           0 :     ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
     165             : 
     166             :     // src-ref
     167           0 :     src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
     168           0 :     src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
     169             : 
     170             :     // madd low (src - ref)
     171           0 :     madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
     172             : 
     173             :     // add high to low
     174           0 :     src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
     175             : 
     176             :     // madd high (src - ref)
     177           0 :     madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
     178             : 
     179           0 :     sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
     180             : 
     181             :     // add high to low
     182           0 :     madd_ref_src =
     183           0 :         _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
     184             : 
     185           0 :     src_ptr += source_stride;
     186           0 :     ref_ptr += recon_stride;
     187             :   }
     188             : 
     189             :   {
     190             :     __m256i expand_sum_low, expand_sum_high, expand_sum;
     191             :     __m256i expand_madd_low, expand_madd_high, expand_madd;
     192             :     __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
     193             : 
     194             :     // padding each 2 bytes with another 2 zeroed bytes
     195           0 :     expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
     196           0 :     expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
     197             : 
      198             :     // arithmetic right shift by 16 to sign-extend the 16-bit sums
     199           0 :     expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
     200           0 :     expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
     201             : 
     202           0 :     expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
     203             : 
     204             :     // expand each 32 bits of the madd result to 64 bits
     205           0 :     expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
     206           0 :     expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
     207             : 
     208           0 :     expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
     209             : 
     210           0 :     ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
     211           0 :     ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
     212             : 
     213           0 :     ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
     214             : 
      215             :     // shift right by 8 bytes
     216           0 :     madd_ref_src = _mm256_srli_si256(expand_madd, 8);
     217           0 :     sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
     218             : 
     219           0 :     madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
     220           0 :     sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
     221             : 
     222             :     // extract the low lane and the high lane and add the results
     223           0 :     *((int *)SSE) =
     224           0 :         _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
     225           0 :         _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
     226             : 
     227           0 :     *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
     228           0 :                     _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
     229             :   }
     230           0 : }
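
Both get*var helpers return only the raw Sum and SSE; the quantity vpx reports as "variance" is typically formed by the caller using the identity sum((d - mean)^2) = SSE - Sum^2 / N over the N pixels of the block. A hedged sketch of that final step (helper name illustrative, assumes <stdint.h> for int64_t):

    /* Sketch of the usual reduction from Sum/SSE to a block variance. */
    static unsigned int variance_from_sum_sse(unsigned int sse, int sum, int n) {
      return sse - (unsigned int)(((int64_t)sum * sum) / n);
    }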
     231             : 
     232             : #define FILTER_SRC(filter)                               \
     233             :   /* filter the source */                                \
     234             :   exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
     235             :   exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
     236             :                                                          \
     237             :   /* add 8 to source */                                  \
     238             :   exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
     239             :   exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
     240             :                                                          \
     241             :   /* divide source by 16 */                              \
     242             :   exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
     243             :   exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
     244             : 
     245             : #define MERGE_WITH_SRC(src_reg, reg)               \
     246             :   exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
     247             :   exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
     248             : 
     249             : #define LOAD_SRC_DST                                    \
     250             :   /* load source and destination */                     \
     251             :   src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
     252             :   dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
     253             : 
     254             : #define AVG_NEXT_SRC(src_reg, size_stride)                                 \
     255             :   src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
     256             :   /* average between current and next stride source */                     \
     257             :   src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
     258             : 
     259             : #define MERGE_NEXT_SRC(src_reg, size_stride)                               \
     260             :   src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
     261             :   MERGE_WITH_SRC(src_reg, src_next_reg)
     262             : 
     263             : #define CALC_SUM_SSE_INSIDE_LOOP                          \
     264             :   /* expand each byte to 2 bytes */                       \
     265             :   exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
     266             :   exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
     267             :   /* source - dest */                                     \
     268             :   exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
     269             :   exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
      270             :   /* calculate sum */                                     \
     271             :   sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
     272             :   exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
     273             :   sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
     274             :   exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
     275             :   /* calculate sse */                                     \
     276             :   sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
     277             :   sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
     278             : 
      279             : // final calculation of sum and sse
     280             : #define CALC_SUM_AND_SSE                                                   \
     281             :   res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
     282             :   sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
     283             :   sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
     284             :   sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
     285             :   sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
     286             :   sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
     287             :                                                                            \
     288             :   sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
     289             :   sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
     290             :                                                                            \
     291             :   sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
     292             :   sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
     293             :   *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
     294             :                   _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
     295             :   sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
     296             :   sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
     297             :   sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
     298             :         _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
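
FILTER_SRC, combined with the weight table above, computes the usual rounded bilinear blend: each output pixel is (p0 * w0 + p1 * w1 + 8) >> 4 with w0 + w1 == 16 (the AVX2 code keeps the result in 16-bit lanes). A one-pixel scalar sketch of that step, for illustration only:

    /* Scalar sketch of what FILTER_SRC does to one pair of input pixels.
     * The +8 / >>4 performs round-to-nearest division by 16. */
    static uint8_t bilinear_one_pixel(uint8_t p0, uint8_t p1, int w0, int w1) {
      return (uint8_t)((p0 * w0 + p1 * w1 + 8) >> 4);
    }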
     299             : 
     300           0 : unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
     301             :                                              int x_offset, int y_offset,
     302             :                                              const uint8_t *dst, int dst_stride,
     303             :                                              int height, unsigned int *sse) {
     304             :   __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
     305             :   __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
     306             :   __m256i zero_reg;
     307             :   int i, sum;
     308           0 :   sum_reg = _mm256_set1_epi16(0);
     309           0 :   sse_reg = _mm256_set1_epi16(0);
     310           0 :   zero_reg = _mm256_set1_epi16(0);
     311             : 
     312             :   // x_offset = 0 and y_offset = 0
     313           0 :   if (x_offset == 0) {
     314           0 :     if (y_offset == 0) {
     315           0 :       for (i = 0; i < height; i++) {
     316           0 :         LOAD_SRC_DST
      317             :         // expand each byte to 2 bytes
     318           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     319           0 :         CALC_SUM_SSE_INSIDE_LOOP
     320           0 :         src += src_stride;
     321           0 :         dst += dst_stride;
     322             :       }
     323             :       // x_offset = 0 and y_offset = 8
     324           0 :     } else if (y_offset == 8) {
     325             :       __m256i src_next_reg;
     326           0 :       for (i = 0; i < height; i++) {
     327           0 :         LOAD_SRC_DST
     328           0 :         AVG_NEXT_SRC(src_reg, src_stride)
      329             :         // expand each byte to 2 bytes
     330           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     331           0 :         CALC_SUM_SSE_INSIDE_LOOP
     332           0 :         src += src_stride;
     333           0 :         dst += dst_stride;
     334             :       }
     335             :       // x_offset = 0 and y_offset = bilin interpolation
     336             :     } else {
     337             :       __m256i filter, pw8, src_next_reg;
     338             : 
     339           0 :       y_offset <<= 5;
     340           0 :       filter = _mm256_load_si256(
     341             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     342           0 :       pw8 = _mm256_set1_epi16(8);
     343           0 :       for (i = 0; i < height; i++) {
     344           0 :         LOAD_SRC_DST
     345           0 :         MERGE_NEXT_SRC(src_reg, src_stride)
     346           0 :         FILTER_SRC(filter)
     347           0 :         CALC_SUM_SSE_INSIDE_LOOP
     348           0 :         src += src_stride;
     349           0 :         dst += dst_stride;
     350             :       }
     351             :     }
     352             :     // x_offset = 8  and y_offset = 0
     353           0 :   } else if (x_offset == 8) {
     354           0 :     if (y_offset == 0) {
     355             :       __m256i src_next_reg;
     356           0 :       for (i = 0; i < height; i++) {
     357           0 :         LOAD_SRC_DST
     358           0 :         AVG_NEXT_SRC(src_reg, 1)
     359             :         // expand each byte to 2 bytes
     360           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     361           0 :         CALC_SUM_SSE_INSIDE_LOOP
     362           0 :         src += src_stride;
     363           0 :         dst += dst_stride;
     364             :       }
     365             :       // x_offset = 8  and y_offset = 8
     366           0 :     } else if (y_offset == 8) {
     367             :       __m256i src_next_reg, src_avg;
      368             :       // load source and another source starting from the
      369             :       // next byte
     370           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     371           0 :       AVG_NEXT_SRC(src_reg, 1)
     372           0 :       for (i = 0; i < height; i++) {
     373           0 :         src_avg = src_reg;
     374           0 :         src += src_stride;
     375           0 :         LOAD_SRC_DST
     376           0 :         AVG_NEXT_SRC(src_reg, 1)
      377             :         // average the previous average with the current average
     378           0 :         src_avg = _mm256_avg_epu8(src_avg, src_reg);
     379             :         // expand each byte to 2 bytes
     380           0 :         MERGE_WITH_SRC(src_avg, zero_reg)
     381             :         // save current source average
     382           0 :         CALC_SUM_SSE_INSIDE_LOOP
     383           0 :         dst += dst_stride;
     384             :       }
     385             :       // x_offset = 8  and y_offset = bilin interpolation
     386             :     } else {
     387             :       __m256i filter, pw8, src_next_reg, src_avg;
     388           0 :       y_offset <<= 5;
     389           0 :       filter = _mm256_load_si256(
     390             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     391           0 :       pw8 = _mm256_set1_epi16(8);
      392             :       // load source and another source starting from the
      393             :       // next byte
     394           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     395           0 :       AVG_NEXT_SRC(src_reg, 1)
     396           0 :       for (i = 0; i < height; i++) {
     397             :         // save current source average
     398           0 :         src_avg = src_reg;
     399           0 :         src += src_stride;
     400           0 :         LOAD_SRC_DST
     401           0 :         AVG_NEXT_SRC(src_reg, 1)
     402           0 :         MERGE_WITH_SRC(src_avg, src_reg)
     403           0 :         FILTER_SRC(filter)
     404           0 :         CALC_SUM_SSE_INSIDE_LOOP
     405           0 :         dst += dst_stride;
     406             :       }
     407             :     }
     408             :     // x_offset = bilin interpolation and y_offset = 0
     409             :   } else {
     410           0 :     if (y_offset == 0) {
     411             :       __m256i filter, pw8, src_next_reg;
     412           0 :       x_offset <<= 5;
     413           0 :       filter = _mm256_load_si256(
     414             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     415           0 :       pw8 = _mm256_set1_epi16(8);
     416           0 :       for (i = 0; i < height; i++) {
     417           0 :         LOAD_SRC_DST
     418           0 :         MERGE_NEXT_SRC(src_reg, 1)
     419           0 :         FILTER_SRC(filter)
     420           0 :         CALC_SUM_SSE_INSIDE_LOOP
     421           0 :         src += src_stride;
     422           0 :         dst += dst_stride;
     423             :       }
     424             :       // x_offset = bilin interpolation and y_offset = 8
     425           0 :     } else if (y_offset == 8) {
     426             :       __m256i filter, pw8, src_next_reg, src_pack;
     427           0 :       x_offset <<= 5;
     428           0 :       filter = _mm256_load_si256(
     429             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     430           0 :       pw8 = _mm256_set1_epi16(8);
     431           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     432           0 :       MERGE_NEXT_SRC(src_reg, 1)
     433           0 :       FILTER_SRC(filter)
      434             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     435           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     436           0 :       for (i = 0; i < height; i++) {
     437           0 :         src += src_stride;
     438           0 :         LOAD_SRC_DST
     439           0 :         MERGE_NEXT_SRC(src_reg, 1)
     440           0 :         FILTER_SRC(filter)
     441           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      442             :         // average the previous packed row with the current one
     443           0 :         src_pack = _mm256_avg_epu8(src_pack, src_reg);
     444           0 :         MERGE_WITH_SRC(src_pack, zero_reg)
     445           0 :         CALC_SUM_SSE_INSIDE_LOOP
     446           0 :         src_pack = src_reg;
     447           0 :         dst += dst_stride;
     448             :       }
     449             :       // x_offset = bilin interpolation and y_offset = bilin interpolation
     450             :     } else {
     451             :       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
     452           0 :       x_offset <<= 5;
     453           0 :       xfilter = _mm256_load_si256(
     454             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     455           0 :       y_offset <<= 5;
     456           0 :       yfilter = _mm256_load_si256(
     457             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     458           0 :       pw8 = _mm256_set1_epi16(8);
      459             :       // load source and another source starting from the
      460             :       // next byte
     461           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     462           0 :       MERGE_NEXT_SRC(src_reg, 1)
     463             : 
     464           0 :       FILTER_SRC(xfilter)
      465             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     466           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     467           0 :       for (i = 0; i < height; i++) {
     468           0 :         src += src_stride;
     469           0 :         LOAD_SRC_DST
     470           0 :         MERGE_NEXT_SRC(src_reg, 1)
     471           0 :         FILTER_SRC(xfilter)
     472           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      473             :         // merge the previous packed row with the current one
     474           0 :         MERGE_WITH_SRC(src_pack, src_reg)
     475             :         // filter the source
     476           0 :         FILTER_SRC(yfilter)
     477           0 :         src_pack = src_reg;
     478           0 :         CALC_SUM_SSE_INSIDE_LOOP
     479           0 :         dst += dst_stride;
     480             :       }
     481             :     }
     482             :   }
     483           0 :   CALC_SUM_AND_SSE
     484           0 :   return sum;
     485             : }
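
The function above returns the signed difference sum (as an unsigned int) and writes the SSE through *sse; the per-block-size wrappers combine the two into a variance. A hedged sketch of what a 32x32 wrapper could look like, assuming the usual sse - sum*sum/N reduction (the real wrappers live elsewhere in vpx_dsp; this one is illustrative and assumes <stdint.h>):

    /* Hypothetical wrapper sketch: derive a 32x32 variance from the
     * helper's sum and sse outputs. */
    static unsigned int sub_pixel_variance32x32_sketch(
        const uint8_t *src, int src_stride, int x_offset, int y_offset,
        const uint8_t *dst, int dst_stride, unsigned int *sse) {
      const int sum = (int)vpx_sub_pixel_variance32xh_avx2(
          src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
      /* 32 * 32 = 1024 pixels, so sum^2 / 1024 is a shift by 10 */
      return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
    }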
     486             : 
     487           0 : unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
     488             :     const uint8_t *src, int src_stride, int x_offset, int y_offset,
     489             :     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
     490             :     int height, unsigned int *sse) {
     491             :   __m256i sec_reg;
     492             :   __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
     493             :   __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
     494             :   __m256i zero_reg;
     495             :   int i, sum;
     496           0 :   sum_reg = _mm256_set1_epi16(0);
     497           0 :   sse_reg = _mm256_set1_epi16(0);
     498           0 :   zero_reg = _mm256_set1_epi16(0);
     499             : 
     500             :   // x_offset = 0 and y_offset = 0
     501           0 :   if (x_offset == 0) {
     502           0 :     if (y_offset == 0) {
     503           0 :       for (i = 0; i < height; i++) {
     504           0 :         LOAD_SRC_DST
     505           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     506           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     507           0 :         sec += sec_stride;
      508             :         // expand each byte to 2 bytes
     509           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     510           0 :         CALC_SUM_SSE_INSIDE_LOOP
     511           0 :         src += src_stride;
     512           0 :         dst += dst_stride;
     513             :       }
     514           0 :     } else if (y_offset == 8) {
     515             :       __m256i src_next_reg;
     516           0 :       for (i = 0; i < height; i++) {
     517           0 :         LOAD_SRC_DST
     518           0 :         AVG_NEXT_SRC(src_reg, src_stride)
     519           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     520           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     521           0 :         sec += sec_stride;
      522             :         // expand each byte to 2 bytes
     523           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     524           0 :         CALC_SUM_SSE_INSIDE_LOOP
     525           0 :         src += src_stride;
     526           0 :         dst += dst_stride;
     527             :       }
     528             :       // x_offset = 0 and y_offset = bilin interpolation
     529             :     } else {
     530             :       __m256i filter, pw8, src_next_reg;
     531             : 
     532           0 :       y_offset <<= 5;
     533           0 :       filter = _mm256_load_si256(
     534             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     535           0 :       pw8 = _mm256_set1_epi16(8);
     536           0 :       for (i = 0; i < height; i++) {
     537           0 :         LOAD_SRC_DST
     538           0 :         MERGE_NEXT_SRC(src_reg, src_stride)
     539           0 :         FILTER_SRC(filter)
     540           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     541           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     542           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     543           0 :         sec += sec_stride;
     544           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     545           0 :         CALC_SUM_SSE_INSIDE_LOOP
     546           0 :         src += src_stride;
     547           0 :         dst += dst_stride;
     548             :       }
     549             :     }
     550             :     // x_offset = 8  and y_offset = 0
     551           0 :   } else if (x_offset == 8) {
     552           0 :     if (y_offset == 0) {
     553             :       __m256i src_next_reg;
     554           0 :       for (i = 0; i < height; i++) {
     555           0 :         LOAD_SRC_DST
     556           0 :         AVG_NEXT_SRC(src_reg, 1)
     557           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     558           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     559           0 :         sec += sec_stride;
     560             :         // expand each byte to 2 bytes
     561           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     562           0 :         CALC_SUM_SSE_INSIDE_LOOP
     563           0 :         src += src_stride;
     564           0 :         dst += dst_stride;
     565             :       }
     566             :       // x_offset = 8  and y_offset = 8
     567           0 :     } else if (y_offset == 8) {
     568             :       __m256i src_next_reg, src_avg;
      569             :       // load source and another source starting from the
      570             :       // next byte
     571           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     572           0 :       AVG_NEXT_SRC(src_reg, 1)
     573           0 :       for (i = 0; i < height; i++) {
     574             :         // save current source average
     575           0 :         src_avg = src_reg;
     576           0 :         src += src_stride;
     577           0 :         LOAD_SRC_DST
     578           0 :         AVG_NEXT_SRC(src_reg, 1)
      579             :         // average the previous average with the current average
     580           0 :         src_avg = _mm256_avg_epu8(src_avg, src_reg);
     581           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     582           0 :         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
     583           0 :         sec += sec_stride;
     584             :         // expand each byte to 2 bytes
     585           0 :         MERGE_WITH_SRC(src_avg, zero_reg)
     586           0 :         CALC_SUM_SSE_INSIDE_LOOP
     587           0 :         dst += dst_stride;
     588             :       }
     589             :       // x_offset = 8  and y_offset = bilin interpolation
     590             :     } else {
     591             :       __m256i filter, pw8, src_next_reg, src_avg;
     592           0 :       y_offset <<= 5;
     593           0 :       filter = _mm256_load_si256(
     594             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     595           0 :       pw8 = _mm256_set1_epi16(8);
      596             :       // load source and another source starting from the
      597             :       // next byte
     598           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     599           0 :       AVG_NEXT_SRC(src_reg, 1)
     600           0 :       for (i = 0; i < height; i++) {
     601             :         // save current source average
     602           0 :         src_avg = src_reg;
     603           0 :         src += src_stride;
     604           0 :         LOAD_SRC_DST
     605           0 :         AVG_NEXT_SRC(src_reg, 1)
     606           0 :         MERGE_WITH_SRC(src_avg, src_reg)
     607           0 :         FILTER_SRC(filter)
     608           0 :         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     609           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     610           0 :         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
     611             :         // expand each byte to 2 bytes
     612           0 :         MERGE_WITH_SRC(src_avg, zero_reg)
     613           0 :         sec += sec_stride;
     614           0 :         CALC_SUM_SSE_INSIDE_LOOP
     615           0 :         dst += dst_stride;
     616             :       }
     617             :     }
     618             :     // x_offset = bilin interpolation and y_offset = 0
     619             :   } else {
     620           0 :     if (y_offset == 0) {
     621             :       __m256i filter, pw8, src_next_reg;
     622           0 :       x_offset <<= 5;
     623           0 :       filter = _mm256_load_si256(
     624             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     625           0 :       pw8 = _mm256_set1_epi16(8);
     626           0 :       for (i = 0; i < height; i++) {
     627           0 :         LOAD_SRC_DST
     628           0 :         MERGE_NEXT_SRC(src_reg, 1)
     629           0 :         FILTER_SRC(filter)
     630           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     631           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     632           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     633           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     634           0 :         sec += sec_stride;
     635           0 :         CALC_SUM_SSE_INSIDE_LOOP
     636           0 :         src += src_stride;
     637           0 :         dst += dst_stride;
     638             :       }
     639             :       // x_offset = bilin interpolation and y_offset = 8
     640           0 :     } else if (y_offset == 8) {
     641             :       __m256i filter, pw8, src_next_reg, src_pack;
     642           0 :       x_offset <<= 5;
     643           0 :       filter = _mm256_load_si256(
     644             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     645           0 :       pw8 = _mm256_set1_epi16(8);
     646           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     647           0 :       MERGE_NEXT_SRC(src_reg, 1)
     648           0 :       FILTER_SRC(filter)
      649             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     650           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     651           0 :       for (i = 0; i < height; i++) {
     652           0 :         src += src_stride;
     653           0 :         LOAD_SRC_DST
     654           0 :         MERGE_NEXT_SRC(src_reg, 1)
     655           0 :         FILTER_SRC(filter)
     656           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      657             :         // average the previous packed row with the current one
     658           0 :         src_pack = _mm256_avg_epu8(src_pack, src_reg);
     659           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     660           0 :         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
     661           0 :         sec += sec_stride;
     662           0 :         MERGE_WITH_SRC(src_pack, zero_reg)
     663           0 :         src_pack = src_reg;
     664           0 :         CALC_SUM_SSE_INSIDE_LOOP
     665           0 :         dst += dst_stride;
     666             :       }
     667             :       // x_offset = bilin interpolation and y_offset = bilin interpolation
     668             :     } else {
     669             :       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
     670           0 :       x_offset <<= 5;
     671           0 :       xfilter = _mm256_load_si256(
     672             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     673           0 :       y_offset <<= 5;
     674           0 :       yfilter = _mm256_load_si256(
     675             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     676           0 :       pw8 = _mm256_set1_epi16(8);
      677             :       // load source and another source starting from the
      678             :       // next byte
     679           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     680           0 :       MERGE_NEXT_SRC(src_reg, 1)
     681             : 
     682           0 :       FILTER_SRC(xfilter)
      683             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     684           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     685           0 :       for (i = 0; i < height; i++) {
     686           0 :         src += src_stride;
     687           0 :         LOAD_SRC_DST
     688           0 :         MERGE_NEXT_SRC(src_reg, 1)
     689           0 :         FILTER_SRC(xfilter)
     690           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      691             :         // merge the previous packed row with the current one
     692           0 :         MERGE_WITH_SRC(src_pack, src_reg)
     693             :         // filter the source
     694           0 :         FILTER_SRC(yfilter)
     695           0 :         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     696           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     697           0 :         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
     698           0 :         MERGE_WITH_SRC(src_pack, zero_reg)
     699           0 :         src_pack = src_reg;
     700           0 :         sec += sec_stride;
     701           0 :         CALC_SUM_SSE_INSIDE_LOOP
     702           0 :         dst += dst_stride;
     703             :       }
     704             :     }
     705             :   }
     706           0 :   CALC_SUM_AND_SSE
     707           0 :   return sum;
     708             : }
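
The *_avg_* variant differs from the plain sub-pixel function only in that, after filtering, the prediction is additionally averaged with the second predictor sec via _mm256_avg_epu8, which rounds up. A scalar equivalent of that extra step, for illustration:

    /* Scalar equivalent of the _mm256_avg_epu8() step against the second
     * predictor: rounding average, (a + b + 1) >> 1. */
    static uint8_t avg_with_second_pred(uint8_t pred, uint8_t sec) {
      return (uint8_t)((pred + sec + 1) >> 1);
    }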

Generated by: LCOV version 1.13