LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - variance_impl_avx2.c (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18
Coverage:               Hit   Total   Coverage
    Lines:                0     364      0.0 %
    Functions:            0       4      0.0 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>  // AVX2
      13             : 
      14             : #include "./aom_dsp_rtcd.h"
      15             : #include "aom_ports/mem.h"
      16             : 
      17             : /* clang-format off */
      18             : DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
      19             :   16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
      20             :   16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
      21             :   14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
      22             :   14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
      23             :   12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
      24             :   12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
      25             :   10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
      26             :   10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
      27             :    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
      28             :    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
      29             :    6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
      30             :    6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
      31             :    4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
      32             :    4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
      33             :    2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
      34             :    2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
      35             : };
      36             : /* clang-format on */
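/* Each 32-byte block of the table above (two lines as printed) holds one
   bilinear weight pair (w0, w1) with w0 + w1 == 16, repeated to fill an AVX2
   register and byte-interleaved so that _mm256_maddubs_epi16 can apply both
   taps at once; a sub-pel offset selects its block via offset << 5 in the
   functions below.  The helper that follows is an illustrative scalar sketch
   of the per-pixel arithmetic those weights encode; it is not part of the
   original file. */
static unsigned char bilinear_tap_sketch(unsigned char a, unsigned char b,
                                         int w0, int w1) {
  // w0 + w1 == 16, so adding 8 before the >> 4 rounds to the nearest integer
  return (unsigned char)((a * w0 + b * w1 + 8) >> 4);
}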
      37             : 
      38           0 : void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
      39             :                           const unsigned char *ref_ptr, int recon_stride,
      40             :                           unsigned int *SSE, int *Sum) {
      41             :   __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
      42             :   __m256i ref_expand_high, madd_low, madd_high;
      43             :   unsigned int i, src_2strides, ref_2strides;
      44           0 :   __m256i zero_reg = _mm256_set1_epi16(0);
      45           0 :   __m256i sum_ref_src = _mm256_set1_epi16(0);
      46           0 :   __m256i madd_ref_src = _mm256_set1_epi16(0);
      47             : 
      48             :   // process two source rows per 256-bit register, halving the number of
      49             :   // loop iterations compared to the sse2 code
      50           0 :   src_2strides = source_stride << 1;
      51           0 :   ref_2strides = recon_stride << 1;
      52           0 :   for (i = 0; i < 8; i++) {
      53           0 :     src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
      54           0 :     src = _mm256_inserti128_si256(
      55             :         src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
      56             : 
      57           0 :     ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
      58           0 :     ref = _mm256_inserti128_si256(
      59             :         ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
      60             : 
      61             :     // expanding to 16 bit each lane
      62           0 :     src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
      63           0 :     src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
      64             : 
      65           0 :     ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
      66           0 :     ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
      67             : 
      68             :     // src-ref
      69           0 :     src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
      70           0 :     src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
      71             : 
      72             :     // madd low (src - ref)
      73           0 :     madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
      74             : 
      75             :     // add high to low
      76           0 :     src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
      77             : 
      78             :     // madd high (src - ref)
      79           0 :     madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
      80             : 
      81           0 :     sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
      82             : 
      83             :     // add high to low
      84           0 :     madd_ref_src =
      85           0 :         _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
      86             : 
      87           0 :     src_ptr += src_2strides;
      88           0 :     ref_ptr += ref_2strides;
      89             :   }
      90             : 
      91             :   {
      92             :     __m128i sum_res, madd_res;
      93             :     __m128i expand_sum_low, expand_sum_high, expand_sum;
      94             :     __m128i expand_madd_low, expand_madd_high, expand_madd;
      95             :     __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
      96             : 
      97             :     // extract the high lane and add it to the low lane
      98           0 :     sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
      99           0 :                             _mm256_extractf128_si256(sum_ref_src, 1));
     100             : 
     101           0 :     madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
     102           0 :                              _mm256_extractf128_si256(madd_ref_src, 1));
     103             : 
     104             :     // place each 16-bit sum in the upper half of a 32-bit lane (zeros below)
     105           0 :     expand_sum_low =
     106           0 :         _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
     107           0 :     expand_sum_high =
     108           0 :         _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
     109             : 
     110             :     // arithmetic shift right by 16 to sign-extend the sums to 32 bits
     111           0 :     expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
     112           0 :     expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
     113             : 
     114           0 :     expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
     115             : 
     116             :     // expand each 32 bits of the madd result to 64 bits
     117           0 :     expand_madd_low =
     118           0 :         _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
     119           0 :     expand_madd_high =
     120           0 :         _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
     121             : 
     122           0 :     expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
     123             : 
     124           0 :     ex_expand_sum_low =
     125           0 :         _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
     126           0 :     ex_expand_sum_high =
     127           0 :         _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
     128             : 
     129           0 :     ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
     130             : 
     131             :     // shift right by 8 bytes
     132           0 :     madd_res = _mm_srli_si128(expand_madd, 8);
     133           0 :     sum_res = _mm_srli_si128(ex_expand_sum, 8);
     134             : 
     135           0 :     madd_res = _mm_add_epi32(madd_res, expand_madd);
     136           0 :     sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
     137             : 
     138           0 :     *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
     139             : 
     140           0 :     *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
     141             :   }
     142             :   _mm256_zeroupper();
     143           0 : }
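/* For reference, an illustrative scalar equivalent of what the routine above
   accumulates: the signed sum of (src - ref) and the sum of squared
   differences over a 16-wide, 16-row block (the AVX2 loop covers two rows per
   iteration).  This helper is a sketch and not part of the original file. */
static void get_16x16_var_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) {
      const int d = src[j] - ref[j];  // per-pixel difference
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
}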
     144             : 
     145           0 : void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
     146             :                           const unsigned char *ref_ptr, int recon_stride,
     147             :                           unsigned int *SSE, int *Sum) {
     148             :   __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
     149             :   __m256i ref_expand_high, madd_low, madd_high;
     150             :   unsigned int i;
     151           0 :   __m256i zero_reg = _mm256_set1_epi16(0);
     152           0 :   __m256i sum_ref_src = _mm256_set1_epi16(0);
     153           0 :   __m256i madd_ref_src = _mm256_set1_epi16(0);
     154             : 
     155             :   // processing 32 elements in parallel
     156           0 :   for (i = 0; i < 16; i++) {
     157           0 :     src = _mm256_loadu_si256((__m256i const *)(src_ptr));
     158             : 
     159           0 :     ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
     160             : 
     161             :     // expanding to 16 bit each lane
     162           0 :     src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
     163           0 :     src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
     164             : 
     165           0 :     ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
     166           0 :     ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
     167             : 
     168             :     // src-ref
     169           0 :     src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
     170           0 :     src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
     171             : 
     172             :     // madd low (src - ref)
     173           0 :     madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
     174             : 
     175             :     // add high to low
     176           0 :     src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
     177             : 
     178             :     // madd high (src - ref)
     179           0 :     madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
     180             : 
     181           0 :     sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
     182             : 
     183             :     // add high to low
     184           0 :     madd_ref_src =
     185           0 :         _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
     186             : 
     187           0 :     src_ptr += source_stride;
     188           0 :     ref_ptr += recon_stride;
     189             :   }
     190             : 
     191             :   {
     192             :     __m256i expand_sum_low, expand_sum_high, expand_sum;
     193             :     __m256i expand_madd_low, expand_madd_high, expand_madd;
     194             :     __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
     195             : 
     196             :     // place each 16-bit sum in the upper half of a 32-bit lane (zeros below)
     197           0 :     expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
     198           0 :     expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
     199             : 
     200             :     // arithmetic shift right by 16 to sign-extend the sums to 32 bits
     201           0 :     expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
     202           0 :     expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
     203             : 
     204           0 :     expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
     205             : 
     206             :     // expand each 32 bits of the madd result to 64 bits
     207           0 :     expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
     208           0 :     expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
     209             : 
     210           0 :     expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
     211             : 
     212           0 :     ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
     213           0 :     ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
     214             : 
     215           0 :     ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
     216             : 
     217             :     // shift right by 8 bytes
     218           0 :     madd_ref_src = _mm256_srli_si256(expand_madd, 8);
     219           0 :     sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
     220             : 
     221           0 :     madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
     222           0 :     sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
     223             : 
     224             :     // extract the low lane and the high lane and add the results
     225           0 :     *((int *)SSE) =
     226           0 :         _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
     227           0 :         _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
     228             : 
     229           0 :     *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
     230           0 :                     _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
     231             :   }
     232             :   _mm256_zeroupper();
     233           0 : }
     234             : 
     235             : #define FILTER_SRC(filter)                               \
     236             :   /* filter the source */                                \
     237             :   exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
     238             :   exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
     239             :                                                          \
     240             :   /* add 8 to source */                                  \
     241             :   exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
     242             :   exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
     243             :                                                          \
     244             :   /* divide source by 16 */                              \
     245             :   exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
     246             :   exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
     247             : 
     248             : #define MERGE_WITH_SRC(src_reg, reg)               \
     249             :   exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
     250             :   exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
     251             : 
     252             : #define LOAD_SRC_DST                                    \
     253             :   /* load source and destination */                     \
     254             :   src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
     255             :   dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
     256             : 
     257             : #define AVG_NEXT_SRC(src_reg, size_stride)                                 \
     258             :   src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
     259             :   /* average between current and next stride source */                     \
     260             :   src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
     261             : 
     262             : #define MERGE_NEXT_SRC(src_reg, size_stride)                               \
     263             :   src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
     264             :   MERGE_WITH_SRC(src_reg, src_next_reg)
     265             : 
     266             : #define CALC_SUM_SSE_INSIDE_LOOP                          \
     267             :   /* expand each byte to 2 bytes */                       \
     268             :   exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
     269             :   exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
     270             :   /* source - dest */                                     \
     271             :   exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
     272             :   exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
     273             :   /* calculate sum */                                     \
     274             :   sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
     275             :   exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
     276             :   sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
     277             :   exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
     278             :   /* calculate sse */                                     \
     279             :   sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
     280             :   sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
     281             : 
     282             : // final calculation to sum and sse
     283             : #define CALC_SUM_AND_SSE                                                   \
     284             :   res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
     285             :   sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
     286             :   sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
     287             :   sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
     288             :   sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
     289             :   sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
     290             :                                                                            \
     291             :   sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
     292             :   sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
     293             :                                                                            \
     294             :   sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
     295             :   sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
     296             :   *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
     297             :                   _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
     298             :   sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
     299             :   sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
     300             :   sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
     301             :         _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
     302             : 
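/* CALC_SUM_AND_SSE widens the signed 16-bit partial sums before the final
   horizontal reduction: _mm256_cmpgt_epi16(zero_reg, sum_reg) is 0xFFFF
   exactly in the negative lanes, so interleaving each 16-bit sum with that
   mask yields its 32-bit sign extension.  A single-lane sketch of the trick
   (illustrative only; assumes <stdint.h> is available): */
static int32_t widen_to_i32_sketch(int16_t v) {
  const uint16_t sign_mask = (v < 0) ? 0xFFFFu : 0u;  // cmpgt(0, v) per lane
  // the unpack steps place the mask in the upper 16 bits of each 32-bit lane
  return (int32_t)(((uint32_t)sign_mask << 16) | (uint16_t)v);
}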
     303           0 : unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
     304             :                                              int x_offset, int y_offset,
     305             :                                              const uint8_t *dst, int dst_stride,
     306             :                                              int height, unsigned int *sse) {
     307             :   __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
     308             :   __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
     309             :   __m256i zero_reg;
     310             :   int i, sum;
     311           0 :   sum_reg = _mm256_set1_epi16(0);
     312           0 :   sse_reg = _mm256_set1_epi16(0);
     313           0 :   zero_reg = _mm256_set1_epi16(0);
     314             : 
     315             :   // x_offset = 0 and y_offset = 0
     316           0 :   if (x_offset == 0) {
     317           0 :     if (y_offset == 0) {
     318           0 :       for (i = 0; i < height; i++) {
     319           0 :         LOAD_SRC_DST
     320             :         // expand each byte to 2 bytes
     321           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     322           0 :         CALC_SUM_SSE_INSIDE_LOOP
     323           0 :         src += src_stride;
     324           0 :         dst += dst_stride;
     325             :       }
     326             :       // x_offset = 0 and y_offset = 8
     327           0 :     } else if (y_offset == 8) {
     328             :       __m256i src_next_reg;
     329           0 :       for (i = 0; i < height; i++) {
     330           0 :         LOAD_SRC_DST
     331           0 :         AVG_NEXT_SRC(src_reg, src_stride)
     332             :         // expand each byte to 2 bytes
     333           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     334           0 :         CALC_SUM_SSE_INSIDE_LOOP
     335           0 :         src += src_stride;
     336           0 :         dst += dst_stride;
     337             :       }
     338             :       // x_offset = 0 and y_offset = bilin interpolation
     339             :     } else {
     340             :       __m256i filter, pw8, src_next_reg;
     341             : 
     342           0 :       y_offset <<= 5;
     343           0 :       filter = _mm256_load_si256(
     344             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     345           0 :       pw8 = _mm256_set1_epi16(8);
     346           0 :       for (i = 0; i < height; i++) {
     347           0 :         LOAD_SRC_DST
     348           0 :         MERGE_NEXT_SRC(src_reg, src_stride)
     349           0 :         FILTER_SRC(filter)
     350           0 :         CALC_SUM_SSE_INSIDE_LOOP
     351           0 :         src += src_stride;
     352           0 :         dst += dst_stride;
     353             :       }
     354             :     }
     355             :     // x_offset = 8  and y_offset = 0
     356           0 :   } else if (x_offset == 8) {
     357           0 :     if (y_offset == 0) {
     358             :       __m256i src_next_reg;
     359           0 :       for (i = 0; i < height; i++) {
     360           0 :         LOAD_SRC_DST
     361           0 :         AVG_NEXT_SRC(src_reg, 1)
     362             :         // expand each byte to 2 bytes
     363           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     364           0 :         CALC_SUM_SSE_INSIDE_LOOP
     365           0 :         src += src_stride;
     366           0 :         dst += dst_stride;
     367             :       }
     368             :       // x_offset = 8  and y_offset = 8
     369           0 :     } else if (y_offset == 8) {
     370             :       __m256i src_next_reg, src_avg;
     371             :       // load the source and a second source starting
     372             :       // one byte to the right
     373           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     374           0 :       AVG_NEXT_SRC(src_reg, 1)
     375           0 :       for (i = 0; i < height; i++) {
     376           0 :         src_avg = src_reg;
     377           0 :         src += src_stride;
     378           0 :         LOAD_SRC_DST
     379           0 :         AVG_NEXT_SRC(src_reg, 1)
     380             :         // average the previous average with the current one
     381           0 :         src_avg = _mm256_avg_epu8(src_avg, src_reg);
     382             :         // expand each byte to 2 bytes
     383           0 :         MERGE_WITH_SRC(src_avg, zero_reg)
     384             :         // accumulate sum and sse for this row
     385           0 :         CALC_SUM_SSE_INSIDE_LOOP
     386           0 :         dst += dst_stride;
     387             :       }
     388             :       // x_offset = 8  and y_offset = bilin interpolation
     389             :     } else {
     390             :       __m256i filter, pw8, src_next_reg, src_avg;
     391           0 :       y_offset <<= 5;
     392           0 :       filter = _mm256_load_si256(
     393             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     394           0 :       pw8 = _mm256_set1_epi16(8);
     395             :       // load the source and a second source starting
     396             :       // one byte to the right
     397           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     398           0 :       AVG_NEXT_SRC(src_reg, 1)
     399           0 :       for (i = 0; i < height; i++) {
     400             :         // save current source average
     401           0 :         src_avg = src_reg;
     402           0 :         src += src_stride;
     403           0 :         LOAD_SRC_DST
     404           0 :         AVG_NEXT_SRC(src_reg, 1)
     405           0 :         MERGE_WITH_SRC(src_avg, src_reg)
     406           0 :         FILTER_SRC(filter)
     407           0 :         CALC_SUM_SSE_INSIDE_LOOP
     408           0 :         dst += dst_stride;
     409             :       }
     410             :     }
     411             :     // x_offset = bilin interpolation and y_offset = 0
     412             :   } else {
     413           0 :     if (y_offset == 0) {
     414             :       __m256i filter, pw8, src_next_reg;
     415           0 :       x_offset <<= 5;
     416           0 :       filter = _mm256_load_si256(
     417             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     418           0 :       pw8 = _mm256_set1_epi16(8);
     419           0 :       for (i = 0; i < height; i++) {
     420           0 :         LOAD_SRC_DST
     421           0 :         MERGE_NEXT_SRC(src_reg, 1)
     422           0 :         FILTER_SRC(filter)
     423           0 :         CALC_SUM_SSE_INSIDE_LOOP
     424           0 :         src += src_stride;
     425           0 :         dst += dst_stride;
     426             :       }
     427             :       // x_offset = bilin interpolation and y_offset = 8
     428           0 :     } else if (y_offset == 8) {
     429             :       __m256i filter, pw8, src_next_reg, src_pack;
     430           0 :       x_offset <<= 5;
     431           0 :       filter = _mm256_load_si256(
     432             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     433           0 :       pw8 = _mm256_set1_epi16(8);
     434           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     435           0 :       MERGE_NEXT_SRC(src_reg, 1)
     436           0 :       FILTER_SRC(filter)
     437             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     438           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     439           0 :       for (i = 0; i < height; i++) {
     440           0 :         src += src_stride;
     441           0 :         LOAD_SRC_DST
     442           0 :         MERGE_NEXT_SRC(src_reg, 1)
     443           0 :         FILTER_SRC(filter)
     444           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     445             :         // average the previous packed row with the current one
     446           0 :         src_pack = _mm256_avg_epu8(src_pack, src_reg);
     447           0 :         MERGE_WITH_SRC(src_pack, zero_reg)
     448           0 :         CALC_SUM_SSE_INSIDE_LOOP
     449           0 :         src_pack = src_reg;
     450           0 :         dst += dst_stride;
     451             :       }
     452             :       // x_offset = bilin interpolation and y_offset = bilin interpolation
     453             :     } else {
     454             :       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
     455           0 :       x_offset <<= 5;
     456           0 :       xfilter = _mm256_load_si256(
     457             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     458           0 :       y_offset <<= 5;
     459           0 :       yfilter = _mm256_load_si256(
     460             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     461           0 :       pw8 = _mm256_set1_epi16(8);
     462             :       // load the source and a second source starting
     463             :       // one byte to the right
     464           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     465           0 :       MERGE_NEXT_SRC(src_reg, 1)
     466             : 
     467           0 :       FILTER_SRC(xfilter)
     468             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     469           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     470           0 :       for (i = 0; i < height; i++) {
     471           0 :         src += src_stride;
     472           0 :         LOAD_SRC_DST
     473           0 :         MERGE_NEXT_SRC(src_reg, 1)
     474           0 :         FILTER_SRC(xfilter)
     475           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     476             :         // interleave the previous packed row with the current one
     477           0 :         MERGE_WITH_SRC(src_pack, src_reg)
     478             :         // filter the source
     479           0 :         FILTER_SRC(yfilter)
     480           0 :         src_pack = src_reg;
     481           0 :         CALC_SUM_SSE_INSIDE_LOOP
     482           0 :         dst += dst_stride;
     483             :       }
     484             :     }
     485             :   }
     486           0 :   CALC_SUM_AND_SSE
     487             :   _mm256_zeroupper();
     488           0 :   return sum;
     489             : }
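/* The kernel above always processes a 32-pixel-wide strip of caller-supplied
   height.  One way a wider block can be covered is to run it once per
   32-pixel column and add the partial results; the wrapper below is a sketch
   under that assumption (its name and structure are not taken from this
   file). */
static int sub_pixel_sum_sse_64w_sketch(const uint8_t *src, int src_stride,
                                        int x_offset, int y_offset,
                                        const uint8_t *dst, int dst_stride,
                                        int height, unsigned int *sse) {
  unsigned int sse0 = 0, sse1 = 0;
  const int sum0 = (int)aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, height, &sse0);
  const int sum1 = (int)aom_sub_pixel_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, height,
      &sse1);
  *sse = sse0 + sse1;  // total squared error over the 64-wide block
  return sum0 + sum1;  // total signed difference
}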
     490             : 
     491           0 : unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     492             :     const uint8_t *src, int src_stride, int x_offset, int y_offset,
     493             :     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
     494             :     int height, unsigned int *sse) {
     495             :   __m256i sec_reg;
     496             :   __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
     497             :   __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
     498             :   __m256i zero_reg;
     499             :   int i, sum;
     500           0 :   sum_reg = _mm256_set1_epi16(0);
     501           0 :   sse_reg = _mm256_set1_epi16(0);
     502           0 :   zero_reg = _mm256_set1_epi16(0);
     503             : 
     504             :   // x_offset = 0 and y_offset = 0
     505           0 :   if (x_offset == 0) {
     506           0 :     if (y_offset == 0) {
     507           0 :       for (i = 0; i < height; i++) {
     508           0 :         LOAD_SRC_DST
     509           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     510           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     511           0 :         sec += sec_stride;
     512             :         // expand each byte to 2 bytes
     513           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     514           0 :         CALC_SUM_SSE_INSIDE_LOOP
     515           0 :         src += src_stride;
     516           0 :         dst += dst_stride;
     517             :       }
     518           0 :     } else if (y_offset == 8) {
     519             :       __m256i src_next_reg;
     520           0 :       for (i = 0; i < height; i++) {
     521           0 :         LOAD_SRC_DST
     522           0 :         AVG_NEXT_SRC(src_reg, src_stride)
     523           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     524           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     525           0 :         sec += sec_stride;
     526             :         // expand each byte to 2 bytes
     527           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     528           0 :         CALC_SUM_SSE_INSIDE_LOOP
     529           0 :         src += src_stride;
     530           0 :         dst += dst_stride;
     531             :       }
     532             :       // x_offset = 0 and y_offset = bilin interpolation
     533             :     } else {
     534             :       __m256i filter, pw8, src_next_reg;
     535             : 
     536           0 :       y_offset <<= 5;
     537           0 :       filter = _mm256_load_si256(
     538             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     539           0 :       pw8 = _mm256_set1_epi16(8);
     540           0 :       for (i = 0; i < height; i++) {
     541           0 :         LOAD_SRC_DST
     542           0 :         MERGE_NEXT_SRC(src_reg, src_stride)
     543           0 :         FILTER_SRC(filter)
     544           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     545           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     546           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     547           0 :         sec += sec_stride;
     548           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     549           0 :         CALC_SUM_SSE_INSIDE_LOOP
     550           0 :         src += src_stride;
     551           0 :         dst += dst_stride;
     552             :       }
     553             :     }
     554             :     // x_offset = 8  and y_offset = 0
     555           0 :   } else if (x_offset == 8) {
     556           0 :     if (y_offset == 0) {
     557             :       __m256i src_next_reg;
     558           0 :       for (i = 0; i < height; i++) {
     559           0 :         LOAD_SRC_DST
     560           0 :         AVG_NEXT_SRC(src_reg, 1)
     561           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     562           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     563           0 :         sec += sec_stride;
     564             :         // expand each byte to 2 bytes
     565           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     566           0 :         CALC_SUM_SSE_INSIDE_LOOP
     567           0 :         src += src_stride;
     568           0 :         dst += dst_stride;
     569             :       }
     570             :       // x_offset = 8  and y_offset = 8
     571           0 :     } else if (y_offset == 8) {
     572             :       __m256i src_next_reg, src_avg;
     573             :       // load the source and a second source starting
     574             :       // one byte to the right
     575           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     576           0 :       AVG_NEXT_SRC(src_reg, 1)
     577           0 :       for (i = 0; i < height; i++) {
     578             :         // save current source average
     579           0 :         src_avg = src_reg;
     580           0 :         src += src_stride;
     581           0 :         LOAD_SRC_DST
     582           0 :         AVG_NEXT_SRC(src_reg, 1)
     583             :         // average the previous average with the current one
     584           0 :         src_avg = _mm256_avg_epu8(src_avg, src_reg);
     585           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     586           0 :         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
     587           0 :         sec += sec_stride;
     588             :         // expand each byte to 2 bytes
     589           0 :         MERGE_WITH_SRC(src_avg, zero_reg)
     590           0 :         CALC_SUM_SSE_INSIDE_LOOP
     591           0 :         dst += dst_stride;
     592             :       }
     593             :       // x_offset = 8  and y_offset = bilin interpolation
     594             :     } else {
     595             :       __m256i filter, pw8, src_next_reg, src_avg;
     596           0 :       y_offset <<= 5;
     597           0 :       filter = _mm256_load_si256(
     598             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     599           0 :       pw8 = _mm256_set1_epi16(8);
     600             :       // load the source and a second source starting
     601             :       // one byte to the right
     602           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     603           0 :       AVG_NEXT_SRC(src_reg, 1)
     604           0 :       for (i = 0; i < height; i++) {
     605             :         // save current source average
     606           0 :         src_avg = src_reg;
     607           0 :         src += src_stride;
     608           0 :         LOAD_SRC_DST
     609           0 :         AVG_NEXT_SRC(src_reg, 1)
     610           0 :         MERGE_WITH_SRC(src_avg, src_reg)
     611           0 :         FILTER_SRC(filter)
     612           0 :         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     613           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     614           0 :         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
     615             :         // expand each byte to 2 bytes
     616           0 :         MERGE_WITH_SRC(src_avg, zero_reg)
     617           0 :         sec += sec_stride;
     618           0 :         CALC_SUM_SSE_INSIDE_LOOP
     619           0 :         dst += dst_stride;
     620             :       }
     621             :     }
     622             :     // x_offset = bilin interpolation and y_offset = 0
     623             :   } else {
     624           0 :     if (y_offset == 0) {
     625             :       __m256i filter, pw8, src_next_reg;
     626           0 :       x_offset <<= 5;
     627           0 :       filter = _mm256_load_si256(
     628             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     629           0 :       pw8 = _mm256_set1_epi16(8);
     630           0 :       for (i = 0; i < height; i++) {
     631           0 :         LOAD_SRC_DST
     632           0 :         MERGE_NEXT_SRC(src_reg, 1)
     633           0 :         FILTER_SRC(filter)
     634           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     635           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     636           0 :         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
     637           0 :         MERGE_WITH_SRC(src_reg, zero_reg)
     638           0 :         sec += sec_stride;
     639           0 :         CALC_SUM_SSE_INSIDE_LOOP
     640           0 :         src += src_stride;
     641           0 :         dst += dst_stride;
     642             :       }
     643             :       // x_offset = bilin interpolation and y_offset = 8
     644           0 :     } else if (y_offset == 8) {
     645             :       __m256i filter, pw8, src_next_reg, src_pack;
     646           0 :       x_offset <<= 5;
     647           0 :       filter = _mm256_load_si256(
     648             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     649           0 :       pw8 = _mm256_set1_epi16(8);
     650           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     651           0 :       MERGE_NEXT_SRC(src_reg, 1)
     652           0 :       FILTER_SRC(filter)
     653             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     654           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     655           0 :       for (i = 0; i < height; i++) {
     656           0 :         src += src_stride;
     657           0 :         LOAD_SRC_DST
     658           0 :         MERGE_NEXT_SRC(src_reg, 1)
     659           0 :         FILTER_SRC(filter)
     660           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     661             :         // average the previous packed row with the current one
     662           0 :         src_pack = _mm256_avg_epu8(src_pack, src_reg);
     663           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     664           0 :         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
     665           0 :         sec += sec_stride;
     666           0 :         MERGE_WITH_SRC(src_pack, zero_reg)
     667           0 :         src_pack = src_reg;
     668           0 :         CALC_SUM_SSE_INSIDE_LOOP
     669           0 :         dst += dst_stride;
     670             :       }
     671             :       // x_offset = bilin interpolation and y_offset = bilin interpolation
     672             :     } else {
     673             :       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
     674           0 :       x_offset <<= 5;
     675           0 :       xfilter = _mm256_load_si256(
     676             :           (__m256i const *)(bilinear_filters_avx2 + x_offset));
     677           0 :       y_offset <<= 5;
     678           0 :       yfilter = _mm256_load_si256(
     679             :           (__m256i const *)(bilinear_filters_avx2 + y_offset));
     680           0 :       pw8 = _mm256_set1_epi16(8);
     681             :       // load the source and a second source starting
     682             :       // one byte to the right
     683           0 :       src_reg = _mm256_loadu_si256((__m256i const *)(src));
     684           0 :       MERGE_NEXT_SRC(src_reg, 1)
     685             : 
     686           0 :       FILTER_SRC(xfilter)
     687             :       // pack the 16-bit results back to 8 bits in the low and high lanes
     688           0 :       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     689           0 :       for (i = 0; i < height; i++) {
     690           0 :         src += src_stride;
     691           0 :         LOAD_SRC_DST
     692           0 :         MERGE_NEXT_SRC(src_reg, 1)
     693           0 :         FILTER_SRC(xfilter)
     694           0 :         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     695             :         // interleave the previous packed row with the current one
     696           0 :         MERGE_WITH_SRC(src_pack, src_reg)
     697             :         // filter the source
     698           0 :         FILTER_SRC(yfilter)
     699           0 :         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
     700           0 :         sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
     701           0 :         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
     702           0 :         MERGE_WITH_SRC(src_pack, zero_reg)
     703           0 :         src_pack = src_reg;
     704           0 :         sec += sec_stride;
     705           0 :         CALC_SUM_SSE_INSIDE_LOOP
     706           0 :         dst += dst_stride;
     707             :       }
     708             :     }
     709             :   }
     710           0 :   CALC_SUM_AND_SSE
     711             :   _mm256_zeroupper();
     712           0 :   return sum;
     713             : }
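/* None of the routines in this file performs the final division: they only
   produce the raw signed sum and the raw SSE, and a caller typically derives
   the block variance as SSE - sum^2 / (number of pixels).  A sketch of that
   last step (illustrative only; assumes <stdint.h> is available): */
static unsigned int variance_from_sum_sse_sketch(unsigned int sse, int sum,
                                                 int n_pixels) {
  return sse - (unsigned int)(((int64_t)sum * sum) / n_pixels);
}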

Generated by: LCOV version 1.13