LCOV - code coverage report
File: third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
Test: output.info    Date: 2017-07-14 16:53:18
Lines: 0 of 213 hit (0.0 %)    Functions: 0 of 5 hit (0.0 %)

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_ports/mem.h"

// shuffle masks for the 16_h8 and 16_v8 filters
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
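
// Each mask above is a _mm256_shuffle_epi8 control that gathers overlapping
// byte pairs from a row of source pixels in the interleaved order
// _mm256_maddubs_epi16 expects: filt1_global_avx2 applied to pixels p0..p15
// yields (p0,p1),(p1,p2),...,(p7,p8) for filter taps 0-1, and filt2/filt3/
// filt4 do the same for tap pairs 2-3, 4-5 and 6-7.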

#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
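
// Older toolchains (gcc <= 4.7 and the clang versions matched above) exposed
// the 128-to-256-bit broadcast under the name _mm_broadcastsi128_si256, in
// some cases taking a pointer argument, so the macro papers over the naming
// differences.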

static void aom_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // fill every 16-bit element with the rounding constant 64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) taps to 8-bit (byte) and replicate the same
  // data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
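  // firstFilters now holds taps 0-1 repeated in every 16-bit element,
  // secondFilters taps 2-3, and so on, matching the byte pairs gathered by
  // the corresponding filtN shuffle masks above.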

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the source and destination strides by two, since two rows are
  // processed per iteration
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from each of the two rows
    srcReg32b1 =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // load the next 16 bytes of both rows (overlapping the earlier loads)
    srcReg32b2 =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
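    // Note: with saturating adds, the order of accumulation matters. The
    // smaller of the two middle products is added first (min) and the larger
    // last (max) so that the 16-bit intermediate sums clip the same way as
    // the reference implementation.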

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

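    // the eight filter taps sum to 128 (1 << 7), so adding 64 here and
    // arithmetic-shifting right by 7 below rounds each result to nearest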
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // pack each 16-bit value to 8 bits with unsigned saturation; the first
    // lane contains the first row's convolve result and the second lane the
    // second row's
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the last remaining row of 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // load the next 16 bytes (overlapping the earlier load)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // round by adding 64, then shift each 16-bit value right by 7 bits
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // pack each 16-bit value to 8 bits with unsigned saturation; the low half
    // holds the first 8 pixels and the high half the next 8
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d16_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // fill every 16-bit element with the rounding constant 64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) taps to 8-bit (byte) and replicate the same
  // data in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // replicate the same data in both lanes of a 256-bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the source and destination strides by two, since two rows are
  // processed per iteration
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes from each of the first 7 rows, one src_pitch apart
  srcReg32b1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256-bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

  // interleave every two consecutive registers except the last one, keeping
  // the results for reuse across loop iterations
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
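  // After the unpacks, each 16-bit element holds two vertically adjacent
  // pixels from the same column, so _mm256_maddubs_epi16 can apply a pair of
  // filter taps down a column in one instruction.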

  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from the next 2 rows and place each new pair of
    // consecutive rows in the same 256-bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // interleave the two new row pairs
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // round by adding 64, then shift each 16-bit value right by 7 bits
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // pack each 16-bit value to 8 bits with unsigned saturation; the first
    // lane contains the first row's convolve result and the second lane the
    // second row's
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // slide the interleaved row pairs down by two rows for the next iteration
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
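  // if output_height is odd, process the last remaining row of 16 bytes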
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // interleave the last 2 rows together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    // round by adding 64, then shift each 16-bit value right by 7 bits
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // pack each 16-bit value to 8 bits with unsigned saturation; the low half
    // holds the first 8 pixels and the high half the next 8
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}

#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction aom_filter_block1d8_v8_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;
#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2);
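
// For reference, each output pixel of the 8-tap routines above is the
// equivalent of the scalar sketch below (an illustration, not part of the
// build; clip_pixel() and ROUND_POWER_OF_TWO() are the helpers from
// aom_dsp/aom_dsp_common.h):
//
//   int k, sum = 0;
//   for (k = 0; k < 8; ++k) sum += (int)src[k * stride] * filter[k];
//   *dst = clip_pixel(ROUND_POWER_OF_TWO(sum, 7));
//
// Horizontally, stride is 1 and src starts 3 pixels left of the output
// position; vertically, stride is the source pitch and src starts 3 rows up.
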
#endif  // HAVE_AVX2 && HAVE_SSSE3

Generated by: LCOV version 1.13