/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"

// filters for 16_h8 and 16_v8
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
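
// Shuffle masks for _mm256_shuffle_epi8: within each 128-bit lane, filtN
// gathers the overlapping source-byte pairs starting at byte offset
// 2 * (N - 1), e.g. filt1 yields {s0,s1},{s1,s2},...,{s7,s8}, so that
// _mm256_maddubs_epi16 can multiply two horizontally adjacent pixels by one
// pair of filter taps and sum them in a single instruction.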

#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
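
// MM256_BROADCASTSI128_SI256(x) broadcasts a 128-bit register into both
// 128-bit lanes of a 256-bit register (vbroadcasti128); the memory-operand
// fallbacks above work around older gcc/clang versions that exposed the
// intrinsic under a different name or signature.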

static void vpx_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with sixteen 16-bit lanes of 64 (the rounding
  // offset applied before the final >> 7)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
  // them in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
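
  // For an 8-tap filter {f0, ..., f7}, the four registers above now hold
  // the byte pairs {f0,f1}, {f2,f3}, {f4,f5} and {f6,f7}, each repeated
  // across every 16-bit lane, ready to pair with the shuffled source bytes
  // in _mm256_maddubs_epi16.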

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load two rows of the source
    srcReg32b1 =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

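    // The two middle tap products are accumulated via min first, then max:
    // with saturating 16-bit adds the order of accumulation matters, and
    // summing the smaller product before the larger one presumably keeps
    // intermediate saturation consistent with the reference implementation.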
    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // load the next 16 bytes of both rows
    // (part of this range was covered by the earlier loads)
    srcReg32b2 =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // pack each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane contains the second row's
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if the number of rows is odd, process the remaining single row of
  // 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // load the next 16 bytes
    // (part of this range was covered by the earlier load)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));

    // shift each 16-bit value right by 7 bits
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // pack each 16-bit value to 8 bits; the low half contains the result
    // for the first 8 pixels and the high half the result for the next 8
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
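
// A hypothetical usage sketch (not part of the library): filter 16 output
// pixels on each of `h` rows. `filter` points at 8 int16 taps that sum to
// 128 and individually fit in int8 (the packs_epi16 above saturates larger
// taps). The kernel reads bytes [src - 3, src + 20] of every row, and each
// output row must be 16-byte aligned for the _mm_store_si128 stores:
//   vpx_filter_block1d16_h8_avx2(src, src_stride, dst, dst_stride, h,
//                                filter);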

static void vpx_filter_block1d16_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with sixteen 16-bit lanes of 64 (the rounding
  // offset applied before the final >> 7)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
  // them in both halves of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes from each of the first 7 rows
  srcReg32b1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256-bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

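  // Byte-interleaving rows i and i+1 below places vertically adjacent
  // pixels side by side, so a single _mm256_maddubs_epi16 can multiply a
  // column pair by one pair of filter taps and sum them, mirroring the
  // shuffle + maddubs pattern of the horizontal kernel.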
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows of 16 bytes and place every two
    // consecutive rows in the same 256-bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // pack each 16-bit value to 8 bits; the first lane contains the first
    // row's convolve result and the second lane contains the second row's
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // slide the window of interleaved row pairs down by two rows for the
    // next iteration
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 rows together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // pack each 16-bit value to 8 bits; the low half contains the result
    // for the first 8 pixels and the high half the result for the next 8
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}

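// Only the 16-pixel-wide 8-tap kernels above have AVX2 implementations; the
// narrower widths and the 2-tap (bilinear) cases are mapped onto their
// SSSE3 counterparts below.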
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
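// FUN_CONV_1D and FUN_CONV_2D are defined in vpx_dsp/x86/convolve.h; the
// 1-D variant expands to the vpx_convolve8_horiz_avx2 and
// vpx_convolve8_vert_avx2 wrappers prototyped above, which broadly walk the
// block in column strips and dispatch to the
// vpx_filter_block1d{16,8,4}_{h,v}{8,2}_avx2 kernels by strip width and by
// whether the filter is 8-tap or 2-tap (bilinear).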
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3
