/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>   // assert() in scaledconvolve2d()
#include <string.h>   // memcpy() in the scaledconvolve_vert_* helpers
#include <tmmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
// byte shuffle masks used only by the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};

// byte shuffle masks for 8_h8 and 16_h8
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
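
// A minimal worked example (illustrative note, not from the upstream
// sources) of how these masks feed _mm_maddubs_epi16. _mm_shuffle_epi8 with
// filt1_global gathers overlapping byte pairs from the 16 loaded source
// bytes s0..s15:
//
//   shuffled = s0 s1  s1 s2  s2 s3  s3 s4  s4 s5  s5 s6  s6 s7  s7 s8
//
// maddubs with the broadcast tap pair k0 k1 then yields eight 16-bit sums
// s0*k0 + s1*k1, s1*k0 + s2*k1, ..., s7*k0 + s8*k1, i.e. the k0/k1
// contribution of eight neighboring output pixels. filt2/3/4_global do the
// same for tap pairs k2/k3, k4/k5 and k6/k7, so the four maddubs results
// only need to be summed lane-by-lane to complete the 8-tap filter.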

// These are reused by the avx2 intrinsics.
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;

void vpx_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8 bit (byte) and duplicate
  // them in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits of the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits of the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits of the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits of the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the local shuffle masks
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the upper half of each register
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // pack each 16-bit value down to 8 bits with unsigned saturation
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
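
// For reference, a scalar sketch (illustrative only, not upstream code;
// name and helper are hypothetical) of what one loop iteration above
// computes per output pixel: an 8-tap dot product over the window starting
// 3 pixels left of the output position, rounded, shifted by 7 bits and
// clamped to 8 bits, matching addFilterReg64 + _mm_srai_epi16 +
// _mm_packus_epi16 (and ignoring the 16-bit intermediate saturation
// discussed after the next function).
static uint8_t convolve8_pixel_sketch(const uint8_t *src,
                                      const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k - 3] * filter[k];
  sum = (sum + 64) >> 7;  // round and shift by the 7 filter fraction bits
  if (sum < 0) sum = 0;   // clamp like _mm_packus_epi16
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}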

void vpx_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8 bit (byte) and duplicate
  // them in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 128-bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 128-bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // pack each 16-bit value down to 8 bits with unsigned saturation
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
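
// A note on the min/max handling of the partial sums in these functions
// (illustrative explanation, not from the upstream sources): the four
// maddubs results are combined with 16-bit saturating adds, and an unlucky
// addition order could saturate an intermediate sum even when the true
// total is representable. Starting from the two outer-tap sums, then adding
// min() of the two middle-tap sums before max(), keeps the running total as
// small as possible at every step, so saturation only kicks in when the
// final result would clamp anyway.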

void vpx_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) coefficients to 8 bit (byte) and duplicate
  // them in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits of the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits of the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits of the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits of the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the eighth row of 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // interleave adjacent rows
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // interleave adjacent rows
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // pack each 16-bit value down to 8 bits with unsigned saturation
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only the 8-byte convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
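
// Worth noting about the loop structure above (descriptive note): the seven
// row registers are rotated at the bottom of each iteration
// (srcReg1 = srcReg2, ...), so every trip loads exactly one new row,
// srcReg8, instead of re-reading all eight rows — the standard sliding
// window of a column-wise convolution.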

filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;

filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;

// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);
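
// FUN_CONV_1D lives in vpx_dsp/x86/convolve.h; the pseudo-code below is a
// rough sketch of the dispatcher each invocation above generates (see the
// header for the real expansion, which this note only approximates). Each
// one emits a vpx_convolve8_{horiz,vert,avg_horiz,avg_vert}_ssse3() entry
// point along these lines:
//
//   if (step_q4 == 16) {                       // unscaled call
//     if (the kernel has taps beyond the middle pair)   // 8-tap path
//       vpx_filter_block1d{16,8,4}_{h,v}8[_avg]_ssse3(...);
//     else                                              // bilinear path
//       vpx_filter_block1d{16,8,4}_{h,v}2[_avg]_ssse3(...);
//   } else {
//     fall back to the C implementation;       // scaled call
//   }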

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
  {                                                                       \
    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
                                                                          \
    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
                                                                          \
    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
                                                                          \
    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
  }
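
// Note (descriptive): only the low 8 bytes of each out register are
// meaningful. The final unpacks pair each tr2 register with itself, so
// out0/out1 hold the low and high 64-bit halves of tr2_0 duplicated into
// both lanes, and transpose8x8_to_dst() below stores just the low 8 bytes
// of each with _mm_storel_epi64.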

static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit value right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // pack each 16-bit value down to 8 bits with unsigned saturation
  temp = _mm_packus_epi16(temp, temp);
  // save only the 8-byte convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
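
// Rounding note (descriptive): _mm_mulhrs_epi16(x, k_256) computes
// (x * 256 + (1 << 14)) >> 15 per 16-bit lane, which is exactly
// (x + 64) >> 7 — the same round-and-shift the intrinsics earlier in this
// file spell out as addFilterReg64 + _mm_srai_epi16(., 7), folded into a
// single instruction. For example, x = 100 gives (100 + 64) >> 7 = 1 and
// (25600 + 16384) >> 15 = 1.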

static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A, B, C, D, E, F, G, H;

  A = _mm_loadl_epi64((const __m128i *)src);
  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));

  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);

  _mm_storel_epi64((__m128i *)dst, A);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
}

static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filter values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
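
// A worked example of the height round-up above (descriptive note):
// y = h + (8 - (h & 7)) is deliberately generous and adds a full extra
// 8-row pass when h is already a multiple of 8 (e.g. h = 16 gives y = 24).
// The temp buffer in scaledconvolve2d() reserves (135 + 8) rows precisely
// so this tail overrun stays in bounds.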

static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit value right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // pack each 16-bit value down to 8 bits with unsigned saturation
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}

static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filter values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}
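
// A worked example (descriptive note) of the Q4 fixed-point stepping used
// by both horizontal scalers: with x_step_q4 = 24 (a 2/3 downscale) and
// x0_q4 = 0, x_q4 takes the values 0, 24, 48, 72, ...;
// x_q4 >> SUBPEL_BITS (i.e. /16) selects source pixel 0, 1, 3, 4, ... and
// x_q4 & SUBPEL_MASK selects the 1/16th-pel phase 0, 8, 0, 8, ..., which
// indexes one of the 16 kernels in x_filters. A phase of 0 means the sample
// lands on an integer position, hence the plain copy branch
// (src_x[i * src_stride + 3] re-centers the window that was shifted left by
// SUBPEL_TAPS / 2 - 1 = 3).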

static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit value right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // pack each 16-bit value down to 8 bits with unsigned saturation
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit value right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // pack each 16-bit value down to 8 bits with unsigned saturation
  temp = _mm_packus_epi16(temp, temp);
  // save only the 8-byte convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // interleave adjacent rows
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // interleave adjacent rows
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // interleave adjacent rows
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift each 16-bit value right by 7 bits
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // pack each 16-bit value down to 8 bits; the low lane holds the first
    // eight convolve results and the high lane the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save the 16-byte convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
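
// Structural note (descriptive): filter_vert_w16_ssse3() widens the 8-wide
// pattern to 16 pixels by unpacking each pair of rows into low and high
// byte halves and running the maddubs/min/max/adds pipeline twice; the
// final _mm_packus_epi16 then reassembles the two 8-pixel results into one
// 16-byte store.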

static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *y_filters, int y0_q4,
                                    int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_q4,
                             int x_step_q4, const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  }
}
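
// Spelling out the 135-row bound from the comment above (descriptive note):
// with h = 64, y_step_q4 = 32 and the worst-case y0_q4 = 15, the last row
// read sits at ((64 - 1) * 32 + 15) / 16 = 126.9..., i.e. row 127 after
// rounding up, and the 8-tap filter needs SUBPEL_TAPS = 8 extra rows of
// context: 127 + 8 = 135. The extra "+ 8" in temp[(135 + 8) * 64] covers
// the horizontal pass's 8-row tail (see scaledconvolve_horiz_w8).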

void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
}
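
// Note on the helpers used above (descriptive, see vpx_dsp/vpx_filter.h for
// the actual definitions): get_filter_base() recovers the 16-kernel
// InterpKernel table that a single kernel pointer belongs to, and
// get_filter_offset() returns that kernel's index within the table; the
// index becomes the initial 1/16th-pel phase (x0_q4 / y0_q4) fed to
// scaledconvolve2d() above.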

// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
