LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - aom_subpixel_8t_intrin_ssse3.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 423 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 25 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
#include <string.h>
#include <tmmintrin.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"
      20             : 
      21             : // filters only for the 4_h8 convolution
      22             : DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
      23             :   0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
      24             : };
      25             : 
      26             : DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
      27             :   4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
      28             : };
      29             : 
      30             : // filters for 8_h8 and 16_h8
      31             : DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
      32             :   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
      33             : };
      34             : 
      35             : DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
      36             :   2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
      37             : };
      38             : 
      39             : DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
      40             :   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
      41             : };
      42             : 
      43             : DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
      44             :   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
      45             : };
      46             : 
      47             : // These are reused by the avx2 intrinsics.
      48             : filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
      49             : filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
      50             : filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
      51             : 
      52           0 : void aom_filter_block1d4_h8_intrin_ssse3(
      53             :     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
      54             :     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
      55             :   __m128i firstFilters, secondFilters, shuffle1, shuffle2;
      56             :   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
      57             :   __m128i addFilterReg64, filtersReg, srcReg, minReg;
      58             :   unsigned int i;
      59             : 
      60             :   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
      61           0 :   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
      62           0 :   filtersReg = _mm_loadu_si128((const __m128i *)filter);
      63             :   // converting the 16 bit (short) to  8 bit (byte) and have the same data
      64             :   // in both lanes of 128 bit register.
      65           0 :   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
      66             : 
      67             :   // duplicate only the first 16 bits in the filter into the first lane
      68           0 :   firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
      69             :   // duplicate only the third 16 bit in the filter into the first lane
      70           0 :   secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
      71             :   // duplicate only the seconds 16 bits in the filter into the second lane
      72             :   // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
      73           0 :   firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
      74             :   // duplicate only the forth 16 bits in the filter into the second lane
      75             :   // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
      76           0 :   secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
      77             : 
      78             :   // loading the local filters
      79           0 :   shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
      80           0 :   shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
      81             : 
      82           0 :   for (i = 0; i < output_height; i++) {
      83           0 :     srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
      84             : 
      85             :     // filter the source buffer
      86           0 :     srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
      87           0 :     srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
      88             : 
      89             :     // multiply 2 adjacent elements with the filter and add the result
      90           0 :     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
      91           0 :     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
      92             : 
      93             :     // extract the higher half of the lane
      94           0 :     srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
      95           0 :     srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
      96             : 
      97           0 :     minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
      98             : 
      99             :     // add and saturate all the results together
     100           0 :     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
     101           0 :     srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
     102           0 :     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
     103           0 :     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
     104           0 :     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
     105             : 
     106             :     // shift by 7 bit each 16 bits
     107           0 :     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
     108             : 
     109             :     // shrink to 8 bit each 16 bits
     110           0 :     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
     111           0 :     src_ptr += src_pixels_per_line;
     112             : 
     113             :     // save only 4 bytes
     114           0 :     *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
     115             : 
     116           0 :     output_ptr += output_pitch;
     117             :   }
     118           0 : }
     119             : 
// Horizontal 8-tap convolution producing 8 output pixels per row.
// Each row reads 16 source bytes starting at src_ptr - 3 (taps span
// src_ptr[-3 .. +4]) and writes 8 filtered bytes to output_ptr.
//
//   src_ptr             source buffer
//   src_pixels_per_line source stride in bytes
//   output_ptr          destination buffer (8 bytes written per row)
//   output_pitch        destination stride in bytes
//   output_height       number of rows to filter
//   filter              8 signed 16-bit filter taps
void aom_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (64 = 1 << 6 is the rounding offset for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // shuffle masks that gather adjacent input pairs for each tap pair
  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    // NOTE(review): the min/max split appears to order the saturating adds
    // so the smaller middle partial sum is accumulated before the larger —
    // presumably so saturation matches a full-precision clamp; confirm
    // against the C reference filter.
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
     196             : 
// Vertical 8-tap convolution producing 8 output pixels per row.
// Keeps a sliding window of 7 source rows (8 bytes each) in registers and
// loads one new row per iteration; taps span rows src_ptr[0] .. src_ptr[7*
// src_pitch] relative to the current position.
//
//   src_ptr        source buffer (top row of the 8-tap window)
//   src_pitch      source stride in bytes
//   output_ptr     destination buffer (8 bytes written per row)
//   out_pitch      destination stride in bytes
//   output_height  number of rows to filter
//   filter         8 signed 16-bit filter taps
void aom_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (64 = 1 << 6 is the rounding offset for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes (8th row of the window)
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together (interleave row pairs so maddubs can
    // multiply-accumulate two taps at a time)
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    // NOTE(review): the min/max split appears to order the saturating adds
    // so the smaller middle partial sum is accumulated before the larger —
    // presumably so saturation matches a full-precision clamp; confirm
    // against the C reference filter.
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row (slide the 7-row window; row 8 becomes row 7)
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
     281             : 
     282             : filter8_1dfunction aom_filter_block1d16_v8_ssse3;
     283             : filter8_1dfunction aom_filter_block1d16_h8_ssse3;
     284             : filter8_1dfunction aom_filter_block1d8_v8_ssse3;
     285             : filter8_1dfunction aom_filter_block1d8_h8_ssse3;
     286             : filter8_1dfunction aom_filter_block1d4_v8_ssse3;
     287             : filter8_1dfunction aom_filter_block1d4_h8_ssse3;
     288             : filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
     289             : filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
     290             : filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
     291             : filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
     292             : filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
     293             : filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
     294             : #if CONFIG_LOOP_RESTORATION
     295             : filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
     296             : filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
     297             : filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
     298             : filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
     299             : filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
     300             : filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
     301             : #endif
     302             : 
     303             : filter8_1dfunction aom_filter_block1d16_v2_ssse3;
     304             : filter8_1dfunction aom_filter_block1d16_h2_ssse3;
     305             : filter8_1dfunction aom_filter_block1d8_v2_ssse3;
     306             : filter8_1dfunction aom_filter_block1d8_h2_ssse3;
     307             : filter8_1dfunction aom_filter_block1d4_v2_ssse3;
     308             : filter8_1dfunction aom_filter_block1d4_h2_ssse3;
     309             : filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
     310             : filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
     311             : filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
     312             : filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
     313             : filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
     314             : filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
     315             : 
     316             : // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
     317             : //                                uint8_t *dst, ptrdiff_t dst_stride,
     318             : //                                const int16_t *filter_x, int x_step_q4,
     319             : //                                const int16_t *filter_y, int y_step_q4,
     320             : //                                int w, int h);
     321             : // void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
     322             : //                               uint8_t *dst, ptrdiff_t dst_stride,
     323             : //                               const int16_t *filter_x, int x_step_q4,
     324             : //                               const int16_t *filter_y, int y_step_q4,
     325             : //                               int w, int h);
     326             : // void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
     327             : //                                    uint8_t *dst, ptrdiff_t dst_stride,
     328             : //                                    const int16_t *filter_x, int x_step_q4,
     329             : //                                    const int16_t *filter_y, int y_step_q4,
     330             : //                                    int w, int h);
     331             : // void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
     332             : //                                   uint8_t *dst, ptrdiff_t dst_stride,
     333             : //                                   const int16_t *filter_x, int x_step_q4,
     334             : //                                   const int16_t *filter_y, int y_step_q4,
     335             : //                                   int w, int h);
     336           0 : FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
     337           0 : FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
     338           0 : FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
     339           0 : FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
     340             :             ssse3);
     341             : 
     342             : #if CONFIG_LOOP_RESTORATION
     343             : FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
     344             :                         ssse3);
     345             : FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
     346             :                         src - src_stride * 3, add_src_, ssse3);
     347             : #endif
     348             : 
// Transposes an 8x8 block of bytes held in the low 8 bytes of in0..in7 into
// the low 8 bytes of out0..out7, via three rounds of widening unpacks.
// Outputs are written only after every input has been read, so in/out
// arguments may alias the same variables.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
  {                                                                       \
    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
                                                                          \
    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
                                                                          \
    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
                                                                          \
    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
  }
     376             : 
// Applies the 8-tap horizontal filter to 8 consecutive rows at once.
// Loads 8 bytes from each of 8 rows (src_x, src_x + src_pitch, ...),
// transposes them so each row's taps sit in one register lane group,
// filters, and stores the 8 results (one per source row) contiguously
// at dst. The caller is responsible for any src_x tap offset.
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // transpose the 8x8 byte block, keeping adjacent byte pairs together
  // (each pair feeds one maddubs multiply-accumulate below)
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  // NOTE(review): min/max orders the saturating adds so the smaller middle
  // partial sum is accumulated first — presumably to mimic a full-precision
  // clamp; confirm against the C reference filter.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  // (mulhrs by 256 == (v * 256 + 0x4000) >> 15 == rounded v >> 7)
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
     433             : 
     434           0 : static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
     435             :                                 uint8_t *dst, ptrdiff_t dst_stride) {
     436             :   __m128i A, B, C, D, E, F, G, H;
     437             : 
     438           0 :   A = _mm_loadl_epi64((const __m128i *)src);
     439           0 :   B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
     440           0 :   C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
     441           0 :   D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
     442           0 :   E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
     443           0 :   F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
     444           0 :   G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
     445           0 :   H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
     446             : 
     447           0 :   TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
     448             : 
     449             :   _mm_storel_epi64((__m128i *)dst, A);
     450           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
     451           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
     452           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
     453           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
     454           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
     455           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
     456           0 :   _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
     457           0 : }
     458             : 
     459           0 : static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
     460             :                                     uint8_t *dst, ptrdiff_t dst_stride,
     461             :                                     const InterpKernel *x_filters, int x0_q4,
     462             :                                     int x_step_q4, int w, int h) {
     463             :   DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
     464             :   int x, y, z;
     465           0 :   src -= SUBPEL_TAPS / 2 - 1;
     466             : 
     467             :   // This function processes 8x8 areas.  The intermediate height is not always
     468             :   // a multiple of 8, so force it to be a multiple of 8 here.
     469           0 :   y = h + (8 - (h & 0x7));
     470             : 
     471             :   do {
     472           0 :     int x_q4 = x0_q4;
     473           0 :     for (x = 0; x < w; x += 8) {
     474             :       // process 8 src_x steps
     475           0 :       for (z = 0; z < 8; ++z) {
     476           0 :         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     477           0 :         const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     478           0 :         if (x_q4 & SUBPEL_MASK) {
     479           0 :           filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
     480             :         } else {
     481             :           int i;
     482           0 :           for (i = 0; i < 8; ++i) {
     483           0 :             temp[z * 8 + i] = src_x[i * src_stride + 3];
     484             :           }
     485             :         }
     486           0 :         x_q4 += x_step_q4;
     487             :       }
     488             : 
     489             :       // transpose the 8x8 filters values back to dst
     490           0 :       transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
     491             :     }
     492             : 
     493           0 :     src += src_stride * 8;
     494           0 :     dst += dst_stride * 8;
     495           0 :   } while (y -= 8);
     496           0 : }
     497             : 
// Horizontally convolve four rows with an 8-tap filter: load 8 consecutive
// pixels from each of 4 rows, transpose so each row's 8-tap window lines up
// for pairwise multiply-add, and write the 4 filtered output pixels (one per
// row) as 4 packed bytes at dst.
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  // Multiplying by 256 with _mm_mulhrs_epi16 yields (x + 64) >> 7, i.e. the
  // rounding shift by the 7-bit filter precision.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // filter must be 16-byte aligned (aligned load of the 8 16-bit taps).
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values; only the low byte of each 16-bit
  // tap is kept, so taps are assumed to fit in 8 bits
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; the middle partial sums are
  // combined via min/max first (NOTE(review): presumably so the 16-bit
  // saturation order matches the reference C filter -- confirm)
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  // NOTE(review): the int-typed store assumes dst is 4-byte aligned; callers
  // in this file pass slices of a 16-byte-aligned scratch buffer, so this
  // holds, but it formally aliases uint8_t storage as int.
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
     558             : 
     559           0 : static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
     560             :                                 uint8_t *dst, ptrdiff_t dst_stride) {
     561           0 :   __m128i A = _mm_cvtsi32_si128(*(const int *)src);
     562           0 :   __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
     563           0 :   __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
     564           0 :   __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
     565             :   // 00 10 01 11 02 12 03 13
     566           0 :   const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
     567             :   // 20 30 21 31 22 32 23 33
     568           0 :   const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
     569             :   // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     570           0 :   A = _mm_unpacklo_epi16(tr0_0, tr0_1);
     571           0 :   B = _mm_srli_si128(A, 4);
     572           0 :   C = _mm_srli_si128(A, 8);
     573           0 :   D = _mm_srli_si128(A, 12);
     574             : 
     575           0 :   *(int *)(dst) = _mm_cvtsi128_si32(A);
     576           0 :   *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
     577           0 :   *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
     578           0 :   *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
     579           0 : }
     580             : 
     581           0 : static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
     582             :                                     uint8_t *dst, ptrdiff_t dst_stride,
     583             :                                     const InterpKernel *x_filters, int x0_q4,
     584             :                                     int x_step_q4, int w, int h) {
     585             :   DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
     586             :   int x, y, z;
     587           0 :   src -= SUBPEL_TAPS / 2 - 1;
     588             : 
     589           0 :   for (y = 0; y < h; y += 4) {
     590           0 :     int x_q4 = x0_q4;
     591           0 :     for (x = 0; x < w; x += 4) {
     592             :       // process 4 src_x steps
     593           0 :       for (z = 0; z < 4; ++z) {
     594           0 :         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     595           0 :         const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     596           0 :         if (x_q4 & SUBPEL_MASK) {
     597           0 :           filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
     598             :         } else {
     599             :           int i;
     600           0 :           for (i = 0; i < 4; ++i) {
     601           0 :             temp[z * 4 + i] = src_x[i * src_stride + 3];
     602             :           }
     603             :         }
     604           0 :         x_q4 += x_step_q4;
     605             :       }
     606             : 
     607             :       // transpose the 4x4 filters values back to dst
     608           0 :       transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
     609             :     }
     610             : 
     611           0 :     src += src_stride * 4;
     612           0 :     dst += dst_stride * 4;
     613             :   }
     614           0 : }
     615             : 
// Vertically convolve 8 rows of 4 pixels with an 8-tap filter, producing 4
// output bytes (one filtered pixel per column) at dst.
static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  // Multiplying by 256 with _mm_mulhrs_epi16 yields (x + 64) >> 7, i.e. the
  // rounding shift by the 7-bit filter precision.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // filter must be 16-byte aligned (aligned load of the 8 16-bit taps).
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values; only the low byte of each 16-bit
  // tap is kept, so taps are assumed to fit in 8 bits
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  // NOTE(review): the int-typed 4-byte loads assume the rows are 4-byte
  // aligned and alias uint8_t storage as int (works on x86, formally UB).
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  // interleave adjacent rows so each 16-bit lane holds a (row n, row n+1)
  // byte pair ready for the pairwise multiply-add below
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; the middle partial sums are
  // combined via min/max first (NOTE(review): presumably so the 16-bit
  // saturation order matches the reference C filter -- confirm)
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  // NOTE(review): int-typed store assumes dst is 4-byte aligned -- verify
  // callers guarantee it.
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
     655             : 
     656           0 : static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
     657             :                                    uint8_t *dst, ptrdiff_t dst_stride,
     658             :                                    const InterpKernel *y_filters, int y0_q4,
     659             :                                    int y_step_q4, int w, int h) {
     660             :   int y;
     661           0 :   int y_q4 = y0_q4;
     662             : 
     663           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     664           0 :   for (y = 0; y < h; ++y) {
     665           0 :     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     666           0 :     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     667             : 
     668           0 :     if (y_q4 & SUBPEL_MASK) {
     669           0 :       filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
     670             :     } else {
     671           0 :       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
     672             :     }
     673             : 
     674           0 :     y_q4 += y_step_q4;
     675             :   }
     676           0 : }
     677             : 
// Vertically convolve 8 rows of 8 pixels with an 8-tap filter, producing 8
// output bytes (one filtered pixel per column) at dst.
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  // Multiplying by 256 with _mm_mulhrs_epi16 yields (x + 64) >> 7, i.e. the
  // rounding shift by the 7-bit filter precision.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // filter must be 16-byte aligned (aligned load of the 8 16-bit taps).
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values; only the low byte of each 16-bit
  // tap is kept, so taps are assumed to fit in 8 bits
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  // load the low 8 bytes of each of the 8 consecutive source rows
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  // interleave adjacent rows so each 16-bit lane holds a (row n, row n+1)
  // byte pair ready for the pairwise multiply-add below
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; the middle partial sums are
  // combined via min/max first (NOTE(review): presumably so the 16-bit
  // saturation order matches the reference C filter -- confirm)
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
     717             : 
     718           0 : static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
     719             :                                    uint8_t *dst, ptrdiff_t dst_stride,
     720             :                                    const InterpKernel *y_filters, int y0_q4,
     721             :                                    int y_step_q4, int w, int h) {
     722             :   int y;
     723           0 :   int y_q4 = y0_q4;
     724             : 
     725           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     726           0 :   for (y = 0; y < h; ++y) {
     727           0 :     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     728           0 :     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     729           0 :     if (y_q4 & SUBPEL_MASK) {
     730           0 :       filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
     731             :     } else {
     732           0 :       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
     733             :     }
     734           0 :     y_q4 += y_step_q4;
     735             :   }
     736           0 : }
     737             : 
// Vertically convolve 8 source rows with an 8-tap filter, producing one
// output row of w bytes; w is consumed in 16-pixel chunks.
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  // Multiplying by 256 with _mm_mulhrs_epi16 yields (x + 64) >> 7, i.e. the
  // rounding shift by the 7-bit filter precision.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // filter must be 16-byte aligned (aligned load of the 8 16-bit taps).
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values; only the low byte of each 16-bit
  // tap is kept, so taps are assumed to fit in 8 bits
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    // unaligned loads of 16 pixels from each of the 8 source rows
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // interleave adjacent rows into (row n, row n+1) byte pairs; lo/hi halves
    // cover pixel columns 0-7 and 8-15 respectively
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together; the middle partial sums are
    // combined via min/max first (NOTE(review): presumably so the 16-bit
    // saturation order matches the reference C filter -- confirm)
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bit each 16 bit
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    // NOTE(review): aligned store assumes dst + i is 16-byte aligned for
    // every chunk -- verify this holds for all callers' dst/dst_stride.
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
     808             : 
     809           0 : static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
     810             :                                     uint8_t *dst, ptrdiff_t dst_stride,
     811             :                                     const InterpKernel *y_filters, int y0_q4,
     812             :                                     int y_step_q4, int w, int h) {
     813             :   int y;
     814           0 :   int y_q4 = y0_q4;
     815             : 
     816           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     817           0 :   for (y = 0; y < h; ++y) {
     818           0 :     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     819           0 :     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     820           0 :     if (y_q4 & SUBPEL_MASK) {
     821           0 :       filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
     822             :                             w);
     823             :     } else {
     824           0 :       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
     825             :     }
     826           0 :     y_q4 += y_step_q4;
     827             :   }
     828           0 : }
     829             : 
     830           0 : static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
     831             :                              uint8_t *dst, ptrdiff_t dst_stride,
     832             :                              const InterpKernel *const x_filters, int x0_q4,
     833             :                              int x_step_q4, const InterpKernel *const y_filters,
     834             :                              int y0_q4, int y_step_q4, int w, int h) {
     835             :   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
     836             :   // 2d filtering proceeds in 2 steps:
     837             :   //   (1) Interpolate horizontally into an intermediate buffer, temp.
     838             :   //   (2) Interpolate temp vertically to derive the sub-pixel result.
     839             :   // Deriving the maximum number of rows in the temp buffer (135):
     840             :   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
     841             :   // --Largest block size is 64x64 pixels.
     842             :   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
     843             :   //   original frame (in 1/16th pixel units).
     844             :   // --Must round-up because block may be located at sub-pixel position.
     845             :   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
     846             :   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
     847             :   // --Require an additional 8 rows for the horiz_w8 transpose tail.
     848             :   DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
     849           0 :   const int intermediate_height =
     850           0 :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
     851             : 
     852           0 :   assert(w <= MAX_SB_SIZE);
     853           0 :   assert(h <= MAX_SB_SIZE);
     854           0 :   assert(y_step_q4 <= 32);
     855           0 :   assert(x_step_q4 <= 32);
     856             : 
     857           0 :   if (w >= 8) {
     858           0 :     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
     859             :                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
     860             :                             x_step_q4, w, intermediate_height);
     861             :   } else {
     862           0 :     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
     863             :                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
     864             :                             x_step_q4, w, intermediate_height);
     865             :   }
     866             : 
     867           0 :   if (w >= 16) {
     868           0 :     scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
     869             :                             MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
     870             :                             y_step_q4, w, h);
     871           0 :   } else if (w == 8) {
     872           0 :     scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
     873             :                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
     874             :                            y_step_q4, w, h);
     875             :   } else {
     876           0 :     scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
     877             :                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
     878             :                            y_step_q4, w, h);
     879             :   }
     880           0 : }
     881             : 
     882           0 : static const InterpKernel *get_filter_base(const int16_t *filter) {
     883             :   // NOTE: This assumes that the filter table is 256-byte aligned.
     884             :   // TODO(agrange) Modify to make independent of table alignment.
     885           0 :   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
     886             : }
     887             : 
     888           0 : static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
     889           0 :   return (int)((const InterpKernel *)(intptr_t)f - base);
     890             : }
     891             : 
     892           0 : void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     893             :                          ptrdiff_t dst_stride, const int16_t *filter_x,
     894             :                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
     895             :                          int w, int h) {
     896           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     897           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     898             : 
     899           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     900           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     901             : 
     902           0 :   scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     903             :                    x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
     904           0 : }
     905             : 
// Macro-generated public entry points with the following signatures:
// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
#if CONFIG_LOOP_RESTORATION
FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
#endif

Generated by: LCOV version 1.13