LCOV - code coverage report
File: third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
Test: output.info        Date: 2017-07-14 16:53:18
Lines:     0 of 477 hit (0.0 %)
Functions: 0 of 67 hit (0.0 %)

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <stdlib.h>
      13             : #include <string.h>
      14             : #include <tmmintrin.h>
      15             : 
      16             : #include "./aom_config.h"
      17             : #include "./aom_dsp_rtcd.h"
      18             : #include "aom_dsp/blend.h"
      19             : #include "aom/aom_integer.h"
      20             : #include "aom_ports/mem.h"
      21             : #include "aom_dsp/aom_filter.h"
      22             : #include "aom_dsp/x86/synonyms.h"
      23             : 
      24             : // For width a multiple of 16
      25             : static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
      26             :                             int yoffset, uint8_t *dst, int w, int h);
      27             : 
      28             : static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
      29             :                                int yoffset, uint8_t *dst, int h);
      30             : 
      31             : static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
      32             :                                int yoffset, uint8_t *dst, int h);
      33             : 
      34             : // For width a multiple of 16
      35             : static void masked_variance(const uint8_t *src_ptr, int src_stride,
      36             :                             const uint8_t *a_ptr, int a_stride,
      37             :                             const uint8_t *b_ptr, int b_stride,
      38             :                             const uint8_t *m_ptr, int m_stride, int width,
      39             :                             int height, unsigned int *sse, int *sum_);
      40             : 
      41             : static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
      42             :                                const uint8_t *a_ptr, const uint8_t *b_ptr,
      43             :                                const uint8_t *m_ptr, int m_stride, int height,
      44             :                                unsigned int *sse, int *sum_);
      45             : 
      46             : static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
      47             :                                const uint8_t *a_ptr, const uint8_t *b_ptr,
      48             :                                const uint8_t *m_ptr, int m_stride, int height,
      49             :                                unsigned int *sse, int *sum_);
      50             : 
      51             : #define MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
      52             :   unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(        \
      53             :       const uint8_t *src, int src_stride, int xoffset, int yoffset,   \
      54             :       const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      55             :       const uint8_t *msk, int msk_stride, int invert_mask,            \
      56             :       unsigned int *sse) {                                            \
      57             :     int sum;                                                          \
      58             :     uint8_t temp[(H + 1) * W];                                        \
      59             :                                                                       \
      60             :     bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
      61             :                                                                       \
      62             :     if (!invert_mask)                                                 \
      63             :       masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
      64             :                       msk_stride, W, H, sse, &sum);                   \
      65             :     else                                                              \
      66             :       masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
      67             :                       msk_stride, W, H, sse, &sum);                   \
      68             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));         \
      69             :   }
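
// [Editor's note] Each MASK_SUBPIX_VAR*_SSSE3 macro in this file ends with
// the usual variance identity var = SSE - sum^2 / N over the N = W * H pixel
// differences. A minimal scalar sketch of that last step (hypothetical
// helper, not part of the original file):
//
//   static uint32_t variance_from_moments_sketch(uint32_t sse, int sum,
//                                                int n) {
//     // The 64-bit cast is load-bearing: for a 64x64 block |sum| can reach
//     // 255 * 4096 = 1044480, whose square overflows 32 bits.
//     return sse - (uint32_t)(((int64_t)sum * sum) / n);
//   }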
      70             : 
      71             : #define MASK_SUBPIX_VAR8XH_SSSE3(H)                                           \
      72             :   unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3(                    \
      73             :       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      74             :       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      75             :       const uint8_t *msk, int msk_stride, int invert_mask,                    \
      76             :       unsigned int *sse) {                                                    \
      77             :     int sum;                                                                  \
      78             :     uint8_t temp[(H + 1) * 8];                                                \
      79             :                                                                               \
      80             :     bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H);           \
      81             :                                                                               \
      82             :     if (!invert_mask)                                                         \
      83             :       masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
      84             :                          H, sse, &sum);                                       \
      85             :     else                                                                      \
      86             :       masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
      87             :                          H, sse, &sum);                                       \
      88             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H));                 \
      89             :   }
      90             : 
      91             : #define MASK_SUBPIX_VAR4XH_SSSE3(H)                                           \
      92             :   unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3(                    \
      93             :       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      94             :       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      95             :       const uint8_t *msk, int msk_stride, int invert_mask,                    \
      96             :       unsigned int *sse) {                                                    \
      97             :     int sum;                                                                  \
      98             :     uint8_t temp[(H + 1) * 4];                                                \
      99             :                                                                               \
     100             :     bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);           \
     101             :                                                                               \
     102             :     if (!invert_mask)                                                         \
     103             :       masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
     104             :                          H, sse, &sum);                                       \
     105             :     else                                                                      \
     106             :       masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
     107             :                          H, sse, &sum);                                       \
     108             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
     109             :   }
     110             : 
     111             : #if CONFIG_EXT_PARTITION
     112             : MASK_SUBPIX_VAR_SSSE3(128, 128)
     113             : MASK_SUBPIX_VAR_SSSE3(128, 64)
     114             : MASK_SUBPIX_VAR_SSSE3(64, 128)
     115             : #endif
     116           0 : MASK_SUBPIX_VAR_SSSE3(64, 64)
     117           0 : MASK_SUBPIX_VAR_SSSE3(64, 32)
     118           0 : MASK_SUBPIX_VAR_SSSE3(32, 64)
     119           0 : MASK_SUBPIX_VAR_SSSE3(32, 32)
     120           0 : MASK_SUBPIX_VAR_SSSE3(32, 16)
     121           0 : MASK_SUBPIX_VAR_SSSE3(16, 32)
     122           0 : MASK_SUBPIX_VAR_SSSE3(16, 16)
     123           0 : MASK_SUBPIX_VAR_SSSE3(16, 8)
     124           0 : MASK_SUBPIX_VAR8XH_SSSE3(16)
     125           0 : MASK_SUBPIX_VAR8XH_SSSE3(8)
     126           0 : MASK_SUBPIX_VAR8XH_SSSE3(4)
     127           0 : MASK_SUBPIX_VAR4XH_SSSE3(8)
     128           0 : MASK_SUBPIX_VAR4XH_SSSE3(4)
     129             : 
     130           0 : static INLINE __m128i filter_block(const __m128i a, const __m128i b,
     131             :                                    const __m128i filter) {
     132           0 :   __m128i v0 = _mm_unpacklo_epi8(a, b);
     133           0 :   v0 = _mm_maddubs_epi16(v0, filter);
     134           0 :   v0 = xx_roundn_epu16(v0, FILTER_BITS);
     135             : 
     136           0 :   __m128i v1 = _mm_unpackhi_epi8(a, b);
     137           0 :   v1 = _mm_maddubs_epi16(v1, filter);
     138           0 :   v1 = xx_roundn_epu16(v1, FILTER_BITS);
     139             : 
     140           0 :   return _mm_packus_epi16(v0, v1);
     141             : }
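
// [Editor's note] filter_block() computes, per byte lane,
// out = (a * f0 + b * f1 + 64) >> FILTER_BITS, where the bilinear_filters_2t
// taps satisfy f0 + f1 == 1 << FILTER_BITS (128), so _mm_maddubs_epi16
// cannot saturate (255 * 128 < 32767). A one-lane scalar sketch
// (hypothetical helper):
//
//   static uint8_t filter_pixel_sketch(uint8_t a, uint8_t b, uint8_t f0,
//                                      uint8_t f1) {
//     // Round-to-nearest shift, matching xx_roundn_epu16().
//     return (uint8_t)((a * f0 + b * f1 + (1 << (FILTER_BITS - 1))) >>
//                      FILTER_BITS);
//   }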
     142             : 
     143           0 : static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
     144             :                             int yoffset, uint8_t *dst, int w, int h) {
     145             :   int i, j;
     146             :   // Horizontal filter
     147           0 :   if (xoffset == 0) {
     148           0 :     uint8_t *b = dst;
     149           0 :     for (i = 0; i < h + 1; ++i) {
     150           0 :       for (j = 0; j < w; j += 16) {
     151           0 :         __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
     152           0 :         _mm_storeu_si128((__m128i *)&b[j], x);
     153             :       }
     154           0 :       src += src_stride;
     155           0 :       b += w;
     156             :     }
     157           0 :   } else if (xoffset == 4) {
     158           0 :     uint8_t *b = dst;
     159           0 :     for (i = 0; i < h + 1; ++i) {
     160           0 :       for (j = 0; j < w; j += 16) {
     161           0 :         __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
     162           0 :         __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
     163           0 :         __m128i z = _mm_alignr_epi8(y, x, 1);
     164           0 :         _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
     165             :       }
     166           0 :       src += src_stride;
     167           0 :       b += w;
     168             :     }
     169             :   } else {
     170           0 :     uint8_t *b = dst;
     171           0 :     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     172           0 :     const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
     173           0 :     for (i = 0; i < h + 1; ++i) {
     174           0 :       for (j = 0; j < w; j += 16) {
     175           0 :         const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
     176           0 :         const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
     177           0 :         const __m128i z = _mm_alignr_epi8(y, x, 1);
     178           0 :         const __m128i res = filter_block(x, z, hfilter_vec);
     179           0 :         _mm_storeu_si128((__m128i *)&b[j], res);
     180             :       }
     181             : 
     182           0 :       src += src_stride;
     183           0 :       b += w;
     184             :     }
     185             :   }
     186             : 
     187             :   // Vertical filter
     188           0 :   if (yoffset == 0) {
     189             :     // The data is already in 'dst', so no need to filter
     190           0 :   } else if (yoffset == 4) {
     191           0 :     for (i = 0; i < h; ++i) {
     192           0 :       for (j = 0; j < w; j += 16) {
     193           0 :         __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
     194           0 :         __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
     195           0 :         _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
     196             :       }
     197           0 :       dst += w;
     198             :     }
     199             :   } else {
     200           0 :     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     201           0 :     const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
     202           0 :     for (i = 0; i < h; ++i) {
     203           0 :       for (j = 0; j < w; j += 16) {
     204           0 :         const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
     205           0 :         const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
     206           0 :         const __m128i res = filter_block(x, y, vfilter_vec);
     207           0 :         _mm_storeu_si128((__m128i *)&dst[j], res);
     208             :       }
     209             : 
     210           0 :       dst += w;
     211             :     }
     212             :   }
     213           0 : }
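
// [Editor's note] A scalar reference for the two-pass scheme above
// (hypothetical sketch, assuming FILTER_BITS == 7 and, as in the SIMD paths,
// at least one readable column and row past the block). The horizontal pass
// produces h + 1 rows so the vertical pass can blend adjacent row pairs in
// place; the xoffset == 0 / 4 fast paths fall out of the general formula,
// since their taps are {128, 0} and {64, 64}.
//
//   static void bilinear_filter_sketch(const uint8_t *src, int src_stride,
//                                      int xoffset, int yoffset, uint8_t *dst,
//                                      int w, int h) {
//     const uint8_t *hf = bilinear_filters_2t[xoffset];
//     const uint8_t *vf = bilinear_filters_2t[yoffset];
//     int i, j;
//     // Horizontal pass: h + 1 rows of width w.
//     for (i = 0; i < h + 1; ++i)
//       for (j = 0; j < w; ++j)
//         dst[i * w + j] =
//             (uint8_t)((src[i * src_stride + j] * hf[0] +
//                        src[i * src_stride + j + 1] * hf[1] + 64) >> 7);
//     // Vertical pass, in place: row i blends with row i + 1.
//     for (i = 0; i < h; ++i)
//       for (j = 0; j < w; ++j)
//         dst[i * w + j] = (uint8_t)((dst[i * w + j] * vf[0] +
//                                     dst[(i + 1) * w + j] * vf[1] + 64) >> 7);
//   }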
     214             : 
     215           0 : static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
     216             :                                          const __m128i *a1, const __m128i *b1,
     217             :                                          const __m128i *filter) {
     218           0 :   __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
     219           0 :   v0 = _mm_maddubs_epi16(v0, *filter);
     220           0 :   v0 = xx_roundn_epu16(v0, FILTER_BITS);
     221             : 
     222           0 :   __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
     223           0 :   v1 = _mm_maddubs_epi16(v1, *filter);
     224           0 :   v1 = xx_roundn_epu16(v1, FILTER_BITS);
     225             : 
     226           0 :   return _mm_packus_epi16(v0, v1);
     227             : }
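
// [Editor's note] filter_block_2rows() is the narrow-block sibling of
// filter_block(): with only 8 (or 4) meaningful pixels per row, two rows are
// filtered at once and packed into one 16-byte result, so the 8xh and 4xh
// paths below still fill whole SSE registers.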
     228             : 
     229           0 : static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
     230             :                                int yoffset, uint8_t *dst, int h) {
     231             :   int i;
     232             :   // Horizontal filter
     233           0 :   if (xoffset == 0) {
     234           0 :     uint8_t *b = dst;
     235           0 :     for (i = 0; i < h + 1; ++i) {
     236           0 :       __m128i x = _mm_loadl_epi64((__m128i *)src);
     237             :       _mm_storel_epi64((__m128i *)b, x);
     238           0 :       src += src_stride;
     239           0 :       b += 8;
     240             :     }
     241           0 :   } else if (xoffset == 4) {
     242           0 :     uint8_t *b = dst;
     243           0 :     for (i = 0; i < h + 1; ++i) {
     244           0 :       __m128i x = _mm_loadu_si128((__m128i *)src);
     245           0 :       __m128i z = _mm_srli_si128(x, 1);
     246           0 :       _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
     247           0 :       src += src_stride;
     248           0 :       b += 8;
     249             :     }
     250             :   } else {
     251           0 :     uint8_t *b = dst;
     252           0 :     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     253           0 :     const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
     254           0 :     for (i = 0; i < h; i += 2) {
     255           0 :       const __m128i x0 = _mm_loadu_si128((__m128i *)src);
     256           0 :       const __m128i z0 = _mm_srli_si128(x0, 1);
     257           0 :       const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
     258           0 :       const __m128i z1 = _mm_srli_si128(x1, 1);
     259           0 :       const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
     260             :       _mm_storeu_si128((__m128i *)b, res);
     261             : 
     262           0 :       src += src_stride * 2;
     263           0 :       b += 16;
     264             :     }
      265             :     // Handle i = h separately: the loop above emits two rows per step, but the horizontal pass must produce h + 1 rows
     266           0 :     const __m128i x0 = _mm_loadu_si128((__m128i *)src);
     267           0 :     const __m128i z0 = _mm_srli_si128(x0, 1);
     268             : 
     269           0 :     __m128i v0 = _mm_unpacklo_epi8(x0, z0);
     270           0 :     v0 = _mm_maddubs_epi16(v0, hfilter_vec);
     271           0 :     v0 = xx_roundn_epu16(v0, FILTER_BITS);
     272             : 
     273           0 :     _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
     274             :   }
     275             : 
     276             :   // Vertical filter
     277           0 :   if (yoffset == 0) {
     278             :     // The data is already in 'dst', so no need to filter
     279           0 :   } else if (yoffset == 4) {
     280           0 :     for (i = 0; i < h; ++i) {
     281           0 :       __m128i x = _mm_loadl_epi64((__m128i *)dst);
     282           0 :       __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
     283           0 :       _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
     284           0 :       dst += 8;
     285             :     }
     286             :   } else {
     287           0 :     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     288           0 :     const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
     289           0 :     for (i = 0; i < h; i += 2) {
     290           0 :       const __m128i x = _mm_loadl_epi64((__m128i *)dst);
     291           0 :       const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
     292           0 :       const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
     293           0 :       const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
     294             :       _mm_storeu_si128((__m128i *)dst, res);
     295             : 
     296           0 :       dst += 16;
     297             :     }
     298             :   }
     299           0 : }
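
// [Editor's note] The i += 2 loops above assume h is even, and the
// horizontal pass must emit h + 1 rows, hence the single-row tail after the
// loop. Both assumptions hold for every 8xh instantiation in this file
// (H = 4, 8 or 16).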
     300             : 
     301           0 : static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
     302             :                                int yoffset, uint8_t *dst, int h) {
     303             :   int i;
     304             :   // Horizontal filter
     305           0 :   if (xoffset == 0) {
     306           0 :     uint8_t *b = dst;
     307           0 :     for (i = 0; i < h + 1; ++i) {
     308           0 :       __m128i x = xx_loadl_32((__m128i *)src);
     309           0 :       xx_storel_32((__m128i *)b, x);
     310           0 :       src += src_stride;
     311           0 :       b += 4;
     312             :     }
     313           0 :   } else if (xoffset == 4) {
     314           0 :     uint8_t *b = dst;
     315           0 :     for (i = 0; i < h + 1; ++i) {
     316           0 :       __m128i x = _mm_loadl_epi64((__m128i *)src);
     317           0 :       __m128i z = _mm_srli_si128(x, 1);
     318           0 :       xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
     319           0 :       src += src_stride;
     320           0 :       b += 4;
     321             :     }
     322             :   } else {
     323           0 :     uint8_t *b = dst;
     324           0 :     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     325           0 :     const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
     326           0 :     for (i = 0; i < h; i += 4) {
     327           0 :       const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
     328           0 :       const __m128i z0 = _mm_srli_si128(x0, 1);
     329           0 :       const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
     330           0 :       const __m128i z1 = _mm_srli_si128(x1, 1);
     331           0 :       const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
     332           0 :       const __m128i z2 = _mm_srli_si128(x2, 1);
     333           0 :       const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
     334           0 :       const __m128i z3 = _mm_srli_si128(x3, 1);
     335             : 
     336           0 :       const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
     337           0 :       const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
     338           0 :       const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
     339           0 :       const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
     340           0 :       const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
     341             :       _mm_storeu_si128((__m128i *)b, res);
     342             : 
     343           0 :       src += src_stride * 4;
     344           0 :       b += 16;
     345             :     }
      346             :     // Handle i = h separately: the loop above emits four rows per step, but the horizontal pass must produce h + 1 rows
     347           0 :     const __m128i x = _mm_loadl_epi64((__m128i *)src);
     348           0 :     const __m128i z = _mm_srli_si128(x, 1);
     349             : 
     350           0 :     __m128i v0 = _mm_unpacklo_epi8(x, z);
     351           0 :     v0 = _mm_maddubs_epi16(v0, hfilter_vec);
     352           0 :     v0 = xx_roundn_epu16(v0, FILTER_BITS);
     353             : 
     354           0 :     xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
     355             :   }
     356             : 
     357             :   // Vertical filter
     358           0 :   if (yoffset == 0) {
     359             :     // The data is already in 'dst', so no need to filter
     360           0 :   } else if (yoffset == 4) {
     361           0 :     for (i = 0; i < h; ++i) {
     362           0 :       __m128i x = xx_loadl_32((__m128i *)dst);
     363           0 :       __m128i y = xx_loadl_32((__m128i *)&dst[4]);
     364           0 :       xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
     365           0 :       dst += 4;
     366             :     }
     367             :   } else {
     368           0 :     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     369           0 :     const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
     370           0 :     for (i = 0; i < h; i += 4) {
     371           0 :       const __m128i a = xx_loadl_32((__m128i *)dst);
     372           0 :       const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
     373           0 :       const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
     374           0 :       const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
     375           0 :       const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
     376             : 
     377           0 :       const __m128i a0 = _mm_unpacklo_epi32(a, b);
     378           0 :       const __m128i b0 = _mm_unpacklo_epi32(b, c);
     379           0 :       const __m128i a1 = _mm_unpacklo_epi32(c, d);
     380           0 :       const __m128i b1 = _mm_unpacklo_epi32(d, e);
     381           0 :       const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
     382             :       _mm_storeu_si128((__m128i *)dst, res);
     383             : 
     384           0 :       dst += 16;
     385             :     }
     386             :   }
     387           0 : }
     388             : 
     389           0 : static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
     390             :                                     const __m128i *b, const __m128i *m,
     391             :                                     __m128i *sum, __m128i *sum_sq) {
     392           0 :   const __m128i zero = _mm_setzero_si128();
     393           0 :   const __m128i one = _mm_set1_epi16(1);
     394           0 :   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
     395           0 :   const __m128i m_inv = _mm_sub_epi8(mask_max, *m);
     396             : 
     397             :   // Calculate 16 predicted pixels.
     398             :   // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
     399             :   // is 64 * 255, so we have plenty of space to add rounding constants.
     400           0 :   const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
     401           0 :   const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
     402           0 :   __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
     403           0 :   pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
     404             : 
     405           0 :   const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
     406           0 :   const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
     407           0 :   __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
     408           0 :   pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
     409             : 
     410           0 :   const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
     411           0 :   const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
     412           0 :   const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
     413           0 :   const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
     414             : 
     415             :   // Update partial sums and partial sums of squares
     416           0 :   *sum =
     417           0 :       _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
     418           0 :   *sum_sq =
     419           0 :       _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
     420             :                                            _mm_madd_epi16(diff_r, diff_r)));
     421           0 : }
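
// [Editor's note] Per pixel, accumulate_block() forms the AOM_BLEND_A64
// prediction from 'a', 'b' and the mask, then accumulates the signed
// difference against 'src'. A one-pixel scalar sketch (hypothetical helper,
// assuming AOM_BLEND_A64_ROUND_BITS == 6, i.e. mask_max == 64):
//
//   static void accumulate_pixel_sketch(uint8_t src, uint8_t a, uint8_t b,
//                                       uint8_t m, int *sum,
//                                       uint32_t *sum_sq) {
//     const int pred = (a * m + b * (64 - m) + 32) >> 6;  // blend a, b by m
//     const int diff = pred - src;
//     *sum += diff;
//     *sum_sq += (uint32_t)(diff * diff);
//   }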
     422             : 
     423           0 : static void masked_variance(const uint8_t *src_ptr, int src_stride,
     424             :                             const uint8_t *a_ptr, int a_stride,
     425             :                             const uint8_t *b_ptr, int b_stride,
     426             :                             const uint8_t *m_ptr, int m_stride, int width,
     427             :                             int height, unsigned int *sse, int *sum_) {
     428             :   int x, y;
     429           0 :   __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
     430             : 
     431           0 :   for (y = 0; y < height; y++) {
     432           0 :     for (x = 0; x < width; x += 16) {
     433           0 :       const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
     434           0 :       const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
     435           0 :       const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
     436           0 :       const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
     437           0 :       accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
     438             :     }
     439             : 
     440           0 :     src_ptr += src_stride;
     441           0 :     a_ptr += a_stride;
     442           0 :     b_ptr += b_stride;
     443           0 :     m_ptr += m_stride;
     444             :   }
      445             :   // Reduce to a single sum and sum of squares
     446           0 :   sum = _mm_hadd_epi32(sum, sum_sq);
     447           0 :   sum = _mm_hadd_epi32(sum, sum);
     448           0 :   *sum_ = _mm_cvtsi128_si32(sum);
     449           0 :   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
     450           0 : }
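
// [Editor's note] The reduction above relies on the _mm_hadd_epi32 lane
// layout: hadd(sum, sum_sq) gives { s0+s1, s2+s3, q0+q1, q2+q3 } and a
// second hadd gives { S, Q, S, Q }, so lane 0 holds the difference sum and
// lane 1 (reached via the 4-byte shift) holds the sum of squares. The same
// reduction closes masked_variance8xh() and masked_variance4xh() below.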
     451             : 
     452           0 : static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
     453             :                                const uint8_t *a_ptr, const uint8_t *b_ptr,
     454             :                                const uint8_t *m_ptr, int m_stride, int height,
     455             :                                unsigned int *sse, int *sum_) {
     456             :   int y;
     457           0 :   __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
     458             : 
     459           0 :   for (y = 0; y < height; y += 2) {
     460           0 :     __m128i src = _mm_unpacklo_epi64(
     461             :         _mm_loadl_epi64((const __m128i *)src_ptr),
     462           0 :         _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
     463           0 :     const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
     464           0 :     const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
     465           0 :     const __m128i m =
     466           0 :         _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
     467           0 :                            _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
     468           0 :     accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
     469             : 
     470           0 :     src_ptr += src_stride * 2;
     471           0 :     a_ptr += 16;
     472           0 :     b_ptr += 16;
     473           0 :     m_ptr += m_stride * 2;
     474             :   }
      475             :   // Reduce to a single sum and sum of squares
     476           0 :   sum = _mm_hadd_epi32(sum, sum_sq);
     477           0 :   sum = _mm_hadd_epi32(sum, sum);
     478           0 :   *sum_ = _mm_cvtsi128_si32(sum);
     479           0 :   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
     480           0 : }
     481             : 
     482           0 : static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
     483             :                                const uint8_t *a_ptr, const uint8_t *b_ptr,
     484             :                                const uint8_t *m_ptr, int m_stride, int height,
     485             :                                unsigned int *sse, int *sum_) {
     486             :   int y;
     487           0 :   __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
     488             : 
     489           0 :   for (y = 0; y < height; y += 4) {
     490             :     // Load four rows at a time
     491           0 :     __m128i src =
     492           0 :         _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
     493           0 :                        *(uint32_t *)&src_ptr[src_stride * 2],
     494           0 :                        *(uint32_t *)&src_ptr[src_stride * 3]);
     495           0 :     const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
     496           0 :     const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
     497           0 :     const __m128i m = _mm_setr_epi32(
     498           0 :         *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
     499           0 :         *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
     500           0 :     accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
     501             : 
     502           0 :     src_ptr += src_stride * 4;
     503           0 :     a_ptr += 16;
     504           0 :     b_ptr += 16;
     505           0 :     m_ptr += m_stride * 4;
     506             :   }
      507             :   // Reduce to a single sum and sum of squares
     508           0 :   sum = _mm_hadd_epi32(sum, sum_sq);
     509           0 :   sum = _mm_hadd_epi32(sum, sum);
     510           0 :   *sum_ = _mm_cvtsi128_si32(sum);
     511           0 :   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
     512           0 : }
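
// [Editor's note] The *(uint32_t *) casts above lean on x86 tolerating
// unaligned loads through a cast pointer; a strict-aliasing-safe alternative
// (sketch only; compilers fold the memcpy into one 32-bit load) would be:
//
//   static INLINE uint32_t load_u32_sketch(const uint8_t *p) {
//     uint32_t v;
//     memcpy(&v, p, sizeof(v));  // <string.h> is included at the top
//     return v;
//   }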
     513             : 
     514             : #if CONFIG_HIGHBITDEPTH
     515             : // For width a multiple of 8
     516             : static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
     517             :                                    int xoffset, int yoffset, uint16_t *dst,
     518             :                                    int w, int h);
     519             : 
     520             : static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
     521             :                                       int xoffset, int yoffset, uint16_t *dst,
     522             :                                       int h);
     523             : 
     524             : // For width a multiple of 8
     525             : static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
     526             :                                    const uint16_t *a_ptr, int a_stride,
     527             :                                    const uint16_t *b_ptr, int b_stride,
     528             :                                    const uint8_t *m_ptr, int m_stride,
     529             :                                    int width, int height, uint64_t *sse,
     530             :                                    int *sum_);
     531             : 
     532             : static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
     533             :                                       const uint16_t *a_ptr,
     534             :                                       const uint16_t *b_ptr,
     535             :                                       const uint8_t *m_ptr, int m_stride,
     536             :                                       int height, int *sse, int *sum_);
     537             : 
     538             : #define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H)                                  \
     539             :   unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3(     \
     540             :       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
     541             :       const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
     542             :       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     543             :     uint64_t sse64;                                                         \
     544             :     int sum;                                                                \
     545             :     uint16_t temp[(H + 1) * W];                                             \
     546             :     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     547             :     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
     548             :     const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
     549             :                                                                             \
     550             :     highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
     551             :                                                                             \
     552             :     if (!invert_mask)                                                       \
     553             :       highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
     554             :                              msk_stride, W, H, &sse64, &sum);               \
     555             :     else                                                                    \
     556             :       highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
     557             :                              msk_stride, W, H, &sse64, &sum);               \
     558             :     *sse = (uint32_t)sse64;                                                 \
     559             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
     560             :   }                                                                         \
     561             :   unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(    \
     562             :       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
     563             :       const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
     564             :       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     565             :     uint64_t sse64;                                                         \
     566             :     int sum;                                                                \
     567             :     uint16_t temp[(H + 1) * W];                                             \
     568             :     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     569             :     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
     570             :     const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
     571             :                                                                             \
     572             :     highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
     573             :                                                                             \
     574             :     if (!invert_mask)                                                       \
     575             :       highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
     576             :                              msk_stride, W, H, &sse64, &sum);               \
     577             :     else                                                                    \
     578             :       highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
     579             :                              msk_stride, W, H, &sse64, &sum);               \
     580             :     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                          \
     581             :     sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
     582             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
     583             :   }                                                                         \
     584             :   unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(    \
     585             :       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
     586             :       const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
     587             :       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     588             :     uint64_t sse64;                                                         \
     589             :     int sum;                                                                \
     590             :     uint16_t temp[(H + 1) * W];                                             \
     591             :     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     592             :     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
     593             :     const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
     594             :                                                                             \
     595             :     highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
     596             :                                                                             \
     597             :     if (!invert_mask)                                                       \
     598             :       highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
     599             :                              msk_stride, W, H, &sse64, &sum);               \
     600             :     else                                                                    \
     601             :       highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
     602             :                              msk_stride, W, H, &sse64, &sum);               \
     603             :     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                          \
     604             :     sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
     605             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
     606             :   }
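
// [Editor's note] The 10- and 12-bit variants above rescale the accumulated
// moments back to 8-bit precision before applying the variance identity:
// with d = bit_depth - 8, sse is rounded down by 2 * d bits and sum by d
// bits, i.e.
//   var = ROUND_POWER_OF_TWO(sse, 2 * d)
//         - (ROUND_POWER_OF_TWO(sum, d))^2 / (W * H)
// which is why the 10-bit version shifts by (4, 2) and the 12-bit version
// by (8, 4).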
     607             : 
     608             : #define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H)                                  \
     609             :   unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3(         \
     610             :       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
     611             :       const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
     612             :       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     613             :     int sse_;                                                               \
     614             :     int sum;                                                                \
     615             :     uint16_t temp[(H + 1) * 4];                                             \
     616             :     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     617             :     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
     618             :     const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
     619             :                                                                             \
     620             :     highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
     621             :                                                                             \
     622             :     if (!invert_mask)                                                       \
     623             :       highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
     624             :                                 msk_stride, H, &sse_, &sum);                \
     625             :     else                                                                    \
     626             :       highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
     627             :                                 msk_stride, H, &sse_, &sum);                \
     628             :     *sse = (uint32_t)sse_;                                                  \
     629             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
     630             :   }                                                                         \
     631             :   unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3(        \
     632             :       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
     633             :       const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
     634             :       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     635             :     int sse_;                                                               \
     636             :     int sum;                                                                \
     637             :     uint16_t temp[(H + 1) * 4];                                             \
     638             :     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     639             :     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
     640             :     const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
     641             :                                                                             \
     642             :     highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
     643             :                                                                             \
     644             :     if (!invert_mask)                                                       \
     645             :       highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
     646             :                                 msk_stride, H, &sse_, &sum);                \
     647             :     else                                                                    \
     648             :       highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
     649             :                                 msk_stride, H, &sse_, &sum);                \
     650             :     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4);                           \
     651             :     sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
     652             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
     653             :   }                                                                         \
     654             :   unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3(        \
     655             :       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
     656             :       const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
     657             :       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     658             :     int sse_;                                                               \
     659             :     int sum;                                                                \
     660             :     uint16_t temp[(H + 1) * 4];                                             \
     661             :     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     662             :     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
     663             :     const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
     664             :                                                                             \
     665             :     highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
     666             :                                                                             \
     667             :     if (!invert_mask)                                                       \
     668             :       highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
     669             :                                 msk_stride, H, &sse_, &sum);                \
     670             :     else                                                                    \
     671             :       highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
     672             :                                 msk_stride, H, &sse_, &sum);                \
     673             :     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8);                           \
     674             :     sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
     675             :     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
     676             :   }
     677             : 
     678             : #if CONFIG_EXT_PARTITION
     679             : HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
     680             : HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
     681             : HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
     682             : #endif
     683           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
     684           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
     685           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
     686           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
     687           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
     688           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
     689           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
     690           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
     691           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
     692           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
     693           0 : HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
     694           0 : HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
     695           0 : HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
     696             : 
     697           0 : static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
     698             :                                           const __m128i filter) {
     699           0 :   __m128i v0 = _mm_unpacklo_epi16(a, b);
     700           0 :   v0 = _mm_madd_epi16(v0, filter);
     701           0 :   v0 = xx_roundn_epu32(v0, FILTER_BITS);
     702             : 
     703           0 :   __m128i v1 = _mm_unpackhi_epi16(a, b);
     704           0 :   v1 = _mm_madd_epi16(v1, filter);
     705           0 :   v1 = xx_roundn_epu32(v1, FILTER_BITS);
     706             : 
     707           0 :   return _mm_packs_epi32(v0, v1);
     708             : }
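
// [Editor's note] highbd_filter_block() is the 16-bit analogue of
// filter_block(): pixels up to 4095 make 4095 * 128 overflow 16 bits, so
// _mm_madd_epi16 widens to 32 bits before the rounding shift. A one-lane
// scalar sketch (hypothetical helper):
//
//   static uint16_t highbd_filter_pixel_sketch(uint16_t a, uint16_t b,
//                                              uint8_t f0, uint8_t f1) {
//     return (uint16_t)(((uint32_t)a * f0 + (uint32_t)b * f1 +
//                        (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
//   }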
     709             : 
     710           0 : static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
     711             :                                    int xoffset, int yoffset, uint16_t *dst,
     712             :                                    int w, int h) {
     713             :   int i, j;
     714             :   // Horizontal filter
     715           0 :   if (xoffset == 0) {
     716           0 :     uint16_t *b = dst;
     717           0 :     for (i = 0; i < h + 1; ++i) {
     718           0 :       for (j = 0; j < w; j += 8) {
     719           0 :         __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
     720           0 :         _mm_storeu_si128((__m128i *)&b[j], x);
     721             :       }
     722           0 :       src += src_stride;
     723           0 :       b += w;
     724             :     }
     725           0 :   } else if (xoffset == 4) {
     726           0 :     uint16_t *b = dst;
     727           0 :     for (i = 0; i < h + 1; ++i) {
     728           0 :       for (j = 0; j < w; j += 8) {
     729           0 :         __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
     730           0 :         __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
     731           0 :         __m128i z = _mm_alignr_epi8(y, x, 2);
     732           0 :         _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
     733             :       }
     734           0 :       src += src_stride;
     735           0 :       b += w;
     736             :     }
     737             :   } else {
     738           0 :     uint16_t *b = dst;
     739           0 :     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     740           0 :     const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
     741           0 :     for (i = 0; i < h + 1; ++i) {
     742           0 :       for (j = 0; j < w; j += 8) {
     743           0 :         const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
     744           0 :         const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
     745           0 :         const __m128i z = _mm_alignr_epi8(y, x, 2);
     746           0 :         const __m128i res = highbd_filter_block(x, z, hfilter_vec);
     747           0 :         _mm_storeu_si128((__m128i *)&b[j], res);
     748             :       }
     749             : 
     750           0 :       src += src_stride;
     751           0 :       b += w;
     752             :     }
     753             :   }
     754             : 
     755             :   // Vertical filter
     756           0 :   if (yoffset == 0) {
     757             :     // The data is already in 'dst', so no need to filter
     758           0 :   } else if (yoffset == 4) {
     759           0 :     for (i = 0; i < h; ++i) {
     760           0 :       for (j = 0; j < w; j += 8) {
     761           0 :         __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
     762           0 :         __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
     763           0 :         _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
     764             :       }
     765           0 :       dst += w;
     766             :     }
     767             :   } else {
     768           0 :     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     769           0 :     const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
     770           0 :     for (i = 0; i < h; ++i) {
     771           0 :       for (j = 0; j < w; j += 8) {
     772           0 :         const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
     773           0 :         const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
     774           0 :         const __m128i res = highbd_filter_block(x, y, vfilter_vec);
     775           0 :         _mm_storeu_si128((__m128i *)&dst[j], res);
     776             :       }
     777             : 
     778           0 :       dst += w;
     779             :     }
     780             :   }
     781           0 : }
     782             : 
     783           0 : static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
     784             :                                                 const __m128i *b0,
     785             :                                                 const __m128i *a1,
     786             :                                                 const __m128i *b1,
     787             :                                                 const __m128i *filter) {
     788           0 :   __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
     789           0 :   v0 = _mm_madd_epi16(v0, *filter);
     790           0 :   v0 = xx_roundn_epu32(v0, FILTER_BITS);
     791             : 
     792           0 :   __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
     793           0 :   v1 = _mm_madd_epi16(v1, *filter);
     794           0 :   v1 = xx_roundn_epu32(v1, FILTER_BITS);
     795             : 
     796           0 :   return _mm_packs_epi32(v0, v1);
     797             : }
     798             : 
     799           0 : static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
     800             :                                       int xoffset, int yoffset, uint16_t *dst,
     801             :                                       int h) {
     802             :   int i;
     803             :   // Horizontal filter
     804           0 :   if (xoffset == 0) {
     805           0 :     uint16_t *b = dst;
     806           0 :     for (i = 0; i < h + 1; ++i) {
     807           0 :       __m128i x = _mm_loadl_epi64((__m128i *)src);
     808             :       _mm_storel_epi64((__m128i *)b, x);
     809           0 :       src += src_stride;
     810           0 :       b += 4;
     811             :     }
     812           0 :   } else if (xoffset == 4) {
     813           0 :     uint16_t *b = dst;
     814           0 :     for (i = 0; i < h + 1; ++i) {
     815           0 :       __m128i x = _mm_loadu_si128((__m128i *)src);
     816           0 :       __m128i z = _mm_srli_si128(x, 2);
     817           0 :       _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
     818           0 :       src += src_stride;
     819           0 :       b += 4;
     820             :     }
     821             :   } else {
     822           0 :     uint16_t *b = dst;
     823           0 :     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     824           0 :     const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
     825           0 :     for (i = 0; i < h; i += 2) {
     826           0 :       const __m128i x0 = _mm_loadu_si128((__m128i *)src);
     827           0 :       const __m128i z0 = _mm_srli_si128(x0, 2);
     828           0 :       const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
     829           0 :       const __m128i z1 = _mm_srli_si128(x1, 2);
     830           0 :       const __m128i res =
     831             :           highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
     832             :       _mm_storeu_si128((__m128i *)b, res);
     833             : 
     834           0 :       src += src_stride * 2;
     835           0 :       b += 8;
     836             :     }
      837             :     // Process the final row (i == h) separately: the main loop handles rows in pairs, but h + 1 rows are needed
     838           0 :     __m128i x = _mm_loadu_si128((__m128i *)src);
     839           0 :     __m128i z = _mm_srli_si128(x, 2);
     840             : 
     841           0 :     __m128i v0 = _mm_unpacklo_epi16(x, z);
     842           0 :     v0 = _mm_madd_epi16(v0, hfilter_vec);
     843           0 :     v0 = xx_roundn_epu32(v0, FILTER_BITS);
     844             : 
     845           0 :     _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
     846             :   }
     847             : 
     848             :   // Vertical filter
     849           0 :   if (yoffset == 0) {
     850             :     // The data is already in 'dst', so no need to filter
     851           0 :   } else if (yoffset == 4) {
     852           0 :     for (i = 0; i < h; ++i) {
     853           0 :       __m128i x = _mm_loadl_epi64((__m128i *)dst);
     854           0 :       __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
     855           0 :       _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
     856           0 :       dst += 4;
     857             :     }
     858             :   } else {
     859           0 :     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     860           0 :     const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
     861           0 :     for (i = 0; i < h; i += 2) {
     862           0 :       const __m128i x = _mm_loadl_epi64((__m128i *)dst);
     863           0 :       const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
     864           0 :       const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
     865           0 :       const __m128i res =
     866             :           highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
     867             :       _mm_storeu_si128((__m128i *)dst, res);
     868             : 
     869           0 :       dst += 8;
     870             :     }
     871             :   }
     872           0 : }
     873             : 
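Note how the 4-wide path packs its intermediate rows back to back (b advances by 4, with no padded stride): one 128-bit register then holds two whole rows, which is why the general horizontal branch emits two rows per iteration and the vertical stage produces two output rows from three 64-bit loads. An end-to-end scalar model of the general-offset path (both offsets outside {0, 4}), again assuming FILTER_BITS == 7; the helper name is ours:

    #include <stdint.h>

    /* Illustrative scalar model of the 4xh bilinear path. The
     * horizontal stage emits h + 1 packed rows because the vertical
     * 2-tap filter reads one row beyond the block. */
    static void highbd_bilinear4xh_scalar(const uint16_t *src, int src_stride,
                                          const uint8_t *hf, const uint8_t *vf,
                                          uint16_t *dst, int h) {
      for (int i = 0; i < h + 1; ++i) /* horizontal: h + 1 rows */
        for (int j = 0; j < 4; ++j)
          dst[i * 4 + j] =
              (uint16_t)((src[i * src_stride + j] * hf[0] +
                          src[i * src_stride + j + 1] * hf[1] + 64) >> 7);
      for (int i = 0; i < h; ++i) /* vertical: in place, stride-4 packing */
        for (int j = 0; j < 4; ++j)
          dst[i * 4 + j] =
              (uint16_t)((dst[i * 4 + j] * vf[0] +
                          dst[(i + 1) * 4 + j] * vf[1] + 64) >> 7);
    }

The "final row" epilogue in the SIMD version exists purely because the paired main loop covers an even number of rows while h + 1 is odd.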
     874           0 : static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
     875             :                                    const uint16_t *a_ptr, int a_stride,
     876             :                                    const uint16_t *b_ptr, int b_stride,
     877             :                                    const uint8_t *m_ptr, int m_stride,
     878             :                                    int width, int height, uint64_t *sse,
     879             :                                    int *sum_) {
     880             :   int x, y;
     881             :   // Note on bit widths:
     882             :   // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
     883             :   // so this can be kept as four 32-bit values.
     884             :   // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
     885             :   // so this must be stored as two 64-bit values.
     886           0 :   __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
     887           0 :   const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
     888           0 :   const __m128i round_const =
     889             :       _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
     890           0 :   const __m128i zero = _mm_setzero_si128();
     891             : 
     892           0 :   for (y = 0; y < height; y++) {
     893           0 :     for (x = 0; x < width; x += 8) {
     894           0 :       const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
     895           0 :       const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
     896           0 :       const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
     897           0 :       const __m128i m =
     898           0 :           _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
     899           0 :       const __m128i m_inv = _mm_sub_epi16(mask_max, m);
     900             : 
     901             :       // Calculate 8 predicted pixels.
     902           0 :       const __m128i data_l = _mm_unpacklo_epi16(a, b);
     903           0 :       const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
     904           0 :       __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
     905           0 :       pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
     906             :                               AOM_BLEND_A64_ROUND_BITS);
     907             : 
     908           0 :       const __m128i data_r = _mm_unpackhi_epi16(a, b);
     909           0 :       const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
     910           0 :       __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
     911           0 :       pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
     912             :                               AOM_BLEND_A64_ROUND_BITS);
     913             : 
     914           0 :       const __m128i src_l = _mm_unpacklo_epi16(src, zero);
     915           0 :       const __m128i src_r = _mm_unpackhi_epi16(src, zero);
     916           0 :       __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
     917           0 :       __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
     918             : 
     919             :       // Update partial sums and partial sums of squares
     920           0 :       sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
     921             :       // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
     922             :       // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
     923             :       // So we can re-pack into 16-bit fields and use _mm_madd_epi16
      924             :       // to calculate the squares and partially sum them (demonstrated standalone after the 4xh kernel below).
     925           0 :       const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
     926           0 :       const __m128i prod = _mm_madd_epi16(tmp, tmp);
      927             :       // Then sign-extend to 64 bits and accumulate (each madd lane is a sum of squares, so the sign word is in fact always zero)
     928           0 :       const __m128i sign = _mm_srai_epi32(prod, 31);
     929           0 :       const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
     930           0 :       const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
     931           0 :       sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
     932             :     }
     933             : 
     934           0 :     src_ptr += src_stride;
     935           0 :     a_ptr += a_stride;
     936           0 :     b_ptr += b_stride;
     937           0 :     m_ptr += m_stride;
     938             :   }
     939             :   // Reduce down to a single sum and sum of squares
     940           0 :   sum = _mm_hadd_epi32(sum, zero);
     941           0 :   sum = _mm_hadd_epi32(sum, zero);
     942           0 :   *sum_ = _mm_cvtsi128_si32(sum);
     943           0 :   sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
     944             :   _mm_storel_epi64((__m128i *)sse, sum_sq);
     945           0 : }
     946             : 
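For reference, the arithmetic this kernel vectorizes, assuming AOM_BLEND_A64_ROUND_BITS == 6 from aom_dsp/blend.h (so each mask weight pair m, 64 - m sums to 64); the helper name is ours:

    #include <stdint.h>

    /* Illustrative scalar reference for the masked-variance kernel. */
    static void highbd_masked_variance_scalar(
        const uint16_t *src, int src_stride, const uint16_t *a, int a_stride,
        const uint16_t *b, int b_stride, const uint8_t *m, int m_stride,
        int width, int height, uint64_t *sse, int *sum) {
      int64_t s = 0;
      uint64_t ss = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          /* Blend the two predictors with the 6-bit mask, rounding to
           * nearest, then accumulate the difference against the source. */
          const int pred = (a[x] * m[x] + b[x] * (64 - m[x]) + 32) >> 6;
          const int diff = pred - src[x];
          s += diff;
          ss += (uint64_t)((int64_t)diff * diff);
        }
        src += src_stride;
        a += a_stride;
        b += b_stride;
        m += m_stride;
      }
      *sum = (int)s;
      *sse = ss;
    }

The bit-width note above holds in this form too: |diff| < 2^12, so 128x128 blocks keep |s| below 2^26 (32-bit safe), while ss can approach 2^38 and needs the 64-bit accumulator.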
     947           0 : static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
     948             :                                       const uint16_t *a_ptr,
     949             :                                       const uint16_t *b_ptr,
     950             :                                       const uint8_t *m_ptr, int m_stride,
     951             :                                       int height, int *sse, int *sum_) {
     952             :   int y;
     953             :   // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions).
     954             :   // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
     955             :   // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
     956             :   // So we can safely pack sum_sq into 32-bit fields, which is slightly more
     957             :   // convenient.
     958           0 :   __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
     959           0 :   const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
     960           0 :   const __m128i round_const =
     961             :       _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
     962           0 :   const __m128i zero = _mm_setzero_si128();
     963             : 
     964           0 :   for (y = 0; y < height; y += 2) {
     965           0 :     __m128i src = _mm_unpacklo_epi64(
     966             :         _mm_loadl_epi64((const __m128i *)src_ptr),
     967           0 :         _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
     968           0 :     const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
     969           0 :     const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
     970           0 :     const __m128i m = _mm_unpacklo_epi8(
     971             :         _mm_unpacklo_epi32(
     972           0 :             _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
     973           0 :             _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
     974             :         zero);
     975           0 :     const __m128i m_inv = _mm_sub_epi16(mask_max, m);
     976             : 
     977           0 :     const __m128i data_l = _mm_unpacklo_epi16(a, b);
     978           0 :     const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
     979           0 :     __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
     980           0 :     pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
     981             :                             AOM_BLEND_A64_ROUND_BITS);
     982             : 
     983           0 :     const __m128i data_r = _mm_unpackhi_epi16(a, b);
     984           0 :     const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
     985           0 :     __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
     986           0 :     pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
     987             :                             AOM_BLEND_A64_ROUND_BITS);
     988             : 
     989           0 :     const __m128i src_l = _mm_unpacklo_epi16(src, zero);
     990           0 :     const __m128i src_r = _mm_unpackhi_epi16(src, zero);
     991           0 :     __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
     992           0 :     __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
     993             : 
     994             :     // Update partial sums and partial sums of squares
     995           0 :     sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
     996           0 :     const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
     997           0 :     const __m128i prod = _mm_madd_epi16(tmp, tmp);
     998           0 :     sum_sq = _mm_add_epi32(sum_sq, prod);
     999             : 
    1000           0 :     src_ptr += src_stride * 2;
    1001           0 :     a_ptr += 8;
    1002           0 :     b_ptr += 8;
    1003           0 :     m_ptr += m_stride * 2;
    1004             :   }
     1005             :   // Reduce down to a single sum and sum of squares: the first hadd folds both reductions into one register, leaving the sum in lane 0 and the sum of squares in lane 1
    1006           0 :   sum = _mm_hadd_epi32(sum, sum_sq);
    1007           0 :   sum = _mm_hadd_epi32(sum, zero);
    1008           0 :   *sum_ = _mm_cvtsi128_si32(sum);
    1009           0 :   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
    1010           0 : }
    1011             : 
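Both variance kernels rely on the squaring trick flagged in the "A trick" comment above: every difference fits in 13 signed bits, so the 32-bit lanes can be re-packed to 16 bits and the register multiply-added with itself, yielding pairwise sums of squares in one instruction. A standalone demo with made-up lane values:

    #include <stdint.h>
    #include <stdio.h>
    #include <tmmintrin.h>

    int main(void) {
      const __m128i diff_l = _mm_setr_epi32(-3, 5, -4095, 4095);
      const __m128i diff_r = _mm_setr_epi32(7, -2, 100, -100);
      const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); /* 8 x int16 */
      const __m128i prod = _mm_madd_epi16(tmp, tmp);       /* 4 x int32 */
      int32_t out[4];
      _mm_storeu_si128((__m128i *)out, prod);
      /* Expect 34, 33538050, 53, 20000: pairwise sums of squares. */
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
      return 0;
    }

The worst case per lane is 2 * 4095^2 < 2^31, so the products never overflow a signed 32-bit lane; only the running accumulation needs widening, which the 8-wide kernel does with its sign-extend-and-add-epi64 sequence and the 4xh kernel avoids entirely thanks to its small block area.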
    1012             : #endif

Generated by: LCOV version 1.13