LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - masked_sad_intrin_ssse3.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 188 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 31 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <stdio.h>
      13             : #include <tmmintrin.h>
      14             : 
      15             : #include "./aom_config.h"
      16             : #include "./aom_dsp_rtcd.h"
      17             : #include "aom_dsp/blend.h"
      18             : #include "aom/aom_integer.h"
      19             : #include "aom_dsp/x86/synonyms.h"
      20             : 
// For width a multiple of 16
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

// Specialization for 8-wide blocks (processes two rows per iteration).
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height);

// Specialization for 4-wide blocks (processes two rows per iteration).
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height);
      38             : 
// Expands to the public entry point aom_masked_sad<m>x<n>_ssse3().
// 'second_pred' is a contiguous m-wide prediction block, hence stride 'm'.
// Mask inversion is implemented by swapping the roles of 'ref' and
// 'second_pred' rather than by transforming the mask values, since the
// kernel weights its two inputs by m and (64 - m) respectively.
#define MASKSADMXN_SSSE3(m, n)                                                \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred,  \
                              m, msk, msk_stride, m, n);                      \
    else                                                                      \
      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
                              ref_stride, msk, msk_stride, m, n);             \
  }
      51             : 
// Expands to aom_masked_sad8x<n>_ssse3(); same invert-by-swapping scheme as
// MASKSADMXN_SSSE3 but dispatching to the 8-wide specialized kernel.
#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,            \
                                 second_pred, 8, msk, msk_stride, n);         \
    else                                                                      \
      return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,        \
                                 ref_stride, msk, msk_stride, n);             \
  }
      64             : 
// Expands to aom_masked_sad4x<n>_ssse3(); same invert-by-swapping scheme as
// MASKSADMXN_SSSE3 but dispatching to the 4-wide specialized kernel.
#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,            \
                                 second_pred, 4, msk, msk_stride, n);         \
    else                                                                      \
      return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,        \
                                 ref_stride, msk, msk_stride, n);             \
  }
      77             : 
// Instantiate one aom_masked_sad<w>x<h>_ssse3() wrapper per supported
// block size.
#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
      96             : 
      97           0 : static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
      98             :                                             int src_stride,
      99             :                                             const uint8_t *a_ptr, int a_stride,
     100             :                                             const uint8_t *b_ptr, int b_stride,
     101             :                                             const uint8_t *m_ptr, int m_stride,
     102             :                                             int width, int height) {
     103             :   int x, y;
     104           0 :   __m128i res = _mm_setzero_si128();
     105           0 :   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
     106             : 
     107           0 :   for (y = 0; y < height; y++) {
     108           0 :     for (x = 0; x < width; x += 16) {
     109           0 :       const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
     110           0 :       const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
     111           0 :       const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
     112           0 :       const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
     113           0 :       const __m128i m_inv = _mm_sub_epi8(mask_max, m);
     114             : 
     115             :       // Calculate 16 predicted pixels.
     116             :       // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
     117             :       // is 64 * 255, so we have plenty of space to add rounding constants.
     118           0 :       const __m128i data_l = _mm_unpacklo_epi8(a, b);
     119           0 :       const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
     120           0 :       __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
     121           0 :       pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
     122             : 
     123           0 :       const __m128i data_r = _mm_unpackhi_epi8(a, b);
     124           0 :       const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
     125           0 :       __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
     126           0 :       pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
     127             : 
     128           0 :       const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
     129           0 :       res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
     130             :     }
     131             : 
     132           0 :     src_ptr += src_stride;
     133           0 :     a_ptr += a_stride;
     134           0 :     b_ptr += b_stride;
     135           0 :     m_ptr += m_stride;
     136             :   }
     137             :   // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
     138           0 :   int32_t sad =
     139           0 :       _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
     140           0 :   return (sad + 31) >> 6;
     141             : }
     142             : 
// Masked SAD for 8-wide blocks, two rows per loop iteration. The two rows'
// mask bytes are packed into one register (low 64 bits = row 0, high 64
// bits = row 1), while the prediction rows stay in separate registers.
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    // Row 0: interleave pixels with mask weights, blend, round.
    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);  // row 0 weights
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    // Row 1: note unpack*lo* for the pixels (row 1 data sits in the low
    // half of a1/b1) but unpack*hi* for the mask (row 1 weights sit in the
    // high half of m/m_inv) -- the asymmetry is intentional.
    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);  // row 1 weights
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // Two 32-bit partial SADs, one in each 64-bit half of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
     186             : 
     187           0 : static INLINE unsigned int masked_sad4xh_ssse3(
     188             :     const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
     189             :     const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
     190             :     int height) {
     191             :   int y;
     192           0 :   __m128i res = _mm_setzero_si128();
     193           0 :   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
     194             : 
     195           0 :   for (y = 0; y < height; y += 2) {
     196             :     // Load two rows at a time, this seems to be a bit faster
     197             :     // than four rows at a time in this case.
     198           0 :     const __m128i src = _mm_unpacklo_epi32(
     199           0 :         _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
     200           0 :         _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
     201           0 :     const __m128i a =
     202           0 :         _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
     203           0 :                            _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
     204           0 :     const __m128i b =
     205           0 :         _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
     206           0 :                            _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
     207           0 :     const __m128i m =
     208           0 :         _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
     209           0 :                            _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
     210           0 :     const __m128i m_inv = _mm_sub_epi8(mask_max, m);
     211             : 
     212           0 :     const __m128i data = _mm_unpacklo_epi8(a, b);
     213           0 :     const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
     214           0 :     __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
     215           0 :     pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
     216             : 
     217           0 :     const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
     218           0 :     res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
     219             : 
     220           0 :     src_ptr += src_stride * 2;
     221           0 :     a_ptr += a_stride * 2;
     222           0 :     b_ptr += b_stride * 2;
     223           0 :     m_ptr += m_stride * 2;
     224             :   }
     225             :   // At this point, the SAD is stored in lane 0 of 'res'
     226           0 :   int32_t sad = _mm_cvtsi128_si32(res);
     227           0 :   return (sad + 31) >> 6;
     228             : }
     229             : 
     230             : #if CONFIG_HIGHBITDEPTH
// For width a multiple of 8
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height);

// Specialization for 4-wide high-bitdepth blocks (two rows per iteration).
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height);
     241             : 
// Expands to aom_highbd_masked_sad<m>x<n>_ssse3(). The uint8_t* arguments
// carry high-bitdepth data (converted to uint16_t* inside the kernel).
// As in the lowbd wrappers, mask inversion swaps 'ref8' and 'second_pred8'.
#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
      int msk_stride, int invert_mask) {                                      \
    if (!invert_mask)                                                         \
      return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride,      \
                                     second_pred8, m, msk, msk_stride, m, n); \
    else                                                                      \
      return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
                                     ref_stride, msk, msk_stride, m, n);      \
  }
     254             : 
// Expands to aom_highbd_masked_sad4x<n>_ssse3(); dispatches to the 4-wide
// high-bitdepth kernel with the same invert-by-swapping scheme.
#define HIGHBD_MASKSAD4XN_SSSE3(n)                                             \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                             \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,                \
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,         \
      int msk_stride, int invert_mask) {                                       \
    if (!invert_mask)                                                          \
      return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride,    \
                                        second_pred8, 4, msk, msk_stride, n);  \
    else                                                                       \
      return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4,     \
                                        ref8, ref_stride, msk, msk_stride, n); \
  }
     267             : 
// Instantiate one aom_highbd_masked_sad<w>x<h>_ssse3() wrapper per
// supported block size.
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
     286             : 
     287           0 : static INLINE unsigned int highbd_masked_sad_ssse3(
     288             :     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
     289             :     const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
     290             :     int width, int height) {
     291           0 :   const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
     292           0 :   const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
     293           0 :   const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
     294             :   int x, y;
     295           0 :   __m128i res = _mm_setzero_si128();
     296           0 :   const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
     297           0 :   const __m128i round_const =
     298             :       _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
     299           0 :   const __m128i one = _mm_set1_epi16(1);
     300             : 
     301           0 :   for (y = 0; y < height; y++) {
     302           0 :     for (x = 0; x < width; x += 8) {
     303           0 :       const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
     304           0 :       const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
     305           0 :       const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
     306             :       // Zero-extend mask to 16 bits
     307           0 :       const __m128i m = _mm_unpacklo_epi8(
     308           0 :           _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
     309           0 :       const __m128i m_inv = _mm_sub_epi16(mask_max, m);
     310             : 
     311           0 :       const __m128i data_l = _mm_unpacklo_epi16(a, b);
     312           0 :       const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
     313           0 :       __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
     314           0 :       pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
     315             :                               AOM_BLEND_A64_ROUND_BITS);
     316             : 
     317           0 :       const __m128i data_r = _mm_unpackhi_epi16(a, b);
     318           0 :       const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
     319           0 :       __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
     320           0 :       pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
     321             :                               AOM_BLEND_A64_ROUND_BITS);
     322             : 
     323             :       // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
     324             :       // so it is safe to do signed saturation here.
     325           0 :       const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
     326             :       // There is no 16-bit SAD instruction, so we have to synthesize
     327             :       // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
     328             :       // and accumulating them at the end
     329           0 :       const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
     330           0 :       res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
     331             :     }
     332             : 
     333           0 :     src_ptr += src_stride;
     334           0 :     a_ptr += a_stride;
     335           0 :     b_ptr += b_stride;
     336           0 :     m_ptr += m_stride;
     337             :   }
     338             :   // At this point, we have four 32-bit partial SADs stored in 'res'.
     339           0 :   res = _mm_hadd_epi32(res, res);
     340           0 :   res = _mm_hadd_epi32(res, res);
     341           0 :   int sad = _mm_cvtsi128_si32(res);
     342           0 :   return (sad + 31) >> 6;
     343             : }
     344             : 
// High-bitdepth masked SAD for 4-wide blocks, two rows per loop iteration.
// Both rows of each 16-bit input fit in one register; the two rows of
// 8-bit mask values are gathered and zero-extended together.
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    // Blend row 0 (low half): (a * m + b * (64 - m) + 32) >> 6.
    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    // Blend row 1 (high half) the same way.
    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    // No 16-bit SAD instruction: pack, take |pred - src|, then pair-sum
    // via madd into four 32-bit partial SADs.
    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // Collapse the four partial SADs down to a single total.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
     403             : 
     404             : #endif

Generated by: LCOV version 1.13