LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - blend_a64_mask_sse4.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 436 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 38 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <smmintrin.h>  // SSE4.1
      13             : 
      14             : #include <assert.h>
      15             : 
      16             : #include "aom/aom_integer.h"
      17             : #include "aom_ports/mem.h"
      18             : #include "aom_dsp/aom_dsp_common.h"
      19             : #include "aom_dsp/blend.h"
      20             : 
      21             : #include "aom_dsp/x86/synonyms.h"
      22             : #include "aom_dsp/x86/blend_sse4.h"
      23             : 
      24             : #include "./aom_dsp_rtcd.h"
      25             : 
      26             : //////////////////////////////////////////////////////////////////////////////
      27             : // No sub-sampling
      28             : //////////////////////////////////////////////////////////////////////////////
      29             : 
      30           0 : static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
      31             :                                      const uint8_t *src0, uint32_t src0_stride,
      32             :                                      const uint8_t *src1, uint32_t src1_stride,
      33             :                                      const uint8_t *mask, uint32_t mask_stride,
      34             :                                      int h, int w) {
      35           0 :   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
      36             : 
      37             :   (void)w;
      38             : 
      39             :   do {
      40           0 :     const __m128i v_m0_b = xx_loadl_32(mask);
      41           0 :     const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      42           0 :     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
      43             : 
      44           0 :     const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
      45             : 
      46           0 :     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
      47             : 
      48           0 :     xx_storel_32(dst, v_res_b);
      49             : 
      50           0 :     dst += dst_stride;
      51           0 :     src0 += src0_stride;
      52           0 :     src1 += src1_stride;
      53           0 :     mask += mask_stride;
      54           0 :   } while (--h);
      55           0 : }
      56             : 
      57           0 : static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
      58             :                                      const uint8_t *src0, uint32_t src0_stride,
      59             :                                      const uint8_t *src1, uint32_t src1_stride,
      60             :                                      const uint8_t *mask, uint32_t mask_stride,
      61             :                                      int h, int w) {
      62           0 :   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
      63             : 
      64             :   (void)w;
      65             : 
      66             :   do {
      67           0 :     const __m128i v_m0_b = xx_loadl_64(mask);
      68           0 :     const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      69           0 :     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
      70             : 
      71           0 :     const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
      72             : 
      73           0 :     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
      74             : 
      75           0 :     xx_storel_64(dst, v_res_b);
      76             : 
      77           0 :     dst += dst_stride;
      78           0 :     src0 += src0_stride;
      79           0 :     src1 += src1_stride;
      80           0 :     mask += mask_stride;
      81           0 :   } while (--h);
      82           0 : }
      83             : 
      84           0 : static void blend_a64_mask_w16n_sse4_1(
      85             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
      86             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      87             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
      88           0 :   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
      89             : 
      90             :   do {
      91             :     int c;
      92           0 :     for (c = 0; c < w; c += 16) {
      93           0 :       const __m128i v_m0l_b = xx_loadl_64(mask + c);
      94           0 :       const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
      95           0 :       const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
      96           0 :       const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
      97           0 :       const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
      98           0 :       const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
      99             : 
     100           0 :       const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
     101           0 :       const __m128i v_resh_w =
     102           0 :           blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
     103             : 
     104           0 :       const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
     105             : 
     106           0 :       xx_storeu_128(dst + c, v_res_b);
     107             :     }
     108           0 :     dst += dst_stride;
     109           0 :     src0 += src0_stride;
     110           0 :     src1 += src1_stride;
     111           0 :     mask += mask_stride;
     112           0 :   } while (--h);
     113           0 : }
     114             : 
     115             : //////////////////////////////////////////////////////////////////////////////
     116             : // Horizontal sub-sampling
     117             : //////////////////////////////////////////////////////////////////////////////
     118             : 
// Blend a 4-wide column with the mask sub-sampled 2x horizontally: each
// output weight is the rounded average of two adjacent mask bytes.
static void blend_a64_mask_sx_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  // Keeps only the even byte lanes, leaving valid 16-bit words behind.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;  // This kernel only handles w == 4.

  do {
    // Load 8 mask bytes (2 per output pixel) and average adjacent pairs
    // with rounding; the useful results land in the even byte lanes.
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    // Zero the odd lanes to obtain 16-bit weights; complement vs max alpha.
    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);

    // Narrow back to bytes (only the low 4 are stored).
    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     148             : 
// Blend an 8-wide column with the mask sub-sampled 2x horizontally: each
// output weight is the rounded average of two adjacent mask bytes.
static void blend_a64_mask_sx_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  // Keeps only the even byte lanes, leaving valid 16-bit words behind.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;  // This kernel only handles w == 8.

  do {
    // Load 16 mask bytes (2 per output pixel) and average adjacent pairs
    // with rounding; the useful results land in the even byte lanes.
    const __m128i v_r_b = xx_loadu_128(mask);
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    // Zero the odd lanes to obtain 16-bit weights; complement vs max alpha.
    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);

    // Narrow back to bytes (only the low 8 are stored).
    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     178             : 
// Blend blocks whose width is a multiple of 16 with the mask sub-sampled 2x
// horizontally: 32 mask bytes are consumed per 16 output pixels.
static void blend_a64_mask_sx_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  // Keeps only the even byte lanes, leaving valid 16-bit words behind.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      // Two 16-byte mask loads cover the 32 samples for this chunk; average
      // adjacent pairs with rounding (results land in the even byte lanes).
      const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
      const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));

      // Zero the odd lanes to obtain 16-bit weights; complement vs max alpha.
      const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
      const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);

      // Blend each 8-pixel half, then repack into 16 bytes.
      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
      const __m128i v_resh_w =
          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);

      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     214             : 
     215             : //////////////////////////////////////////////////////////////////////////////
     216             : // Vertical sub-sampling
     217             : //////////////////////////////////////////////////////////////////////////////
     218             : 
     219           0 : static void blend_a64_mask_sy_w4_sse4_1(
     220             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     221             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     222             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     223           0 :   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     224             : 
     225             :   (void)w;
     226             : 
     227             :   do {
     228           0 :     const __m128i v_ra_b = xx_loadl_32(mask);
     229           0 :     const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
     230           0 :     const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     231             : 
     232           0 :     const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
     233           0 :     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     234             : 
     235           0 :     const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
     236             : 
     237           0 :     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
     238             : 
     239           0 :     xx_storel_32(dst, v_res_b);
     240             : 
     241           0 :     dst += dst_stride;
     242           0 :     src0 += src0_stride;
     243           0 :     src1 += src1_stride;
     244           0 :     mask += 2 * mask_stride;
     245           0 :   } while (--h);
     246           0 : }
     247             : 
     248           0 : static void blend_a64_mask_sy_w8_sse4_1(
     249             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     250             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     251             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     252           0 :   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     253             : 
     254             :   (void)w;
     255             : 
     256             :   do {
     257           0 :     const __m128i v_ra_b = xx_loadl_64(mask);
     258           0 :     const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     259           0 :     const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     260             : 
     261           0 :     const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
     262           0 :     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     263             : 
     264           0 :     const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
     265             : 
     266           0 :     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
     267             : 
     268           0 :     xx_storel_64(dst, v_res_b);
     269             : 
     270           0 :     dst += dst_stride;
     271           0 :     src0 += src0_stride;
     272           0 :     src1 += src1_stride;
     273           0 :     mask += 2 * mask_stride;
     274           0 :   } while (--h);
     275           0 : }
     276             : 
// Blend blocks whose width is a multiple of 16 with the mask sub-sampled 2x
// vertically: two mask rows are averaged per output row.
static void blend_a64_mask_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_zero = _mm_setzero_si128();
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      // Rounded byte-wise average of the two mask rows for this 16-pixel
      // chunk.
      const __m128i v_ra_b = xx_loadu_128(mask + c);
      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      // Widen low half via cvtepu8, high half via unpack with zero, then
      // form the complementary weights.
      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);

      // Blend each 8-pixel half, then repack into 16 bytes.
      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
      const __m128i v_resh_w =
          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);

      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;  // Two mask rows consumed per output row.
  } while (--h);
}
     310             : 
     311             : //////////////////////////////////////////////////////////////////////////////
     312             : // Horizontal and Vertical sub-sampling
     313             : //////////////////////////////////////////////////////////////////////////////
     314             : 
// Blend a 4-wide column with the mask sub-sampled 2x both horizontally and
// vertically: each output weight is the rounded average of a 2x2 mask block.
static void blend_a64_mask_sx_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  // Keeps only the even byte lanes, leaving valid 16-bit words behind.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;  // This kernel only handles w == 4.

  do {
    // Byte-wise sum of the two mask rows. NOTE(review): this relies on mask
    // values not exceeding AOM_BLEND_A64_MAX_ALPHA so the byte sums cannot
    // wrap (64 + 64 = 128 fits in a uint8).
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    // Split even/odd column sums into 16-bit lanes and add, giving the full
    // 2x2 sum per output pixel.
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    // Rounded shift right by 2: average of the four mask samples.
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;  // Two mask rows consumed per output row.
  } while (--h);
}
     349             : 
// Blend an 8-wide column with the mask sub-sampled 2x both horizontally and
// vertically: each output weight is the rounded average of a 2x2 mask block.
static void blend_a64_mask_sx_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  // Keeps only the even byte lanes, leaving valid 16-bit words behind.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;  // This kernel only handles w == 8.

  do {
    // Byte-wise sum of the two mask rows. NOTE(review): this relies on mask
    // values not exceeding AOM_BLEND_A64_MAX_ALPHA so the byte sums cannot
    // wrap (64 + 64 = 128 fits in a uint8).
    const __m128i v_ra_b = xx_loadu_128(mask);
    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    // Split even/odd column sums into 16-bit lanes and add, giving the full
    // 2x2 sum per output pixel.
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    // Rounded shift right by 2: average of the four mask samples.
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;  // Two mask rows consumed per output row.
  } while (--h);
}
     384             : 
// Blend blocks whose width is a multiple of 16 with the mask sub-sampled 2x
// in both directions: 2 rows x 32 columns of mask feed each 16 output pixels.
static void blend_a64_mask_sx_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  // Keeps only the even byte lanes, leaving valid 16-bit words behind.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      // Load 32 mask bytes from each of the two rows for this chunk.
      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
      // Byte-wise row sums. NOTE(review): relies on mask values not exceeding
      // AOM_BLEND_A64_MAX_ALPHA so the byte sums cannot wrap.
      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
      // Split even/odd column sums into 16-bit lanes and add, giving the
      // full 2x2 sum per output pixel.
      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
      const __m128i v_rvsbl_w =
          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
      const __m128i v_rvsbh_w =
          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);

      // Rounded shift right by 2: average of the four mask samples.
      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);

      // Blend each 8-pixel half, then repack into 16 bytes.
      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
      const __m128i v_resh_w =
          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);

      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;  // Two mask rows consumed per output row.
  } while (--h);
}
     430             : 
     431             : //////////////////////////////////////////////////////////////////////////////
     432             : // Dispatch
     433             : //////////////////////////////////////////////////////////////////////////////
     434             : 
// Entry point: dispatch to the widest SSE4.1 kernel matching the block width
// and mask sub-sampling mode, or fall back to the C path for w <= 2 / h <= 2.
void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                               const uint8_t *src0, uint32_t src0_stride,
                               const uint8_t *src1, uint32_t src1_stride,
                               const uint8_t *mask, uint32_t mask_stride, int h,
                               int w, int suby, int subx) {
  typedef void (*blend_fn)(
      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int h, int w);

  // Dimensions are: width_index X subx X suby
  static const blend_fn blend[3][2][2] = {
    { // w % 16 == 0
      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
    { // w == 4
      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
    { // w == 8
      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  };

  // In-place operation is only supported when the strides match.
  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                         mask, mask_stride, h, w, suby, subx);
  } else {
    // (w >> 2) & 3 maps w == 4 -> 1, w == 8 -> 2 and any multiple of
    // 16 -> 0, matching the table rows above (w is a power of two >= 4 here).
    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
                                              src0_stride, src1, src1_stride,
                                              mask, mask_stride, h, w);
  }
}
     475             : 
     476             : #if CONFIG_HIGHBITDEPTH
     477             : //////////////////////////////////////////////////////////////////////////////
     478             : // No sub-sampling
     479             : //////////////////////////////////////////////////////////////////////////////
     480             : 
// Shared 4-wide high-bitdepth kernel. The per-row blend primitive is passed
// in as 'blend' (10-bit or 12-bit variant) so the mask handling is written
// only once.
static INLINE void blend_a64_mask_bn_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    // Widen four mask bytes to 16 bits and form the complementary weight.
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    // Four 16-bit pixels == 64 bits.
    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     502             : 
// 10-bit depth: bind the shared 4-wide high-bitdepth kernel to blend_4_b10.
static void blend_a64_mask_b10_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;  // Width is fixed at 4.
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b10);
}
     511             : 
// 12-bit depth: bind the shared 4-wide high-bitdepth kernel to blend_4_b12.
static void blend_a64_mask_b12_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;  // Width is fixed at 4.
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b12);
}
     520             : 
// Shared high-bitdepth kernel for widths that are a multiple of 8. The
// per-chunk blend primitive is passed in as 'blend' (10-bit or 12-bit
// variant) so the mask handling is written only once.
static INLINE void blend_a64_mask_bn_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      // Widen eight mask bytes to 16 bits and form the complementary weight.
      const __m128i v_m0_b = xx_loadl_64(mask + c);
      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      // Eight 16-bit pixels == 128 bits.
      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     545             : 
     546           0 : static void blend_a64_mask_b10_w8n_sse4_1(
     547             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     548             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     549             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     550           0 :   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     551             :                                src1_stride, mask, mask_stride, h, w,
     552             :                                blend_8_b10);
     553           0 : }
     554             : 
     555           0 : static void blend_a64_mask_b12_w8n_sse4_1(
     556             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     557             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     558             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     559           0 :   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     560             :                                src1_stride, mask, mask_stride, h, w,
     561             :                                blend_8_b12);
     562           0 : }
     563             : 
     564             : //////////////////////////////////////////////////////////////////////////////
     565             : // Horizontal sub-sampling
     566             : //////////////////////////////////////////////////////////////////////////////
     567             : 
// Width-4 kernel with horizontal (2:1) mask sub-sampling: the mask row is
// twice as wide as the output, so each pixel weight is the rounded average of
// two adjacent mask bytes.
static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  // Byte mask keeping only the low byte of every 16-bit lane.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    // 8 mask bytes = two per output pixel.
    const __m128i v_r_b = xx_loadl_64(mask);
    // Rounding average of each byte with its right-hand neighbour.
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    // Keep the even-position averages, leaving them zero-extended in 16-bit
    // lanes; the complement gives the src1 weight.
    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     593             : 
     594           0 : static void blend_a64_mask_b10_sx_w4_sse4_1(
     595             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     596             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     597             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     598             :   (void)w;
     599           0 :   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     600             :                                  src1_stride, mask, mask_stride, h,
     601             :                                  blend_4_b10);
     602           0 : }
     603             : 
     604           0 : static void blend_a64_mask_b12_sx_w4_sse4_1(
     605             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     606             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     607             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     608             :   (void)w;
     609           0 :   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     610             :                                  src1_stride, mask, mask_stride, h,
     611             :                                  blend_4_b12);
     612           0 : }
     613             : 
// Multiple-of-8 width kernel with horizontal (2:1) mask sub-sampling.
// Reads 16 mask bytes per 8 output pixels and averages adjacent pairs.
static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  // Byte mask keeping only the low byte of every 16-bit lane.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      // The mask row is 2x the output width, hence the `2 * c` offset.
      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
      // Rounding average of each byte with its right-hand neighbour.
      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

      // Keep the even-position averages as zero-extended 16-bit weights.
      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
     642             : 
     643           0 : static void blend_a64_mask_b10_sx_w8n_sse4_1(
     644             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     645             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     646             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     647           0 :   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     648             :                                   src1_stride, mask, mask_stride, h, w,
     649             :                                   blend_8_b10);
     650           0 : }
     651             : 
     652           0 : static void blend_a64_mask_b12_sx_w8n_sse4_1(
     653             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     654             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     655             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     656           0 :   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     657             :                                   src1_stride, mask, mask_stride, h, w,
     658             :                                   blend_8_b12);
     659           0 : }
     660             : 
     661             : //////////////////////////////////////////////////////////////////////////////
     662             : // Vertical sub-sampling
     663             : //////////////////////////////////////////////////////////////////////////////
     664             : 
// Width-4 kernel with vertical (2:1) mask sub-sampling: each output row's
// weights are the rounded average of two consecutive mask rows.
static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    // Load 4 mask bytes from each of the two contributing rows.
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    // Rounding average of the two rows.
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    // Zero-extend weights to 16-bit lanes; complement for src1.
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    // Mask has twice the vertical resolution of the output.
    mask += 2 * mask_stride;
  } while (--h);
}
     689             : 
     690           0 : static void blend_a64_mask_b10_sy_w4_sse4_1(
     691             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     692             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     693             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     694             :   (void)w;
     695           0 :   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     696             :                                  src1_stride, mask, mask_stride, h,
     697             :                                  blend_4_b10);
     698           0 : }
     699             : 
     700           0 : static void blend_a64_mask_b12_sy_w4_sse4_1(
     701             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     702             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     703             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     704             :   (void)w;
     705           0 :   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     706             :                                  src1_stride, mask, mask_stride, h,
     707             :                                  blend_4_b12);
     708           0 : }
     709             : 
// Multiple-of-8 width kernel with vertical (2:1) mask sub-sampling.
// Averages two consecutive mask rows per output row.
static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      // Load 8 mask bytes from each of the two contributing rows.
      const __m128i v_ra_b = xx_loadl_64(mask + c);
      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
      // Rounding average of the two rows.
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      // Zero-extend weights to 16-bit lanes; complement for src1.
      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    // Mask has twice the vertical resolution of the output.
    mask += 2 * mask_stride;
  } while (--h);
}
     737             : 
     738           0 : static void blend_a64_mask_b10_sy_w8n_sse4_1(
     739             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     740             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     741             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     742           0 :   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     743             :                                   src1_stride, mask, mask_stride, h, w,
     744             :                                   blend_8_b10);
     745           0 : }
     746             : 
     747           0 : static void blend_a64_mask_b12_sy_w8n_sse4_1(
     748             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     749             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     750             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     751           0 :   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     752             :                                   src1_stride, mask, mask_stride, h, w,
     753             :                                   blend_8_b12);
     754           0 : }
     755             : 
     756             : //////////////////////////////////////////////////////////////////////////////
     757             : // Horizontal and Vertical sub-sampling
     758             : //////////////////////////////////////////////////////////////////////////////
     759             : 
// Width-4 kernel with both horizontal and vertical (2:1) mask sub-sampling:
// each output weight is the rounded average of a 2x2 block of mask bytes.
static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  // Byte mask keeping only the low byte of every 16-bit lane.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    // Byte-wise vertical sum. NOTE(review): relies on mask values being at
    // most AOM_BLEND_A64_MAX_ALPHA so the 8-bit add cannot wrap — confirm
    // against the mask producer.
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    // Split the vertical sums into even/odd byte positions as 16-bit lanes,
    // then add to get the full 2x2 sum per output pixel.
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    // Rounded divide by 4 (average of the four samples).
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    // Mask has twice the vertical resolution of the output.
    mask += 2 * mask_stride;
  } while (--h);
}
     790             : 
     791           0 : static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     792             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     793             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     794             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     795             :   (void)w;
     796           0 :   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     797             :                                     src1_stride, mask, mask_stride, h,
     798             :                                     blend_4_b10);
     799           0 : }
     800             : 
     801           0 : static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     802             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     803             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     804             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     805             :   (void)w;
     806           0 :   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     807             :                                     src1_stride, mask, mask_stride, h,
     808             :                                     blend_4_b12);
     809           0 : }
     810             : 
// Multiple-of-8 width kernel with mask sub-sampling in both directions.
// Each output weight is the rounded average of a 2x2 block of mask bytes.
static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  // Byte mask keeping only the low byte of every 16-bit lane.
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      // The mask is 2x the output resolution in both axes: 16 bytes from
      // each of two rows per 8 output pixels.
      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
      // Byte-wise vertical sum. NOTE(review): relies on mask values being at
      // most AOM_BLEND_A64_MAX_ALPHA so the 8-bit add cannot wrap — confirm
      // against the mask producer.
      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
      // Split vertical sums into even/odd byte positions as 16-bit lanes,
      // then add to get the full 2x2 sum per output pixel.
      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
      const __m128i v_rvsb_w =
          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

      // Rounded divide by 4 (average of the four samples).
      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    // Mask has twice the vertical resolution of the output.
    mask += 2 * mask_stride;
  } while (--h);
}
     844             : 
     845           0 : static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     846             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     847             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     848             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     849           0 :   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     850             :                                      src1_stride, mask, mask_stride, h, w,
     851             :                                      blend_8_b10);
     852           0 : }
     853             : 
     854           0 : static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     855             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     856             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     857             :     const uint8_t *mask, uint32_t mask_stride, int h, int w) {
     858           0 :   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     859             :                                      src1_stride, mask, mask_stride, h, w,
     860             :                                      blend_8_b12);
     861           0 : }
     862             : 
     863             : //////////////////////////////////////////////////////////////////////////////
     864             : // Dispatch
     865             : //////////////////////////////////////////////////////////////////////////////
     866             : 
// Public dispatch for high-bitdepth alpha blending with an A64 mask.
// Selects a specialized SSE4.1 kernel by bit depth, width class, and mask
// sub-sampling, falling back to the C implementation for tiny blocks.
// Pixel pointers are passed as uint8_t* and converted to uint16_t* via
// CONVERT_TO_SHORTPTR.
void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                      const uint8_t *src0_8,
                                      uint32_t src0_stride,
                                      const uint8_t *src1_8,
                                      uint32_t src1_stride, const uint8_t *mask,
                                      uint32_t mask_stride, int h, int w,
                                      int suby, int subx, int bd) {
  typedef void (*blend_fn)(
      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int h, int w);

  // Dimensions are: bd_index X width_index X subx X suby
  static const blend_fn blend[2][2][2][2] = {
    {   // bd == 8 or 10
      { // w % 8 == 0
        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
        { blend_a64_mask_b10_sx_w8n_sse4_1,
          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
        { blend_a64_mask_b10_sx_w4_sse4_1,
          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
    {   // bd == 12
      { // w % 8 == 0
        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
        { blend_a64_mask_b12_sx_w8n_sse4_1,
          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
        { blend_a64_mask_b12_sx_w4_sse4_1,
          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  };

  // In-place blending is only supported when strides match.
  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);
  // Since h and w are powers of two, (h | w) & 3 is nonzero exactly when
  // either dimension is below 4 — too small for the SIMD kernels.
  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                                src1_stride, mask, mask_stride, h, w, suby,
                                subx, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    // (w >> 2) & 1 is 1 only when w == 4 (w is a power of two >= 4),
    // selecting the width-4 kernels; otherwise w is a multiple of 8.
    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
        mask_stride, h, w);
  }
}
     924             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13