LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - obmc_sad_sse4.c (source / functions)
Test:     output.info
Date:     2017-07-14 16:53:18
Coverage: Lines: 0 / 118 (0.0 %)    Functions: 0 / 30 (0.0 %)
Legend:   Lines: hit | not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <immintrin.h>
      14             : 
      15             : #include "./aom_config.h"
      16             : #include "aom_ports/mem.h"
      17             : #include "aom/aom_integer.h"
      18             : 
      19             : #include "aom_dsp/aom_dsp_common.h"
      20             : #include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
      21             : #include "aom_dsp/x86/synonyms.h"
      22             : 
      23             : ////////////////////////////////////////////////////////////////////////////////
      24             : // 8 bit
      25             : ////////////////////////////////////////////////////////////////////////////////
      26             : 
      27           0 : static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
      28             :                                        const int32_t *wsrc, const int32_t *mask,
      29             :                                        const int height) {
      30           0 :   const int pre_step = pre_stride - 4;
      31           0 :   int n = 0;
      32           0 :   __m128i v_sad_d = _mm_setzero_si128();
      33             : 
      34             :   do {
      35           0 :     const __m128i v_p_b = xx_loadl_32(pre + n);
      36           0 :     const __m128i v_m_d = xx_load_128(mask + n);
      37           0 :     const __m128i v_w_d = xx_load_128(wsrc + n);
      38             : 
      39           0 :     const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
      40             : 
      41             :     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
      42             :     // boundaries. We use pmaddwd, as it has lower latency on Haswell
      43             :     // than pmulld but produces the same result with these inputs.
      44           0 :     const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
      45             : 
      46           0 :     const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
      47           0 :     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
      48             : 
      49             :     // Rounded absolute difference
      50           0 :     const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
      51             : 
      52           0 :     v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
      53             : 
      54           0 :     n += 4;
      55             : 
      56           0 :     if (n % 4 == 0) pre += pre_step;
      57           0 :   } while (n < 4 * height);
      58             : 
      59           0 :   return xx_hsum_epi32_si32(v_sad_d);
      60             : }
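// Editor's illustrative sketch (not part of the original file): why the
// pmaddwd substitution above is valid. When every 32-bit lane holds a
// non-negative value that fits in 15 bits, the upper 16-bit half of each lane
// is zero, so pmaddwd's per-lane sum lo*lo + hi*hi collapses to the same
// product pmulld (_mm_mullo_epi32) would give. The helper name below is
// hypothetical and exists only for demonstration.
static INLINE void example_madd_matches_mull(void) {
  const __m128i a = _mm_set_epi32(3, 1234, 20000, 32767);  // all < 2^15
  const __m128i b = _mm_set_epi32(7, 4096, 1, 32767);      // all < 2^15
  const __m128i via_madd = _mm_madd_epi16(a, b);   // per lane: lo*lo + 0*0
  const __m128i via_mull = _mm_mullo_epi32(a, b);  // SSE4.1 pmulld
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(via_madd, via_mull)) == 0xffff);
  (void)via_madd;
  (void)via_mull;
}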
      61             : 
      62           0 : static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
      63             :                                         const int pre_stride,
      64             :                                         const int32_t *wsrc,
      65             :                                         const int32_t *mask, const int width,
      66             :                                         const int height) {
      67           0 :   const int pre_step = pre_stride - width;
      68           0 :   int n = 0;
      69           0 :   __m128i v_sad_d = _mm_setzero_si128();
      70             : 
      71           0 :   assert(width >= 8);
      72           0 :   assert(IS_POWER_OF_TWO(width));
      73             : 
      74             :   do {
      75           0 :     const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
      76           0 :     const __m128i v_m1_d = xx_load_128(mask + n + 4);
      77           0 :     const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
      78           0 :     const __m128i v_p0_b = xx_loadl_32(pre + n);
      79           0 :     const __m128i v_m0_d = xx_load_128(mask + n);
      80           0 :     const __m128i v_w0_d = xx_load_128(wsrc + n);
      81             : 
      82           0 :     const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
      83           0 :     const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
      84             : 
      85             :     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
      86             :     // boundaries. We use pmaddwd, as it has lower latency on Haswell
      87             :     // than pmulld but produces the same result with these inputs.
      88           0 :     const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
      89           0 :     const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
      90             : 
      91           0 :     const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
      92           0 :     const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
      93           0 :     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
      94           0 :     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
      95             : 
      96             :     // Rounded absolute difference
      97           0 :     const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
      98           0 :     const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
      99             : 
     100           0 :     v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
     101           0 :     v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
     102             : 
     103           0 :     n += 8;
     104             : 
     105           0 :     if (n % width == 0) pre += pre_step;
     106           0 :   } while (n < width * height);
     107             : 
     108           0 :   return xx_hsum_epi32_si32(v_sad_d);
     109             : }
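// Editor's reference sketch (assumed, mirroring the scalar fallback these
// kernels vectorize): wsrc and mask are width*height int32_t arrays with row
// stride equal to width, wsrc already holds the mask-weighted source, and the
// final shift by 12 undoes the fixed-point mask scaling with round-to-nearest.
// The helper name is hypothetical; ROUND_POWER_OF_TWO is the rounding macro
// from aom_dsp_common.h, included above.
static INLINE unsigned int example_obmc_sad_ref(const uint8_t *pre,
                                                int pre_stride,
                                                const int32_t *wsrc,
                                                const int32_t *mask,
                                                int width, int height) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int32_t diff = wsrc[i * width + j] - pre[j] * mask[i * width + j];
      sad += ROUND_POWER_OF_TWO(diff >= 0 ? diff : -diff, 12);
    }
    pre += pre_stride;
  }
  return sad;
}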
     110             : 
     111             : #define OBMCSADWXH(w, h)                                       \
     112             :   unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
     113             :       const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
     114             :       const int32_t *msk) {                                    \
     115             :     if (w == 4) {                                              \
     116             :       return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
     117             :     } else {                                                   \
     118             :       return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
     119             :     }                                                          \
     120             :   }
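// For example, OBMCSADWXH(16, 16) below expands to a wrapper named
// aom_obmc_sad16x16_sse4_1() that calls obmc_sad_w8n() with width 16 and
// height 16; the (w == 4) test is a compile-time constant, so each wrapper
// reduces to a single direct call.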
     121             : 
     122             : #if CONFIG_EXT_PARTITION
     123             : OBMCSADWXH(128, 128)
     124             : OBMCSADWXH(128, 64)
     125             : OBMCSADWXH(64, 128)
     126             : #endif  // CONFIG_EXT_PARTITION
     127           0 : OBMCSADWXH(64, 64)
     128           0 : OBMCSADWXH(64, 32)
     129           0 : OBMCSADWXH(32, 64)
     130           0 : OBMCSADWXH(32, 32)
     131           0 : OBMCSADWXH(32, 16)
     132           0 : OBMCSADWXH(16, 32)
     133           0 : OBMCSADWXH(16, 16)
     134           0 : OBMCSADWXH(16, 8)
     135           0 : OBMCSADWXH(8, 16)
     136           0 : OBMCSADWXH(8, 8)
     137           0 : OBMCSADWXH(8, 4)
     138           0 : OBMCSADWXH(4, 8)
     139           0 : OBMCSADWXH(4, 4)
     140             : 
     141             : ////////////////////////////////////////////////////////////////////////////////
     142             : // High bit-depth
     143             : ////////////////////////////////////////////////////////////////////////////////
     144             : 
     145             : #if CONFIG_HIGHBITDEPTH
     146           0 : static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
     147             :                                            const int pre_stride,
     148             :                                            const int32_t *wsrc,
     149             :                                            const int32_t *mask,
     150             :                                            const int height) {
     151           0 :   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
     152           0 :   const int pre_step = pre_stride - 4;
     153           0 :   int n = 0;
     154           0 :   __m128i v_sad_d = _mm_setzero_si128();
     155             : 
     156             :   do {
     157           0 :     const __m128i v_p_w = xx_loadl_64(pre + n);
     158           0 :     const __m128i v_m_d = xx_load_128(mask + n);
     159           0 :     const __m128i v_w_d = xx_load_128(wsrc + n);
     160             : 
     161           0 :     const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
     162             : 
     163             :     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     164             :     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     165             :     // than pmulld but produces the same result with these inputs.
     166           0 :     const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
     167             : 
     168           0 :     const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
     169           0 :     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
     170             : 
     171             :     // Rounded absolute difference
     172           0 :     const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
     173             : 
     174           0 :     v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
     175             : 
     176           0 :     n += 4;
     177             : 
     178           0 :     if (n % 4 == 0) pre += pre_step;
     179           0 :   } while (n < 4 * height);
     180             : 
     181           0 :   return xx_hsum_epi32_si32(v_sad_d);
     182             : }
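// Editor's note, based on the code above: in the high bit-depth path the
// caller still hands in a uint8_t * handle, and CONVERT_TO_SHORTPTR()
// recovers the underlying uint16_t pixel pointer. Each group of four 16-bit
// pixels is then fetched with xx_loadl_64() (4 x 16 bits = 64 bits) and
// widened with _mm_cvtepu16_epi32(), mirroring the 8-bit path's
// xx_loadl_32() + _mm_cvtepu8_epi32() pair.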
     183             : 
     184           0 : static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
     185             :                                             const int pre_stride,
     186             :                                             const int32_t *wsrc,
     187             :                                             const int32_t *mask,
     188             :                                             const int width, const int height) {
     189           0 :   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
     190           0 :   const int pre_step = pre_stride - width;
     191           0 :   int n = 0;
     192           0 :   __m128i v_sad_d = _mm_setzero_si128();
     193             : 
     194           0 :   assert(width >= 8);
     195           0 :   assert(IS_POWER_OF_TWO(width));
     196             : 
     197             :   do {
     198           0 :     const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
     199           0 :     const __m128i v_m1_d = xx_load_128(mask + n + 4);
     200           0 :     const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
     201           0 :     const __m128i v_p0_w = xx_loadl_64(pre + n);
     202           0 :     const __m128i v_m0_d = xx_load_128(mask + n);
     203           0 :     const __m128i v_w0_d = xx_load_128(wsrc + n);
     204             : 
     205           0 :     const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
     206           0 :     const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
     207             : 
     208             :     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     209             :     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     210             :     // than pmulld but produces the same result with these inputs.
     211           0 :     const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
     212           0 :     const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
     213             : 
     214           0 :     const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
     215           0 :     const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     216           0 :     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
     217           0 :     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
     218             : 
     219             :     // Rounded absolute difference
     220           0 :     const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
     221           0 :     const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
     222             : 
     223           0 :     v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
     224           0 :     v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
     225             : 
     226           0 :     n += 8;
     227             : 
     228           0 :     if (n % width == 0) pre += pre_step;
     229           0 :   } while (n < width * height);
     230             : 
     231           0 :   return xx_hsum_epi32_si32(v_sad_d);
     232             : }
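// Editor's note: the *_w8n variants consume 8 pixels per iteration as two
// groups of 4, since wsrc and mask are 32-bit values and an __m128i register
// holds only four of them; the width assert()s (>= 8 and a power of two)
// ensure n crosses row boundaries exactly, so the pre_step row adjustment in
// "if (n % width == 0)" fires once per row.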
     233             : 
     234             : #define HBD_OBMCSADWXH(w, h)                                      \
     235             :   unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
     236             :       const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
     237             :       const int32_t *mask) {                                      \
     238             :     if (w == 4) {                                                 \
     239             :       return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
     240             :     } else {                                                      \
     241             :       return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
     242             :     }                                                             \
     243             :   }
     244             : 
     245             : #if CONFIG_EXT_PARTITION
     246             : HBD_OBMCSADWXH(128, 128)
     247             : HBD_OBMCSADWXH(128, 64)
     248             : HBD_OBMCSADWXH(64, 128)
     249             : #endif  // CONFIG_EXT_PARTITION
     250           0 : HBD_OBMCSADWXH(64, 64)
     251           0 : HBD_OBMCSADWXH(64, 32)
     252           0 : HBD_OBMCSADWXH(32, 64)
     253           0 : HBD_OBMCSADWXH(32, 32)
     254           0 : HBD_OBMCSADWXH(32, 16)
     255           0 : HBD_OBMCSADWXH(16, 32)
     256           0 : HBD_OBMCSADWXH(16, 16)
     257           0 : HBD_OBMCSADWXH(16, 8)
     258           0 : HBD_OBMCSADWXH(8, 16)
     259           0 : HBD_OBMCSADWXH(8, 8)
     260           0 : HBD_OBMCSADWXH(8, 4)
     261           0 : HBD_OBMCSADWXH(4, 8)
     262           0 : HBD_OBMCSADWXH(4, 4)
     263             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13