/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "./aom_config.h"
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    unsigned int *const sse, int *const sum,
                                    const int h) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
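    // Each 32-bit lane holds a 15-bit value in its low word and zero in its
    // high word, so pmaddwd computes lo(p) * lo(m) + 0 * 0 per lane: exactly
    // the product pmulld would return.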
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

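    // n advances by 4 per iteration, so this row step fires on every pass;
    // the modulo form simply mirrors the test in obmc_variance_w8n.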
    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
                                     const int w, const int h) {
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
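    // After the 12-bit rounding the differences fit in 16 bits, so the two
    // dword vectors can be packed to words and squared pairwise with a
    // single pmaddwd.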
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

#define OBMCVARWXH(W, H)                                               \
  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask, unsigned int *sse) {                        \
    int sum;                                                           \
    if (W == 4) {                                                      \
      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
    } else {                                                           \
      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
    }                                                                  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
  }

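// For reference, a scalar sketch of the arithmetic the kernels above
// vectorize (illustrative only; see the C fallback in aom_dsp for the
// canonical version, ROUND_POWER_OF_TWO_SIGNED is the aom_dsp_common.h
// helper):
//
//   for (int i = 0; i < h; i++) {
//     for (int j = 0; j < w; j++) {
//       const int diff = ROUND_POWER_OF_TWO_SIGNED(
//           wsrc[i * w + j] - pre[i * pre_stride + j] * mask[i * w + j], 12);
//       sum += diff;
//       sse += diff * diff;
//     }
//   }
//
// The wrappers then return the variance as sse - sum * sum / (w * h).
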
#if CONFIG_EXT_PARTITION
OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
#endif  // CONFIG_EXT_PARTITION
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////

#if CONFIG_HIGHBITDEPTH
static INLINE void hbd_obmc_variance_w4(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void hbd_obmc_variance_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

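  // Note the accumulation (+=) here: unlike hbd_obmc_variance_w4, which
  // overwrites its outputs, this kernel may be called repeatedly on strips
  // of one block, carrying running totals in the caller's 64-bit sum/sse.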
  *sum += xx_hsum_epi32_si64(v_sum_d);
  *sse += xx_hsum_epi32_si64(v_sse_d);
}

static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

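// The 10- and 12-bit wrappers below normalize their totals back to an 8-bit
// scale: sums are rounded by (bd - 8) bits and sums of squares by
// 2 * (bd - 8) bits, i.e. 2 and 4 for 10-bit, 4 and 8 for 12-bit.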
static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

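// At 12 bits the 32-bit lane accumulators in hbd_obmc_variance_w8n have the
// least headroom, so the largest blocks are processed in 32- or 64-row
// strips, flushing into the 64-bit sse64/sum64 totals between strips.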
static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 128) {
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
                            32);
      pre8 += 32 * pre_stride;
      wsrc += 32 * 128;
      mask += 32 * 128;
      h -= 32;
    } while (h > 0);
  } else if (w == 64 && h >= 128) {
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
                            64);
      pre8 += 64 * pre_stride;
      wsrc += 64 * 64;
      mask += 64 * 64;
      h -= 64;
    } while (h > 0);
  } else if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

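// Because sum64 and sse64 are rounded independently above, *sse can land
// slightly below sum^2 / (W * H); the 10- and 12-bit wrappers therefore
// clamp a negative variance to zero.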
#define HBD_OBMCVARWXH(W, H)                                               \
  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                 \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

#if CONFIG_EXT_PARTITION
HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
#endif  // CONFIG_EXT_PARTITION
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
#endif  // CONFIG_HIGHBITDEPTH
