LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - variance_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 322 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 76 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <emmintrin.h>  // SSE2
      14             : 
      15             : #include "./aom_config.h"
      16             : #include "./aom_dsp_rtcd.h"
      17             : 
      18             : #include "aom_ports/mem.h"
      19             : 
      20             : typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
      21             :                                const unsigned char *ref, int ref_stride,
      22             :                                unsigned int *sse, int *sum);
      23             : 
      24           0 : unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
      25           0 :   __m128i vsum = _mm_setzero_si128();
      26             :   int i;
      27             : 
      28           0 :   for (i = 0; i < 32; ++i) {
      29           0 :     const __m128i v = _mm_loadu_si128((const __m128i *)src);
      30           0 :     vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
      31           0 :     src += 8;
      32             :   }
      33             : 
      34           0 :   vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
      35           0 :   vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
      36           0 :   return _mm_cvtsi128_si32(vsum);
      37             : }
      38             : 
// Loads two 4-byte rows (rows i and i+1 at the given stride) and interleaves
// them byte-wise into the low 8 bytes of an XMM register.
#define READ64(p, stride, i)                                  \
  _mm_unpacklo_epi8(                                          \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

// Computes the sum of squared differences (*sse) and the signed sum of
// differences (*sum) between a 4x4 source block and a 4x4 reference block.
static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  // Two rows per register, bytes widened to 16 bits.
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  // Per-pixel differences: diff0 covers rows 0-1, diff1 covers rows 2-3.
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum: fold the 8 int16 lanes down to lane 0. 16 diffs in [-255, 255]
  // give |sum| <= 4080, which fits in int16 with no overflow.
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse: madd squares each diff and pairwise-adds into 32-bit lanes; the
  // four partial sums are then folded into lane 0.
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}
      69             : 
// Computes *sse (sum of squared differences) and *sum (signed sum of
// differences) over an 8x8 block, processing two rows per loop iteration.
void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();  // 8 x int16 running sums of diffs
  __m128i vsse = _mm_setzero_si128();  // 4 x int32 running sums of diff^2
  int i;

  for (i = 0; i < 8; i += 2) {
    // Row i: load 8 bytes, widen to 16 bits, subtract reference.
    const __m128i src0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    // Row i + 1, same treatment.
    const __m128i src1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    // madd squares each 16-bit diff and pairwise-adds into 32-bit lanes.
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum: fold the 8 int16 lanes into lane 0. 64 diffs in [-255, 255] give
  // |sum| <= 16320, still within int16 range.
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse: fold the 4 int32 lanes into lane 0.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
     107             : 
// Computes *sse (sum of squared differences) and *sum (signed sum of
// differences) over a 16x16 block, one full 16-pixel row per iteration.
void aom_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();  // 8 x int16 running sums of diffs
  __m128i vsse = _mm_setzero_si128();  // 4 x int32 running sums of diff^2
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    // Low 8 pixels widened to 16 bits.
    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    // High 8 pixels widened to 16 bits.
    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum: two 16-bit folds leave lane 0 = lanes 0+2+4+6 and lane 1 =
  // lanes 1+3+5+7 (each at most 4 * 32 * 255 = 32640, int16-safe).
  // A third 16-bit fold could overflow (|total| can reach 65280), so the
  // two halves are extracted separately and added in int arithmetic.
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum =
      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);

  // sse: fold the 4 int32 lanes into lane 0.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
     148             : 
     149           0 : static void variance_sse2(const unsigned char *src, int src_stride,
     150             :                           const unsigned char *ref, int ref_stride, int w,
     151             :                           int h, unsigned int *sse, int *sum,
     152             :                           getNxMvar_fn_t var_fn, int block_size) {
     153             :   int i, j;
     154             : 
     155           0 :   *sse = 0;
     156           0 :   *sum = 0;
     157             : 
     158           0 :   for (i = 0; i < h; i += block_size) {
     159           0 :     for (j = 0; j < w; j += block_size) {
     160             :       unsigned int sse0;
     161             :       int sum0;
     162           0 :       var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
     163             :              ref_stride, &sse0, &sum0);
     164           0 :       *sse += sse0;
     165           0 :       *sum += sum0;
     166             :     }
     167             :   }
     168           0 : }
     169             : 
     170           0 : unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
     171             :                                   const uint8_t *ref, int ref_stride,
     172             :                                   unsigned int *sse) {
     173             :   int sum;
     174           0 :   get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
     175           0 :   assert(sum <= 255 * 4 * 4);
     176           0 :   assert(sum >= -255 * 4 * 4);
     177           0 :   return *sse - ((sum * sum) >> 4);
     178             : }
     179             : 
     180           0 : unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride,
     181             :                                   const uint8_t *ref, int ref_stride,
     182             :                                   unsigned int *sse) {
     183             :   int sum;
     184           0 :   variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
     185             :                 get4x4var_sse2, 4);
     186           0 :   assert(sum <= 255 * 8 * 4);
     187           0 :   assert(sum >= -255 * 8 * 4);
     188           0 :   return *sse - ((sum * sum) >> 5);
     189             : }
     190             : 
     191           0 : unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride,
     192             :                                   const uint8_t *ref, int ref_stride,
     193             :                                   unsigned int *sse) {
     194             :   int sum;
     195           0 :   variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
     196             :                 get4x4var_sse2, 4);
     197           0 :   assert(sum <= 255 * 8 * 4);
     198           0 :   assert(sum >= -255 * 8 * 4);
     199           0 :   return *sse - ((sum * sum) >> 5);
     200             : }
     201             : 
     202           0 : unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
     203             :                                   const unsigned char *ref, int ref_stride,
     204             :                                   unsigned int *sse) {
     205             :   int sum;
     206           0 :   aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
     207           0 :   assert(sum <= 255 * 8 * 8);
     208           0 :   assert(sum >= -255 * 8 * 8);
     209           0 :   return *sse - ((sum * sum) >> 6);
     210             : }
     211             : 
     212           0 : unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride,
     213             :                                    const unsigned char *ref, int ref_stride,
     214             :                                    unsigned int *sse) {
     215             :   int sum;
     216           0 :   variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
     217             :                 aom_get8x8var_sse2, 8);
     218           0 :   assert(sum <= 255 * 16 * 8);
     219           0 :   assert(sum >= -255 * 16 * 8);
     220           0 :   return *sse - ((sum * sum) >> 7);
     221             : }
     222             : 
     223           0 : unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride,
     224             :                                    const unsigned char *ref, int ref_stride,
     225             :                                    unsigned int *sse) {
     226             :   int sum;
     227           0 :   variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
     228             :                 aom_get8x8var_sse2, 8);
     229           0 :   assert(sum <= 255 * 16 * 8);
     230           0 :   assert(sum >= -255 * 16 * 8);
     231           0 :   return *sse - ((sum * sum) >> 7);
     232             : }
     233             : 
     234           0 : unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
     235             :                                     const unsigned char *ref, int ref_stride,
     236             :                                     unsigned int *sse) {
     237             :   int sum;
     238           0 :   aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
     239           0 :   assert(sum <= 255 * 16 * 16);
     240           0 :   assert(sum >= -255 * 16 * 16);
     241           0 :   return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
     242             : }
     243             : 
     244           0 : unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
     245             :                                     const uint8_t *ref, int ref_stride,
     246             :                                     unsigned int *sse) {
     247             :   int sum;
     248           0 :   variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
     249             :                 aom_get16x16var_sse2, 16);
     250           0 :   assert(sum <= 255 * 32 * 32);
     251           0 :   assert(sum >= -255 * 32 * 32);
     252           0 :   return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
     253             : }
     254             : 
     255           0 : unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
     256             :                                     const uint8_t *ref, int ref_stride,
     257             :                                     unsigned int *sse) {
     258             :   int sum;
     259           0 :   variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
     260             :                 aom_get16x16var_sse2, 16);
     261           0 :   assert(sum <= 255 * 32 * 16);
     262           0 :   assert(sum >= -255 * 32 * 16);
     263           0 :   return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
     264             : }
     265             : 
     266           0 : unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
     267             :                                     const uint8_t *ref, int ref_stride,
     268             :                                     unsigned int *sse) {
     269             :   int sum;
     270           0 :   variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
     271             :                 aom_get16x16var_sse2, 16);
     272           0 :   assert(sum <= 255 * 32 * 16);
     273           0 :   assert(sum >= -255 * 32 * 16);
     274           0 :   return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
     275             : }
     276             : 
     277           0 : unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
     278             :                                     const uint8_t *ref, int ref_stride,
     279             :                                     unsigned int *sse) {
     280             :   int sum;
     281           0 :   variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
     282             :                 aom_get16x16var_sse2, 16);
     283           0 :   assert(sum <= 255 * 64 * 64);
     284           0 :   assert(sum >= -255 * 64 * 64);
     285           0 :   return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
     286             : }
     287             : 
     288           0 : unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
     289             :                                     const uint8_t *ref, int ref_stride,
     290             :                                     unsigned int *sse) {
     291             :   int sum;
     292           0 :   variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
     293             :                 aom_get16x16var_sse2, 16);
     294           0 :   assert(sum <= 255 * 64 * 32);
     295           0 :   assert(sum >= -255 * 64 * 32);
     296           0 :   return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
     297             : }
     298             : 
     299           0 : unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
     300             :                                     const uint8_t *ref, int ref_stride,
     301             :                                     unsigned int *sse) {
     302             :   int sum;
     303           0 :   variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
     304             :                 aom_get16x16var_sse2, 16);
     305           0 :   assert(sum <= 255 * 64 * 32);
     306           0 :   assert(sum >= -255 * 64 * 32);
     307           0 :   return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
     308             : }
     309             : 
     310           0 : unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
     311             :                              const uint8_t *ref, int ref_stride,
     312             :                              unsigned int *sse) {
     313           0 :   aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
     314           0 :   return *sse;
     315             : }
     316             : 
     317           0 : unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
     318             :                               const uint8_t *ref, int ref_stride,
     319             :                               unsigned int *sse) {
     320           0 :   aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
     321           0 :   return *sse;
     322             : }
     323             : 
     324           0 : unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
     325             :                               const uint8_t *ref, int ref_stride,
     326             :                               unsigned int *sse) {
     327           0 :   aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
     328           0 :   return *sse;
     329             : }
     330             : 
     331           0 : unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
     332             :                                const uint8_t *ref, int ref_stride,
     333             :                                unsigned int *sse) {
     334           0 :   aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
     335           0 :   return *sse;
     336             : }
     337             : 
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in subpel_variance.asm
// DECL emits the prototype of one asm helper that processes a w-pixel-wide
// strip of 'height' rows at sub-pixel offsets (x_offset, y_offset),
// writing the SSE through *sse and returning the signed sum of differences.
// NOTE(review): the helper semantics are defined in subpel_variance.asm —
// confirm there.
#define DECL(w, opt)                                                           \
  int aom_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
// DECLS declares the 4-, 8- and 16-pixel-wide helpers for one ISA suffix
// (the opt2 argument is unused).
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
     354             : 
// FN defines aom_sub_pixel_variance{w}x{h}_{opt}. The block is processed in
// wf-pixel-wide column strips by the asm helper (at byte offsets 0, 16, 32,
// 48 — up to four strips when w > 2*wf); the per-strip sums (se) and SSEs
// are accumulated, and the variance is sse - se^2 / (w*h), i.e. shifted by
// wlog2 + hlog2. cast_prod and cast choose the integer widths of the se*se
// product so it cannot overflow for the given block size.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                        \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
    unsigned int sse;                                                          \
    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
                                                  y_offset, dst, dst_stride,   \
                                                  h, &sse, NULL, NULL);        \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = aom_sub_pixel_variance##wf##xh_##opt(                          \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

// FNS instantiates every supported block size (64x64 down to 4x4) for one
// ISA suffix; larger blocks need 64-bit casts for se*se (opt2 is unused).
#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
     407             : 
// The 2 unused parameters are place holders for PIC enabled build.
// DECL emits the prototype of one asm helper for the averaging variant: it
// additionally averages the interpolated prediction with the 'sec' buffer
// (stride sec_stride) before differencing.
// NOTE(review): helper semantics are defined in subpel_variance.asm —
// confirm there.
#define DECL(w, opt)                                                        \
  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)
// DECLS declares the 4-, 8- and 16-pixel-wide helpers for one ISA suffix
// (the opt2 argument is unused).
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
     424             : 
// FN defines aom_sub_pixel_avg_variance{w}x{h}_{opt}. Same strip-wise
// structure as the non-avg variant above, but each helper call also takes
// the secondary prediction buffer 'sec' (with stride w) and offsets it per
// column strip. Variance is sse - se^2 / (w*h) via the wlog2 + hlog2 shift;
// cast_prod/cast size the se*se product to avoid overflow.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                    \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
      const uint8_t *sec) {                                                    \
    unsigned int sse;                                                          \
    int se = aom_sub_pixel_avg_variance##wf##xh_##opt(                         \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                      \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sseptr = sse;                                                             \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

// FNS instantiates every supported block size for one ISA suffix (opt2 is
// unused — note the sse2 instantiation below passes "sse" for it).
#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
     478             : 
     479           0 : void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
     480             :                              const uint8_t *ref, int ref_stride) {
     481             :   int i, j;
     482           0 :   int stride = ref_stride << 3;
     483             : 
     484           0 :   if (width >= 16) {
     485             :     // read 16 points at one time
     486           0 :     for (i = 0; i < height; i++) {
     487           0 :       for (j = 0; j < width; j += 16) {
     488           0 :         __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
     489           0 :         __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
     490           0 :         __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
     491           0 :         __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
     492           0 :         __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
     493           0 :         __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
     494           0 :         __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
     495           0 :         __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
     496             :         __m128i t0, t1, t2, t3;
     497             : 
     498           0 :         t0 = _mm_unpacklo_epi8(s0, s1);
     499           0 :         s1 = _mm_unpackhi_epi8(s0, s1);
     500           0 :         t1 = _mm_unpacklo_epi8(s2, s3);
     501           0 :         s3 = _mm_unpackhi_epi8(s2, s3);
     502           0 :         t2 = _mm_unpacklo_epi8(s4, s5);
     503           0 :         s5 = _mm_unpackhi_epi8(s4, s5);
     504           0 :         t3 = _mm_unpacklo_epi8(s6, s7);
     505           0 :         s7 = _mm_unpackhi_epi8(s6, s7);
     506             : 
     507           0 :         s0 = _mm_unpacklo_epi8(t0, s1);
     508           0 :         s2 = _mm_unpacklo_epi8(t1, s3);
     509           0 :         s4 = _mm_unpacklo_epi8(t2, s5);
     510           0 :         s6 = _mm_unpacklo_epi8(t3, s7);
     511           0 :         s0 = _mm_unpacklo_epi32(s0, s2);
     512           0 :         s4 = _mm_unpacklo_epi32(s4, s6);
     513           0 :         s0 = _mm_unpacklo_epi64(s0, s4);
     514             : 
     515             :         _mm_storeu_si128((__m128i *)(comp_pred), s0);
     516           0 :         comp_pred += 16;
     517           0 :         ref += 16 * 8;
     518             :       }
     519           0 :       ref += stride - (width << 3);
     520             :     }
     521           0 :   } else if (width >= 8) {
     522             :     // read 8 points at one time
     523           0 :     for (i = 0; i < height; i++) {
     524           0 :       for (j = 0; j < width; j += 8) {
     525           0 :         __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
     526           0 :         __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
     527           0 :         __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
     528           0 :         __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
     529             :         __m128i t0, t1;
     530             : 
     531           0 :         t0 = _mm_unpacklo_epi8(s0, s1);
     532           0 :         s1 = _mm_unpackhi_epi8(s0, s1);
     533           0 :         t1 = _mm_unpacklo_epi8(s2, s3);
     534           0 :         s3 = _mm_unpackhi_epi8(s2, s3);
     535             : 
     536           0 :         s0 = _mm_unpacklo_epi8(t0, s1);
     537           0 :         s2 = _mm_unpacklo_epi8(t1, s3);
     538           0 :         s0 = _mm_unpacklo_epi32(s0, s2);
     539             : 
     540             :         _mm_storel_epi64((__m128i *)(comp_pred), s0);
     541           0 :         comp_pred += 8;
     542           0 :         ref += 8 * 8;
     543             :       }
     544           0 :       ref += stride - (width << 3);
     545             :     }
     546             :   } else {
     547             :     // read 4 points at one time
     548           0 :     for (i = 0; i < height; i++) {
     549           0 :       for (j = 0; j < width; j += 4) {
     550           0 :         __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
     551           0 :         __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
     552             :         __m128i t0;
     553             : 
     554           0 :         t0 = _mm_unpacklo_epi8(s0, s1);
     555           0 :         s1 = _mm_unpackhi_epi8(s0, s1);
     556           0 :         s0 = _mm_unpacklo_epi8(t0, s1);
     557             : 
     558           0 :         *(int *)comp_pred = _mm_cvtsi128_si32(s0);
     559           0 :         comp_pred += 4;
     560           0 :         ref += 4 * 8;
     561             :       }
     562           0 :       ref += stride - (width << 3);
     563             :     }
     564             :   }
     565           0 : }
     566             : 
     567           0 : void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
     568             :                                       int width, int height, const uint8_t *ref,
     569             :                                       int ref_stride) {
     570           0 :   const __m128i zero = _mm_set1_epi16(0);
     571           0 :   const __m128i one = _mm_set1_epi16(1);
     572             :   int i, j;
     573           0 :   int stride = ref_stride << 3;
     574             : 
     575           0 :   if (width >= 16) {
     576             :     // read 16 points at one time
     577           0 :     for (i = 0; i < height; i++) {
     578           0 :       for (j = 0; j < width; j += 16) {
     579           0 :         __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
     580           0 :         __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
     581           0 :         __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
     582           0 :         __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
     583           0 :         __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
     584           0 :         __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
     585           0 :         __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
     586           0 :         __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
     587           0 :         __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
     588             :         __m128i p1;
     589             :         __m128i t0, t1, t2, t3;
     590             : 
     591           0 :         t0 = _mm_unpacklo_epi8(s0, s1);
     592           0 :         s1 = _mm_unpackhi_epi8(s0, s1);
     593           0 :         t1 = _mm_unpacklo_epi8(s2, s3);
     594           0 :         s3 = _mm_unpackhi_epi8(s2, s3);
     595           0 :         t2 = _mm_unpacklo_epi8(s4, s5);
     596           0 :         s5 = _mm_unpackhi_epi8(s4, s5);
     597           0 :         t3 = _mm_unpacklo_epi8(s6, s7);
     598           0 :         s7 = _mm_unpackhi_epi8(s6, s7);
     599             : 
     600           0 :         s0 = _mm_unpacklo_epi8(t0, s1);
     601           0 :         s2 = _mm_unpacklo_epi8(t1, s3);
     602           0 :         s4 = _mm_unpacklo_epi8(t2, s5);
     603           0 :         s6 = _mm_unpacklo_epi8(t3, s7);
     604             : 
     605           0 :         s0 = _mm_unpacklo_epi32(s0, s2);
     606           0 :         s4 = _mm_unpacklo_epi32(s4, s6);
     607           0 :         s0 = _mm_unpacklo_epi8(s0, zero);
     608           0 :         s4 = _mm_unpacklo_epi8(s4, zero);
     609             : 
     610           0 :         p1 = _mm_unpackhi_epi8(p0, zero);
     611           0 :         p0 = _mm_unpacklo_epi8(p0, zero);
     612           0 :         p0 = _mm_adds_epu16(s0, p0);
     613           0 :         p1 = _mm_adds_epu16(s4, p1);
     614           0 :         p0 = _mm_adds_epu16(p0, one);
     615           0 :         p1 = _mm_adds_epu16(p1, one);
     616             : 
     617           0 :         p0 = _mm_srli_epi16(p0, 1);
     618           0 :         p1 = _mm_srli_epi16(p1, 1);
     619           0 :         p0 = _mm_packus_epi16(p0, p1);
     620             : 
     621             :         _mm_storeu_si128((__m128i *)(comp_pred), p0);
     622           0 :         comp_pred += 16;
     623           0 :         pred += 16;
     624           0 :         ref += 16 * 8;
     625             :       }
     626           0 :       ref += stride - (width << 3);
     627             :     }
     628           0 :   } else if (width >= 8) {
     629             :     // read 8 points at one time
     630           0 :     for (i = 0; i < height; i++) {
     631           0 :       for (j = 0; j < width; j += 8) {
     632           0 :         __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
     633           0 :         __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
     634           0 :         __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
     635           0 :         __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
     636           0 :         __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
     637             :         __m128i t0, t1;
     638             : 
     639           0 :         t0 = _mm_unpacklo_epi8(s0, s1);
     640           0 :         s1 = _mm_unpackhi_epi8(s0, s1);
     641           0 :         t1 = _mm_unpacklo_epi8(s2, s3);
     642           0 :         s3 = _mm_unpackhi_epi8(s2, s3);
     643             : 
     644           0 :         s0 = _mm_unpacklo_epi8(t0, s1);
     645           0 :         s2 = _mm_unpacklo_epi8(t1, s3);
     646           0 :         s0 = _mm_unpacklo_epi32(s0, s2);
     647           0 :         s0 = _mm_unpacklo_epi8(s0, zero);
     648             : 
     649           0 :         p0 = _mm_unpacklo_epi8(p0, zero);
     650           0 :         p0 = _mm_adds_epu16(s0, p0);
     651           0 :         p0 = _mm_adds_epu16(p0, one);
     652           0 :         p0 = _mm_srli_epi16(p0, 1);
     653           0 :         p0 = _mm_packus_epi16(p0, zero);
     654             : 
     655             :         _mm_storel_epi64((__m128i *)(comp_pred), p0);
     656           0 :         comp_pred += 8;
     657           0 :         pred += 8;
     658           0 :         ref += 8 * 8;
     659             :       }
     660           0 :       ref += stride - (width << 3);
     661             :     }
     662             :   } else {
     663             :     // read 4 points at one time
     664           0 :     for (i = 0; i < height; i++) {
     665           0 :       for (j = 0; j < width; j += 4) {
     666           0 :         __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
     667           0 :         __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
     668           0 :         __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred);
     669             :         __m128i t0;
     670             : 
     671           0 :         t0 = _mm_unpacklo_epi8(s0, s1);
     672           0 :         s1 = _mm_unpackhi_epi8(s0, s1);
     673           0 :         s0 = _mm_unpacklo_epi8(t0, s1);
     674           0 :         s0 = _mm_unpacklo_epi8(s0, zero);
     675             : 
     676           0 :         p0 = _mm_unpacklo_epi8(p0, zero);
     677           0 :         p0 = _mm_adds_epu16(s0, p0);
     678           0 :         p0 = _mm_adds_epu16(p0, one);
     679           0 :         p0 = _mm_srli_epi16(p0, 1);
     680           0 :         p0 = _mm_packus_epi16(p0, zero);
     681             : 
     682           0 :         *(int *)comp_pred = _mm_cvtsi128_si32(p0);
     683           0 :         comp_pred += 4;
     684           0 :         pred += 4;
     685           0 :         ref += 4 * 8;
     686             :       }
     687           0 :       ref += stride - (width << 3);
     688             :     }
     689             :   }
     690           0 : }

Generated by: LCOV version 1.13