LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - highbd_variance_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 161 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 113 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>  // SSE2
      13             : 
      14             : #include "./aom_config.h"
      15             : #include "./aom_dsp_rtcd.h"
      16             : 
      17             : #include "aom_ports/mem.h"
      18             : // Per-tile kernel signature: callers in this file pass 16-bit src/ref pointers plus strides and read the tile's results back from *sse and *sum.
      19             : typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
      20             :                                        const uint16_t *ref, int ref_stride,
      21             :                                        uint32_t *sse, int *sum);  // the uint32_t return value is ignored by every caller visible in this file
      22             : // Prototype only; the body is not in this file. Used below as the 8x8 tile kernel (passed together with block_size 8).
      23             : uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
      24             :                                     const uint16_t *ref, int ref_stride,
      25             :                                     uint32_t *sse, int *sum);
      26             : // Prototype only; the body is not in this file. Used below as the 16x16 tile kernel (passed together with block_size 16).
      27             : uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
      28             :                                       const uint16_t *ref, int ref_stride,
      29             :                                       uint32_t *sse, int *sum);
      30             : // Tiles a w x h region into block_size x block_size squares, runs var_fn on each, and adds the per-tile results into *sse and *sum with no rescaling. Every call visible in this file passes w and h that are multiples of block_size.
      31           0 : static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
      32             :                                    const uint16_t *ref, int ref_stride, int w,
      33             :                                    int h, uint32_t *sse, int *sum,
      34             :                                    high_variance_fn_t var_fn, int block_size) {
      35             :   int i, j;  // i: top row of the current tile, j: its left column
      36             : 
      37           0 :   *sse = 0;  // the output locations double as the running totals
      38           0 :   *sum = 0;
      39             : 
      40           0 :   for (i = 0; i < h; i += block_size) {
      41           0 :     for (j = 0; j < w; j += block_size) {
      42             :       unsigned int sse0;  // per-tile results filled in by var_fn
      43             :       int sum0;
      44           0 :       var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
      45             :              ref_stride, &sse0, &sum0);
      46           0 :       *sse += sse0;  // accumulated in 32 bits
      47           0 :       *sum += sum0;
      48             :     }
      49             :   }
      50           0 : }
      51             : // 10-bit flavour of the tiled accumulation: per-tile results from var_fn are summed in locals, then stored to *sum / *sse after ROUND_POWER_OF_TWO by 2 and 4 bits respectively (presumably to bring 10-bit statistics to the 8-bit scale - confirm).
      52           0 : static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
      53             :                                     const uint16_t *ref, int ref_stride, int w,
      54             :                                     int h, uint32_t *sse, int *sum,
      55             :                                     high_variance_fn_t var_fn, int block_size) {
      56             :   int i, j;  // i: top row of the current tile, j: its left column
      57           0 :   uint64_t sse_long = 0;  // 64-bit running total of the tiles' 32-bit SSE values
      58           0 :   int32_t sum_long = 0;  // NOTE: 32-bit despite the "_long" name
      59             : 
      60           0 :   for (i = 0; i < h; i += block_size) {
      61           0 :     for (j = 0; j < w; j += block_size) {
      62             :       unsigned int sse0;  // per-tile results filled in by var_fn
      63             :       int sum0;
      64           0 :       var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
      65             :              ref_stride, &sse0, &sum0);
      66           0 :       sse_long += sse0;
      67           0 :       sum_long += sum0;
      68             :     }
      69             :   }
      70           0 :   *sum = ROUND_POWER_OF_TWO(sum_long, 2);  // ROUND_POWER_OF_TWO is defined outside this file; presumably a rounding right shift - confirm
      71           0 :   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);  // scaled in 64 bits, then narrowed to 32 bits
      72           0 : }
      73             : // 12-bit flavour of the tiled accumulation: per-tile results from var_fn are summed in locals, then stored to *sum / *sse after ROUND_POWER_OF_TWO by 4 and 8 bits respectively (presumably to bring 12-bit statistics to the 8-bit scale - confirm).
      74           0 : static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
      75             :                                     const uint16_t *ref, int ref_stride, int w,
      76             :                                     int h, uint32_t *sse, int *sum,
      77             :                                     high_variance_fn_t var_fn, int block_size) {
      78             :   int i, j;  // i: top row of the current tile, j: its left column
      79           0 :   uint64_t sse_long = 0;  // 64-bit running total of the tiles' 32-bit SSE values
      80           0 :   int32_t sum_long = 0;  // NOTE: 32-bit despite the "_long" name
      81             : 
      82           0 :   for (i = 0; i < h; i += block_size) {
      83           0 :     for (j = 0; j < w; j += block_size) {
      84             :       unsigned int sse0;  // per-tile results filled in by var_fn
      85             :       int sum0;
      86           0 :       var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
      87             :              ref_stride, &sse0, &sum0);
      88           0 :       sse_long += sse0;
      89           0 :       sum_long += sum0;
      90             :     }
      91             :   }
      92           0 :   *sum = ROUND_POWER_OF_TWO(sum_long, 4);  // ROUND_POWER_OF_TWO is defined outside this file; presumably a rounding right shift - confirm
      93           0 :   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);  // scaled in 64 bits, then narrowed to 32 bits
      94           0 : }
      95             : // HIGH_GET_VAR(S) emits aom_highbd_get<S>x<S>var_sse2 plus its _10_ and _12_ siblings: each converts the uint8_t* arguments with CONVERT_TO_SHORTPTR and calls aom_highbd_calc<S>x<S>var_sse2; the 10/12-bit versions then apply ROUND_POWER_OF_TWO to *sum and *sse (2/4 bits and 4/8 bits).
      96             : #define HIGH_GET_VAR(S)                                                       \
      97             :   void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
      98             :                                          const uint8_t *ref8, int ref_stride, \
      99             :                                          uint32_t *sse, int *sum) {           \
     100             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
     101             :     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
     102             :     aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
     103             :                                        sum);                                  \
     104             :   }                                                                           \
     105             :                                                                               \
     106             :   void aom_highbd_10_get##S##x##S##var_sse2(                                  \
     107             :       const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
     108             :       int ref_stride, uint32_t *sse, int *sum) {                              \
     109             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
     110             :     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
     111             :     aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
     112             :                                        sum);                                  \
     113             :     *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
     114             :     *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
     115             :   }                                                                           \
     116             :                                                                               \
     117             :   void aom_highbd_12_get##S##x##S##var_sse2(                                  \
     118             :       const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
     119             :       int ref_stride, uint32_t *sse, int *sum) {                              \
     120             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
     121             :     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
     122             :     aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
     123             :                                        sum);                                  \
     124             :     *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
     125             :     *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
     126             :   }
     127             : // Instantiate the three wrappers for the 16x16 and 8x8 kernels.
     128           0 : HIGH_GET_VAR(16);
     129           0 : HIGH_GET_VAR(8);
     130             : 
     131             : #undef HIGH_GET_VAR
     132             : // VAR_FN(w, h, block_size, shift) emits aom_highbd_{8,10,12}_variance<w>x<h>_sse2: each converts the pointers, runs the matching tiled accumulator with the block_size x block_size kernel (which fills *sse), and returns *sse - ((sum * sum) >> shift). The 10/12-bit versions do the subtraction in int64_t and return 0 when it is negative.
     133             : #define VAR_FN(w, h, block_size, shift)                                    \
     134             :   uint32_t aom_highbd_8_variance##w##x##h##_sse2(                          \
     135             :       const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
     136             :       int ref_stride, uint32_t *sse) {                                     \
     137             :     int sum;                                                               \
     138             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
     139             :     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
     140             :     highbd_8_variance_sse2(                                                \
     141             :         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
     142             :         aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
     143             :     return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
     144             :   }                                                                        \
     145             :                                                                            \
     146             :   uint32_t aom_highbd_10_variance##w##x##h##_sse2(                         \
     147             :       const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
     148             :       int ref_stride, uint32_t *sse) {                                     \
     149             :     int sum;                                                               \
     150             :     int64_t var;                                                           \
     151             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
     152             :     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
     153             :     highbd_10_variance_sse2(                                               \
     154             :         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
     155             :         aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
     156             :     var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
     157             :     return (var >= 0) ? (uint32_t)var : 0;                                 \
     158             :   }                                                                        \
     159             :                                                                            \
     160             :   uint32_t aom_highbd_12_variance##w##x##h##_sse2(                         \
     161             :       const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
     162             :       int ref_stride, uint32_t *sse) {                                     \
     163             :     int sum;                                                               \
     164             :     int64_t var;                                                           \
     165             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
     166             :     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
     167             :     highbd_12_variance_sse2(                                               \
     168             :         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
     169             :         aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
     170             :     var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
     171             :     return (var >= 0) ? (uint32_t)var : 0;                                 \
     172             :   }
     173             : // One instantiation per block size; in every row below shift equals log2(w * h), and block_size picks the 16x16 or 8x8 kernel.
     174           0 : VAR_FN(64, 64, 16, 12);
     175           0 : VAR_FN(64, 32, 16, 11);
     176           0 : VAR_FN(32, 64, 16, 11);
     177           0 : VAR_FN(32, 32, 16, 10);
     178           0 : VAR_FN(32, 16, 16, 9);
     179           0 : VAR_FN(16, 32, 16, 9);
     180           0 : VAR_FN(16, 16, 16, 8);
     181           0 : VAR_FN(16, 8, 8, 7);
     182           0 : VAR_FN(8, 16, 8, 7);
     183           0 : VAR_FN(8, 8, 8, 6);
     184             : 
     185             : #undef VAR_FN
     186             : // Stores the accumulated SSE of a 16x16 block (one 16x16 kernel call, no rescaling) in *sse and returns the same value; the sum is computed but discarded.
     187           0 : unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
     188             :                                         const uint8_t *ref8, int ref_stride,
     189             :                                         unsigned int *sse) {
     190             :   int sum;  // required by the helper's signature; never read here
     191           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     192           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     193           0 :   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
     194             :                          aom_highbd_calc16x16var_sse2, 16);
     195           0 :   return *sse;
     196             : }
     197             : // Stores the SSE of a 16x16 block, as produced by the 10-bit accumulator (which applies its own scaling), in *sse and returns the same value; the sum is computed but discarded.
     198           0 : unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
     199             :                                          const uint8_t *ref8, int ref_stride,
     200             :                                          unsigned int *sse) {
     201             :   int sum;  // required by the helper's signature; never read here
     202           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     203           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     204           0 :   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
     205             :                           aom_highbd_calc16x16var_sse2, 16);
     206           0 :   return *sse;
     207             : }
     208             : // Stores the SSE of a 16x16 block, as produced by the 12-bit accumulator (which applies its own scaling), in *sse and returns the same value; the sum is computed but discarded.
     209           0 : unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
     210             :                                          const uint8_t *ref8, int ref_stride,
     211             :                                          unsigned int *sse) {
     212             :   int sum;  // required by the helper's signature; never read here
     213           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     214           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     215           0 :   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
     216             :                           aom_highbd_calc16x16var_sse2, 16);
     217           0 :   return *sse;
     218             : }
     219             : // Stores the accumulated SSE of an 8x8 block (one 8x8 kernel call, no rescaling) in *sse and returns the same value; the sum is computed but discarded.
     220           0 : unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
     221             :                                       const uint8_t *ref8, int ref_stride,
     222             :                                       unsigned int *sse) {
     223             :   int sum;  // required by the helper's signature; never read here
     224           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     225           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     226           0 :   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
     227             :                          aom_highbd_calc8x8var_sse2, 8);
     228           0 :   return *sse;
     229             : }
     230             : // Stores the SSE of an 8x8 block, as produced by the 10-bit accumulator (which applies its own scaling), in *sse and returns the same value; the sum is computed but discarded.
     231           0 : unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
     232             :                                        const uint8_t *ref8, int ref_stride,
     233             :                                        unsigned int *sse) {
     234             :   int sum;  // required by the helper's signature; never read here
     235           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     236           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     237           0 :   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
     238             :                           aom_highbd_calc8x8var_sse2, 8);
     239           0 :   return *sse;
     240             : }
     241             : // Stores the SSE of an 8x8 block, as produced by the 12-bit accumulator (which applies its own scaling), in *sse and returns the same value; the sum is computed but discarded.
     242           0 : unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
     243             :                                        const uint8_t *ref8, int ref_stride,
     244             :                                        unsigned int *sse) {
     245             :   int sum;  // required by the helper's signature; never read here
     246           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     247           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     248           0 :   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
     249             :                           aom_highbd_calc8x8var_sse2, 8);
     250           0 :   return *sse;
     251             : }
     252             : 
     253             : // The two trailing unused parameters are placeholders for PIC-enabled builds.
     254             : // These are declarations of functions that are defined in
     255             : // highbd_subpel_variance_impl_sse2.asm
     256             : #define DECL(w, opt)                                                         \
     257             :   int aom_highbd_sub_pixel_variance##w##xh_##opt(                            \
     258             :       const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
     259             :       const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
     260             :       unsigned int *sse, void *unused0, void *unused);
     261             : #define DECLS(opt) \
     262             :   DECL(8, opt);    \
     263             :   DECL(16, opt)
     264             : // Emits the prototypes aom_highbd_sub_pixel_variance8xh_sse2 and aom_highbd_sub_pixel_variance16xh_sse2.
     265             : DECLS(sse2);
     266             : 
     267             : #undef DECLS
     268             : #undef DECL
     269             : // FN(w, h, wf, wlog2, hlog2, opt, cast) emits aom_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>: each sums se/sse from the <wf>-wide asm kernel over columns at +0/+16/+32/+48 samples, stores the SSE in *sse_ptr and returns sse - ((cast se * se) >> (wlog2 + hlog2)). The 10/12-bit forms first apply ROUND_POWER_OF_TWO (2/4 and 4/8 bits) and return 0 for a negative result; the 12-bit form also walks the rows in bands of at most 16 with a 64-bit SSE total (presumably to keep each kernel call's 32-bit SSE from overflowing - confirm). The column offsets are hard-coded, so the multi-column path only fits wf == 16.
     270             : #define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
     271             :   uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
     272             :       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
     273             :       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
     274             :     uint32_t sse;                                                              \
     275             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     276             :     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
     277             :     int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
     278             :         src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
     279             :         NULL);                                                                 \
     280             :     if (w > wf) { /* block wider than one kernel call */                       \
     281             :       unsigned int sse2;                                                       \
     282             :       int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
     283             :           src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
     284             :           &sse2, NULL, NULL);                                                  \
     285             :       se += se2;                                                               \
     286             :       sse += sse2;                                                             \
     287             :       if (w > wf * 2) { /* needs columns 3 and 4 too */                        \
     288             :         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
     289             :             src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
     290             :             &sse2, NULL, NULL);                                                \
     291             :         se += se2;                                                             \
     292             :         sse += sse2;                                                           \
     293             :         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
     294             :             src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
     295             :             &sse2, NULL, NULL);                                                \
     296             :         se += se2;                                                             \
     297             :         sse += sse2;                                                           \
     298             :       }                                                                        \
     299             :     }                                                                          \
     300             :     *sse_ptr = sse;                                                            \
     301             :     return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
     302             :   }                                                                            \
     303             :                                                                                \
     304             :   uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
     305             :       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
     306             :       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
     307             :     int64_t var;                                                               \
     308             :     uint32_t sse;                                                              \
     309             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     310             :     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
     311             :     int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
     312             :         src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
     313             :         NULL);                                                                 \
     314             :     if (w > wf) {                                                              \
     315             :       uint32_t sse2;                                                           \
     316             :       int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
     317             :           src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
     318             :           &sse2, NULL, NULL);                                                  \
     319             :       se += se2;                                                               \
     320             :       sse += sse2;                                                             \
     321             :       if (w > wf * 2) {                                                        \
     322             :         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
     323             :             src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
     324             :             &sse2, NULL, NULL);                                                \
     325             :         se += se2;                                                             \
     326             :         sse += sse2;                                                           \
     327             :         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
     328             :             src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
     329             :             &sse2, NULL, NULL);                                                \
     330             :         se += se2;                                                             \
     331             :         sse += sse2;                                                           \
     332             :       }                                                                        \
     333             :     }                                                                          \
     334             :     se = ROUND_POWER_OF_TWO(se, 2);                                            \
     335             :     sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
     336             :     *sse_ptr = sse;                                                            \
     337             :     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
     338             :     return (var >= 0) ? (uint32_t)var : 0;                                     \
     339             :   }                                                                            \
     340             :                                                                                \
     341             :   uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
     342             :       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
     343             :       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
     344             :     int start_row;                                                             \
     345             :     uint32_t sse;                                                              \
     346             :     int se = 0;                                                                \
     347             :     int64_t var;                                                               \
     348             :     uint64_t long_sse = 0;                                                     \
     349             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     350             :     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
     351             :     for (start_row = 0; start_row < h; start_row += 16) { /* 16-row bands */   \
     352             :       uint32_t sse2;                                                           \
     353             :       int height = h - start_row < 16 ? h - start_row : 16; /* rows in band */ \
     354             :       int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
     355             :           src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
     356             :           dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
     357             :           NULL);                                                               \
     358             :       se += se2;                                                               \
     359             :       long_sse += sse2;                                                        \
     360             :       if (w > wf) {                                                            \
     361             :         se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
     362             :             src + 16 + (start_row * src_stride), src_stride, x_offset,         \
     363             :             y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
     364             :             &sse2, NULL, NULL);                                                \
     365             :         se += se2;                                                             \
     366             :         long_sse += sse2;                                                      \
     367             :         if (w > wf * 2) {                                                      \
     368             :           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
     369             :               src + 32 + (start_row * src_stride), src_stride, x_offset,       \
     370             :               y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
     371             :               height, &sse2, NULL, NULL);                                      \
     372             :           se += se2;                                                           \
     373             :           long_sse += sse2;                                                    \
     374             :           se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
     375             :               src + 48 + (start_row * src_stride), src_stride, x_offset,       \
     376             :               y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
     377             :               height, &sse2, NULL, NULL);                                      \
     378             :           se += se2;                                                           \
     379             :           long_sse += sse2;                                                    \
     380             :         }                                                                      \
     381             :       }                                                                        \
     382             :     }                                                                          \
     383             :     se = ROUND_POWER_OF_TWO(se, 4);                                            \
     384             :     sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
     385             :     *sse_ptr = sse;                                                            \
     386             :     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
     387             :     return (var >= 0) ? (uint32_t)var : 0;                                     \
     388             :   }
     389             : // FNS(opt) instantiates FN once per block size; the arguments are (w, h, wf, wlog2, hlog2, opt, cast), where in every row wlog2/hlog2 equal log2(w)/log2(h) and w exceeds wf only when wf is 16.
     390             : #define FNS(opt)                        \
     391             :   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
     392             :   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
     393             :   FN(32, 64, 16, 5, 6, opt, (int64_t)); \
     394             :   FN(32, 32, 16, 5, 5, opt, (int64_t)); \
     395             :   FN(32, 16, 16, 5, 4, opt, (int64_t)); \
     396             :   FN(16, 32, 16, 4, 5, opt, (int64_t)); \
     397             :   FN(16, 16, 16, 4, 4, opt, (int64_t)); \
     398             :   FN(16, 8, 16, 4, 3, opt, (int64_t));  \
     399             :   FN(8, 16, 8, 3, 4, opt, (int64_t));   \
     400             :   FN(8, 8, 8, 3, 3, opt, (int64_t));    \
     401             :   FN(8, 4, 8, 3, 2, opt, (int64_t));
     402             : // Generate the *_sse2 entry points for all sizes listed above.
     403           0 : FNS(sse2);
     404             : 
     405             : #undef FNS
     406             : #undef FN
     407             : 
     408             : // Prototypes for the 16- and 8-sample-wide sub-pixel averaging variance
                       // kernels; their definitions are not part of this section of the file.
                       // The int return value and the value written through sse are consumed
                       // by the FN wrappers that follow.
                       // The 2 unused parameters are placeholders for PIC enabled builds.
     409             : #define DECL(w, opt)                                                         \
     410             :   int aom_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
     411             :       const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
     412             :       const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
     413             :       ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
     414             :       void *unused);
     415             : #define DECLS(opt1) \
     416             :   DECL(16, opt1)    \
     417             :   DECL(8, opt1)
     418             : 
     419             : DECLS(sse2);
     420             : #undef DECL
     421             : #undef DECLS
     422             : 
                       // Defines the 8-, 10- and 12-bit sub-pixel averaging variance functions
                       // for a w x h block, built on the wf-wide kernel declared above.
                       //  - The block is covered in vertical strips: one kernel call at column
                       //    0, a second at column 16 when w > wf, and two more at columns 32
                       //    and 48 when w > wf * 2.
                       //  - sec is handed to the kernel with a row pitch of w.
                       //  - 8-bit: returns sse - ((se * se) >> (wlog2 + hlog2)), unscaled.
                       //  - 10-bit: se and sse first go through ROUND_POWER_OF_TWO by 2 and 4
                       //    bits respectively.
                       //  - 12-bit: rows are processed in groups of at most 16, sse is summed
                       //    in a uint64_t, then se / sse go through ROUND_POWER_OF_TWO by 4 / 8
                       //    bits.
                       //  - The 10- and 12-bit variants return 0 when the variance would be
                       //    negative.
                       // `cast` is placed in front of se before squaring (e.g. (int64_t)) so
                       // the product is evaluated in a wider type. *sse_ptr receives the
                       // (scaled) sse in every variant.
     423             : #define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
     424             :   uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
     425             :       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
     426             :       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
     427             :       const uint8_t *sec8) {                                                   \
     428             :     uint32_t sse;                                                              \
     429             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     430             :     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
     431             :     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
     432             :     int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
     433             :         src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
     434             :         NULL, NULL);                                                           \
     435             :     if (w > wf) {                                                              \
     436             :       uint32_t sse2;                                                           \
     437             :       int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
     438             :           src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
     439             :           sec + 16, w, h, &sse2, NULL, NULL);                                  \
     440             :       se += se2;                                                               \
     441             :       sse += sse2;                                                             \
     442             :       if (w > wf * 2) {                                                        \
     443             :         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
     444             :             src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
     445             :             sec + 32, w, h, &sse2, NULL, NULL);                                \
     446             :         se += se2;                                                             \
     447             :         sse += sse2;                                                           \
     448             :         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
     449             :             src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
     450             :             sec + 48, w, h, &sse2, NULL, NULL);                                \
     451             :         se += se2;                                                             \
     452             :         sse += sse2;                                                           \
     453             :       }                                                                        \
     454             :     }                                                                          \
     455             :     *sse_ptr = sse;                                                            \
     456             :     return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
     457             :   }                                                                            \
     458             :                                                                                \
     459             :   uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
     460             :       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
     461             :       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
     462             :       const uint8_t *sec8) {                                                   \
     463             :     int64_t var;                                                               \
     464             :     uint32_t sse;                                                              \
     465             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     466             :     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
     467             :     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
     468             :     int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
     469             :         src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
     470             :         NULL, NULL);                                                           \
     471             :     if (w > wf) {                                                              \
     472             :       uint32_t sse2;                                                           \
     473             :       int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
     474             :           src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
     475             :           sec + 16, w, h, &sse2, NULL, NULL);                                  \
     476             :       se += se2;                                                               \
     477             :       sse += sse2;                                                             \
     478             :       if (w > wf * 2) {                                                        \
     479             :         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
     480             :             src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
     481             :             sec + 32, w, h, &sse2, NULL, NULL);                                \
     482             :         se += se2;                                                             \
     483             :         sse += sse2;                                                           \
     484             :         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
     485             :             src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
     486             :             sec + 48, w, h, &sse2, NULL, NULL);                                \
     487             :         se += se2;                                                             \
     488             :         sse += sse2;                                                           \
     489             :       }                                                                        \
     490             :     }                                                                          \
     491             :     se = ROUND_POWER_OF_TWO(se, 2);                                            \
     492             :     sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
     493             :     *sse_ptr = sse;                                                            \
     494             :     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
     495             :     return (var >= 0) ? (uint32_t)var : 0;                                     \
     496             :   }                                                                            \
     497             :                                                                                \
     498             :   uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
     499             :       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
     500             :       const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
     501             :       const uint8_t *sec8) {                                                   \
     502             :     int start_row;                                                             \
     503             :     int64_t var;                                                               \
     504             :     uint32_t sse;                                                              \
     505             :     int se = 0;                                                                \
     506             :     uint64_t long_sse = 0;                                                     \
     507             :     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
     508             :     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
     509             :     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
     510             :     for (start_row = 0; start_row < h; start_row += 16) {                      \
     511             :       uint32_t sse2;                                                           \
     512             :       int height = h - start_row < 16 ? h - start_row : 16;                    \
     513             :       int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
     514             :           src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
     515             :           dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
     516             :           w, height, &sse2, NULL, NULL);                                       \
     517             :       se += se2;                                                               \
     518             :       long_sse += sse2;                                                        \
     519             :       if (w > wf) {                                                            \
     520             :         se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
     521             :             src + 16 + (start_row * src_stride), src_stride, x_offset,         \
     522             :             y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
     523             :             sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
     524             :         se += se2;                                                             \
     525             :         long_sse += sse2;                                                      \
     526             :         if (w > wf * 2) {                                                      \
     527             :           se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
     528             :               src + 32 + (start_row * src_stride), src_stride, x_offset,       \
     529             :               y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
     530             :               sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
     531             :           se += se2;                                                           \
     532             :           long_sse += sse2;                                                    \
     533             :           se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
     534             :               src + 48 + (start_row * src_stride), src_stride, x_offset,       \
     535             :               y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
     536             :               sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
     537             :           se += se2;                                                           \
     538             :           long_sse += sse2;                                                    \
     539             :         }                                                                      \
     540             :       }                                                                        \
     541             :     }                                                                          \
     542             :     se = ROUND_POWER_OF_TWO(se, 4);                                            \
     543             :     sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
     544             :     *sse_ptr = sse;                                                            \
     545             :     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
     546             :     return (var >= 0) ? (uint32_t)var : 0;                                     \
     547             :   }
     548             : 
                       // Block sizes instantiated for the averaging variants. Arguments are
                       // (w, h, wf, wlog2, hlog2, opt, cast), where wlog2 / hlog2 are the
                       // base-2 logs of w / h and wf is the width of the kernel that is used.
     549             : #define FNS(opt1)                        \
     550             :   FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
     551             :   FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
     552             :   FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
     553             :   FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
     554             :   FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
     555             :   FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
     556             :   FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
     557             :   FN(16, 8, 16, 4, 3, opt1, (int64_t));  \
     558             :   FN(8, 16, 8, 3, 4, opt1, (int64_t));   \
     559             :   FN(8, 8, 8, 3, 3, opt1, (int64_t));    \
     560             :   FN(8, 4, 8, 3, 2, opt1, (int64_t));
     561             : 
     562           0 : FNS(sse2);
     563             : 
     564             : #undef FNS
     565             : #undef FN
     566             : 
                       // Builds a width x height block in comp_pred by taking one sample out
                       // of every 8 from ref, both along a row and between rows (the row step
                       // is ref_stride << 3). Output rows are written back to back, so
                       // comp_pred has a pitch of width samples.
                       //   comp_pred   destination, width * height uint16_t samples.
                       //   width       block width; handled 8 samples at a time when >= 8,
                       //               otherwise 4 at a time (so it is assumed to be a
                       //               multiple of 8, or exactly 4 -- TODO confirm callers).
                       //   height      number of output rows.
                       //   ref8        reference pointer, converted with CONVERT_TO_SHORTPTR.
                       //   ref_stride  reference stride before the << 3 scaling.
     567           0 : void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
     568             :                                     const uint8_t *ref8, int ref_stride) {
     569             :   int i, j;
     570           0 :   int stride = ref_stride << 3;
     571           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     572             : 
     573           0 :   if (width >= 8) {
     574             :     // read 8 points at one time
     575           0 :     for (i = 0; i < height; i++) {
     576           0 :       for (j = 0; j < width; j += 8) {
                                 // Each vector receives a single 16-bit sample in lane 0, read
                                 // as a uint16_t. Only lane 0 of every s* reaches the stored
                                 // result, so nothing wider than the sample is loaded.
     577           0 :         __m128i s0 = _mm_cvtsi32_si128(ref[0]);
     578           0 :         __m128i s1 = _mm_cvtsi32_si128(ref[8]);
     579           0 :         __m128i s2 = _mm_cvtsi32_si128(ref[16]);
     580           0 :         __m128i s3 = _mm_cvtsi32_si128(ref[24]);
     581           0 :         __m128i s4 = _mm_cvtsi32_si128(ref[32]);
     582           0 :         __m128i s5 = _mm_cvtsi32_si128(ref[40]);
     583           0 :         __m128i s6 = _mm_cvtsi32_si128(ref[48]);
     584           0 :         __m128i s7 = _mm_cvtsi32_si128(ref[56]);
     585             :         __m128i t0, t1, t2, t3;
     586             : 
                                 // Interleave lane 0 of s0..s7 into eight consecutive 16-bit lanes.
     587           0 :         t0 = _mm_unpacklo_epi16(s0, s1);
     588           0 :         t1 = _mm_unpacklo_epi16(s2, s3);
     589           0 :         t2 = _mm_unpacklo_epi16(s4, s5);
     590           0 :         t3 = _mm_unpacklo_epi16(s6, s7);
     591           0 :         t0 = _mm_unpacklo_epi32(t0, t1);
     592           0 :         t2 = _mm_unpacklo_epi32(t2, t3);
     593           0 :         t0 = _mm_unpacklo_epi64(t0, t2);
     594             : 
     595             :         _mm_storeu_si128((__m128i *)(comp_pred), t0);
     596           0 :         comp_pred += 8;
     597           0 :         ref += 64;  // 8 * 8;
     598             :       }
                               // The inner loop advanced ref by width * 8 samples; step to the
                               // start of the next sampled row.
     599           0 :       ref += stride - (width << 3);
     600             :     }
     601             :   } else {
     602             :     // read 4 points at one time
     603           0 :     for (i = 0; i < height; i++) {
     604           0 :       for (j = 0; j < width; j += 4) {
                                 // Same single-sample loads as above, for four outputs.
     605           0 :         __m128i s0 = _mm_cvtsi32_si128(ref[0]);
     606           0 :         __m128i s1 = _mm_cvtsi32_si128(ref[8]);
     607           0 :         __m128i s2 = _mm_cvtsi32_si128(ref[16]);
     608           0 :         __m128i s3 = _mm_cvtsi32_si128(ref[24]);
     609             :         __m128i t0, t1;
     610             : 
     611           0 :         t0 = _mm_unpacklo_epi16(s0, s1);
     612           0 :         t1 = _mm_unpacklo_epi16(s2, s3);
     613           0 :         t0 = _mm_unpacklo_epi32(t0, t1);
     614             : 
                                 // Only the low 64 bits (four samples) are written.
     615             :         _mm_storel_epi64((__m128i *)(comp_pred), t0);
     616           0 :         comp_pred += 4;
     617           0 :         ref += 4 * 8;
     618             :       }
     619           0 :       ref += stride - (width << 3);
     620             :     }
     621             :   }
     622           0 : }
     623             : 
                       // Like the plain upsampled predictor, but averages the gathered
                       // reference samples with pred: every output sample is
                       // (ref_sample + pred_sample + 1) >> 1, computed with saturating
                       // unsigned 16-bit adds. One reference sample out of every 8 is used
                       // along a row, and the row step is ref_stride << 3. Both pred and
                       // comp_pred are walked contiguously (pitch == width).
                       //   comp_pred   destination, width * height uint16_t samples.
                       //   pred8       second predictor, converted with CONVERT_TO_SHORTPTR.
                       //   width       block width; handled 8 samples at a time when >= 8,
                       //               otherwise 4 at a time (so it is assumed to be a
                       //               multiple of 8, or exactly 4 -- TODO confirm callers).
                       //   height      number of output rows.
                       //   ref8        reference pointer, converted with CONVERT_TO_SHORTPTR.
                       //   ref_stride  reference stride before the << 3 scaling.
     624           0 : void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
     625             :                                              const uint8_t *pred8, int width,
     626             :                                              int height, const uint8_t *ref8,
     627             :                                              int ref_stride) {
     628           0 :   const __m128i one = _mm_set1_epi16(1);
     629             :   int i, j;
     630           0 :   int stride = ref_stride << 3;
     631           0 :   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
     632           0 :   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     633             : 
     634           0 :   if (width >= 8) {
     635             :     // read 8 points at one time
     636           0 :     for (i = 0; i < height; i++) {
     637           0 :       for (j = 0; j < width; j += 8) {
                                 // Each vector receives a single 16-bit sample in lane 0, read
                                 // as a uint16_t. Only lane 0 of every s* reaches the stored
                                 // result, so nothing wider than the sample is loaded.
     638           0 :         __m128i s0 = _mm_cvtsi32_si128(ref[0]);
     639           0 :         __m128i s1 = _mm_cvtsi32_si128(ref[8]);
     640           0 :         __m128i s2 = _mm_cvtsi32_si128(ref[16]);
     641           0 :         __m128i s3 = _mm_cvtsi32_si128(ref[24]);
     642           0 :         __m128i s4 = _mm_cvtsi32_si128(ref[32]);
     643           0 :         __m128i s5 = _mm_cvtsi32_si128(ref[40]);
     644           0 :         __m128i s6 = _mm_cvtsi32_si128(ref[48]);
     645           0 :         __m128i s7 = _mm_cvtsi32_si128(ref[56]);
     646           0 :         __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
     647             :         __m128i t0, t1, t2, t3;
     648             : 
                                 // Interleave lane 0 of s0..s7 into eight consecutive 16-bit lanes.
     649           0 :         t0 = _mm_unpacklo_epi16(s0, s1);
     650           0 :         t1 = _mm_unpacklo_epi16(s2, s3);
     651           0 :         t2 = _mm_unpacklo_epi16(s4, s5);
     652           0 :         t3 = _mm_unpacklo_epi16(s6, s7);
     653           0 :         t0 = _mm_unpacklo_epi32(t0, t1);
     654           0 :         t2 = _mm_unpacklo_epi32(t2, t3);
     655           0 :         t0 = _mm_unpacklo_epi64(t0, t2);
     656             : 
                                 // Rounded average of reference and pred: (t0 + p0 + 1) >> 1.
     657           0 :         p0 = _mm_adds_epu16(t0, p0);
     658           0 :         p0 = _mm_adds_epu16(p0, one);
     659           0 :         p0 = _mm_srli_epi16(p0, 1);
     660             : 
     661             :         _mm_storeu_si128((__m128i *)(comp_pred), p0);
     662           0 :         comp_pred += 8;
     663           0 :         pred += 8;
     664           0 :         ref += 8 * 8;
     665             :       }
                               // The inner loop advanced ref by width * 8 samples; step to the
                               // start of the next sampled row.
     666           0 :       ref += stride - (width << 3);
     667             :     }
     668             :   } else {
     669             :     // read 4 points at one time
     670           0 :     for (i = 0; i < height; i++) {
     671           0 :       for (j = 0; j < width; j += 4) {
                                 // Same single-sample loads as above, for four outputs.
     672           0 :         __m128i s0 = _mm_cvtsi32_si128(ref[0]);
     673           0 :         __m128i s1 = _mm_cvtsi32_si128(ref[8]);
     674           0 :         __m128i s2 = _mm_cvtsi32_si128(ref[16]);
     675           0 :         __m128i s3 = _mm_cvtsi32_si128(ref[24]);
     676           0 :         __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
     677             :         __m128i t0, t1;
     678             : 
     679           0 :         t0 = _mm_unpacklo_epi16(s0, s1);
     680           0 :         t1 = _mm_unpacklo_epi16(s2, s3);
     681           0 :         t0 = _mm_unpacklo_epi32(t0, t1);
     682             : 
     683           0 :         p0 = _mm_adds_epu16(t0, p0);
     684           0 :         p0 = _mm_adds_epu16(p0, one);
     685           0 :         p0 = _mm_srli_epi16(p0, 1);
     686             : 
                                 // Only the low 64 bits (four samples) are written.
     687             :         _mm_storel_epi64((__m128i *)(comp_pred), p0);
     688           0 :         comp_pred += 4;
     689           0 :         pred += 4;
     690           0 :         ref += 4 * 8;
     691             :       }
     692           0 :       ref += stride - (width << 3);
     693             :     }
     694             :   }
     695           0 : }

Generated by: LCOV version 1.13