LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp - aom_convolve.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 264 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 34 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <string.h>
      14             : 
      15             : #include "./aom_config.h"
      16             : #include "./aom_dsp_rtcd.h"
      17             : #include "aom/aom_integer.h"
      18             : #include "aom_dsp/aom_convolve.h"
      19             : #include "aom_dsp/aom_dsp_common.h"
      20             : #include "aom_dsp/aom_filter.h"
      21             : #include "aom_ports/mem.h"
      22             : 
      23           0 : static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
      24             :                            uint8_t *dst, ptrdiff_t dst_stride,
      25             :                            const InterpKernel *x_filters, int x0_q4,
      26             :                            int x_step_q4, int w, int h) {
      27             :   int x, y;
      28           0 :   src -= SUBPEL_TAPS / 2 - 1;
      29           0 :   for (y = 0; y < h; ++y) {
      30           0 :     int x_q4 = x0_q4;
      31           0 :     for (x = 0; x < w; ++x) {
      32           0 :       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      33           0 :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      34           0 :       int k, sum = 0;
      35           0 :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      36           0 :       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      37           0 :       x_q4 += x_step_q4;
      38             :     }
      39           0 :     src += src_stride;
      40           0 :     dst += dst_stride;
      41             :   }
      42           0 : }
      43             : 
      44           0 : static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
      45             :                                uint8_t *dst, ptrdiff_t dst_stride,
      46             :                                const InterpKernel *x_filters, int x0_q4,
      47             :                                int x_step_q4, int w, int h) {
      48             :   int x, y;
      49           0 :   src -= SUBPEL_TAPS / 2 - 1;
      50           0 :   for (y = 0; y < h; ++y) {
      51           0 :     int x_q4 = x0_q4;
      52           0 :     for (x = 0; x < w; ++x) {
      53           0 :       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      54           0 :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      55           0 :       int k, sum = 0;
      56           0 :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      57           0 :       dst[x] = ROUND_POWER_OF_TWO(
      58             :           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      59           0 :       x_q4 += x_step_q4;
      60             :     }
      61           0 :     src += src_stride;
      62           0 :     dst += dst_stride;
      63             :   }
      64           0 : }
      65             : 
      66           0 : static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
      67             :                           uint8_t *dst, ptrdiff_t dst_stride,
      68             :                           const InterpKernel *y_filters, int y0_q4,
      69             :                           int y_step_q4, int w, int h) {
      70             :   int x, y;
      71           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
      72             : 
      73           0 :   for (x = 0; x < w; ++x) {
      74           0 :     int y_q4 = y0_q4;
      75           0 :     for (y = 0; y < h; ++y) {
      76           0 :       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      77           0 :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      78           0 :       int k, sum = 0;
      79           0 :       for (k = 0; k < SUBPEL_TAPS; ++k)
      80           0 :         sum += src_y[k * src_stride] * y_filter[k];
      81           0 :       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      82           0 :       y_q4 += y_step_q4;
      83             :     }
      84           0 :     ++src;
      85           0 :     ++dst;
      86             :   }
      87           0 : }
      88             : 
      89           0 : static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
      90             :                               uint8_t *dst, ptrdiff_t dst_stride,
      91             :                               const InterpKernel *y_filters, int y0_q4,
      92             :                               int y_step_q4, int w, int h) {
      93             :   int x, y;
      94           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
      95             : 
      96           0 :   for (x = 0; x < w; ++x) {
      97           0 :     int y_q4 = y0_q4;
      98           0 :     for (y = 0; y < h; ++y) {
      99           0 :       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     100           0 :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     101           0 :       int k, sum = 0;
     102           0 :       for (k = 0; k < SUBPEL_TAPS; ++k)
     103           0 :         sum += src_y[k * src_stride] * y_filter[k];
     104           0 :       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
     105             :           dst[y * dst_stride] +
     106             :               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
     107             :           1);
     108           0 :       y_q4 += y_step_q4;
     109             :     }
     110           0 :     ++src;
     111           0 :     ++dst;
     112             :   }
     113           0 : }
     114             : 
     115           0 : static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     116             :                      ptrdiff_t dst_stride, const InterpKernel *const x_filters,
     117             :                      int x0_q4, int x_step_q4,
     118             :                      const InterpKernel *const y_filters, int y0_q4,
     119             :                      int y_step_q4, int w, int h) {
     120             :   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
     121             :   // 2d filtering proceeds in 2 steps:
     122             :   //   (1) Interpolate horizontally into an intermediate buffer, temp.
     123             :   //   (2) Interpolate temp vertically to derive the sub-pixel result.
     124             :   // Deriving the maximum number of rows in the temp buffer (135):
     125             :   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
     126             :   // --Largest block size is 64x64 pixels.
     127             :   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
     128             :   //   original frame (in 1/16th pixel units).
     129             :   // --Must round-up because block may be located at sub-pixel position.
     130             :   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
     131             :   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
     132             :   uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
     133           0 :   int intermediate_height =
     134           0 :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
     135             : 
     136           0 :   assert(w <= MAX_SB_SIZE);
     137           0 :   assert(h <= MAX_SB_SIZE);
     138             : 
     139           0 :   assert(y_step_q4 <= 32);
     140           0 :   assert(x_step_q4 <= 32);
     141             : 
     142           0 :   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
     143             :                  MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
     144             :                  intermediate_height);
     145           0 :   convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
     146             :                 dst_stride, y_filters, y0_q4, y_step_q4, w, h);
     147           0 : }
     148             : 
     149           0 : static const InterpKernel *get_filter_base(const int16_t *filter) {
     150             :   // NOTE: This assumes that the filter table is 256-byte aligned.
     151             :   // TODO(agrange) Modify to make independent of table alignment.
     152           0 :   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
     153             : }
     154             : 
     155           0 : static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
     156           0 :   return (int)((const InterpKernel *)(intptr_t)f - base);
     157             : }
     158             : 
     159           0 : void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
     160             :                            uint8_t *dst, ptrdiff_t dst_stride,
     161             :                            const int16_t *filter_x, int x_step_q4,
     162             :                            const int16_t *filter_y, int y_step_q4, int w,
     163             :                            int h) {
     164           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     165           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     166             : 
     167             :   (void)filter_y;
     168             :   (void)y_step_q4;
     169             : 
     170           0 :   convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
     171             :                  w, h);
     172           0 : }
     173             : 
     174           0 : void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
     175             :                                uint8_t *dst, ptrdiff_t dst_stride,
     176             :                                const int16_t *filter_x, int x_step_q4,
     177             :                                const int16_t *filter_y, int y_step_q4, int w,
     178             :                                int h) {
     179           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     180           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     181             : 
     182             :   (void)filter_y;
     183             :   (void)y_step_q4;
     184             : 
     185           0 :   convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     186             :                      x_step_q4, w, h);
     187           0 : }
     188             : 
     189           0 : void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
     190             :                           uint8_t *dst, ptrdiff_t dst_stride,
     191             :                           const int16_t *filter_x, int x_step_q4,
     192             :                           const int16_t *filter_y, int y_step_q4, int w,
     193             :                           int h) {
     194           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     195           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     196             : 
     197             :   (void)filter_x;
     198             :   (void)x_step_q4;
     199             : 
     200           0 :   convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
     201             :                 w, h);
     202           0 : }
     203             : 
     204           0 : void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
     205             :                               uint8_t *dst, ptrdiff_t dst_stride,
     206             :                               const int16_t *filter_x, int x_step_q4,
     207             :                               const int16_t *filter_y, int y_step_q4, int w,
     208             :                               int h) {
     209           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     210           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     211             : 
     212             :   (void)filter_x;
     213             :   (void)x_step_q4;
     214             : 
     215           0 :   convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
     216             :                     y_step_q4, w, h);
     217           0 : }
     218             : 
     219           0 : void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     220             :                      ptrdiff_t dst_stride, const int16_t *filter_x,
     221             :                      int x_step_q4, const int16_t *filter_y, int y_step_q4,
     222             :                      int w, int h) {
     223           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     224           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     225             : 
     226           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     227           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     228             : 
     229           0 :   convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
     230             :            filters_y, y0_q4, y_step_q4, w, h);
     231           0 : }
     232             : 
     233           0 : void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     234             :                          ptrdiff_t dst_stride, const int16_t *filter_x,
     235             :                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
     236             :                          int w, int h) {
     237             :   /* Fixed size intermediate buffer places limits on parameters. */
     238             :   DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
     239           0 :   assert(w <= MAX_SB_SIZE);
     240           0 :   assert(h <= MAX_SB_SIZE);
     241             : 
     242           0 :   aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
     243             :                   filter_y, y_step_q4, w, h);
     244           0 :   aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
     245             :                      h);
     246           0 : }
     247             : 
     // Copies a w x h block of bytes from src to dst, one row at a time.
     //
     // src_stride / dst_stride are the distances between consecutive rows.  The
     // filter arguments exist only to match the common convolve signature and are
     // ignored.
     void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                              ptrdiff_t dst_stride, const int16_t *filter_x,
                              int filter_x_stride, const int16_t *filter_y,
                              int filter_y_stride, int w, int h) {
       int rows_left = h;

       (void)filter_x;
       (void)filter_x_stride;
       (void)filter_y;
       (void)filter_y_stride;

       while (rows_left > 0) {
         memcpy(dst, src, w);
         src += src_stride;
         dst += dst_stride;
         --rows_left;
       }
     }
     265             : 
     // Blends a w x h block of src into dst.
     //
     // Every dst pixel is replaced by ROUND_POWER_OF_TWO(dst + src, 1) of itself and
     // the co-located src pixel.  The filter arguments exist only to match the
     // common convolve signature and are ignored.
     void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                             ptrdiff_t dst_stride, const int16_t *filter_x,
                             int filter_x_stride, const int16_t *filter_y,
                             int filter_y_stride, int w, int h) {
       int row;

       (void)filter_x;
       (void)filter_x_stride;
       (void)filter_y;
       (void)filter_y_stride;

       for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
         int col;
         for (col = 0; col < w; ++col) {
           dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
         }
       }
     }
     284             : 
     // Scaled horizontal convolution: forwards every argument unchanged to
     // aom_convolve8_horiz_c().
     void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                             ptrdiff_t dst_stride, const int16_t *filter_x,
                             int x_step_q4, const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
       aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
     }
     292             : 
     // Scaled vertical convolution: forwards every argument unchanged to
     // aom_convolve8_vert_c().
     void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                            ptrdiff_t dst_stride, const int16_t *filter_x,
                            int x_step_q4, const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
       aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
     }
     300             : 
     // Scaled 2-D convolution: forwards every argument unchanged to
     // aom_convolve8_c().
     void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter_x,
                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
       aom_convolve8_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
     }
     308             : 
     // Scaled, blending horizontal convolution: forwards every argument unchanged
     // to aom_convolve8_avg_horiz_c().
     void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
       aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
     }
     317             : 
     // Scaled, blending vertical convolution: forwards every argument unchanged to
     // aom_convolve8_avg_vert_c().
     void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h) {
       aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
     }
     326             : 
     // Scaled, blending 2-D convolution: forwards every argument unchanged to
     // aom_convolve8_avg_c().
     void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                              ptrdiff_t dst_stride, const int16_t *filter_x,
                              int x_step_q4, const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
       aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
     }
     334             : 
     335             : #if CONFIG_LOOP_RESTORATION
     336             : static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
     337             :                                    uint8_t *dst, ptrdiff_t dst_stride,
     338             :                                    const InterpKernel *x_filters, int x0_q4,
     339             :                                    int x_step_q4, int w, int h) {
     340             :   int x, y, k;
     341             :   src -= SUBPEL_TAPS / 2 - 1;
     342             :   for (y = 0; y < h; ++y) {
     343             :     int x_q4 = x0_q4;
     344             :     for (x = 0; x < w; ++x) {
     345             :       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     346             :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     347             :       int sum = 0;
     348             :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
     349             :       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
     350             :                           src_x[SUBPEL_TAPS / 2 - 1]);
     351             :       x_q4 += x_step_q4;
     352             :     }
     353             :     src += src_stride;
     354             :     dst += dst_stride;
     355             :   }
     356             : }
     357             : 
     358             : static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
     359             :                                   uint8_t *dst, ptrdiff_t dst_stride,
     360             :                                   const InterpKernel *y_filters, int y0_q4,
     361             :                                   int y_step_q4, int w, int h) {
     362             :   int x, y, k;
     363             :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     364             : 
     365             :   for (x = 0; x < w; ++x) {
     366             :     int y_q4 = y0_q4;
     367             :     for (y = 0; y < h; ++y) {
     368             :       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     369             :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     370             :       int sum = 0;
     371             :       for (k = 0; k < SUBPEL_TAPS; ++k)
     372             :         sum += src_y[k * src_stride] * y_filter[k];
     373             :       dst[y * dst_stride] =
     374             :           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
     375             :                      src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
     376             :       y_q4 += y_step_q4;
     377             :     }
     378             :     ++src;
     379             :     ++dst;
     380             :   }
     381             : }
     382             : 
     383             : static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
     384             :                              uint8_t *dst, ptrdiff_t dst_stride,
     385             :                              const InterpKernel *const x_filters, int x0_q4,
     386             :                              int x_step_q4, const InterpKernel *const y_filters,
     387             :                              int y0_q4, int y_step_q4, int w, int h) {
     388             :   uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
     389             :   int intermediate_height =
     390             :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
     391             : 
     392             :   assert(w <= MAX_SB_SIZE);
     393             :   assert(h <= MAX_SB_SIZE);
     394             : 
     395             :   assert(y_step_q4 <= 32);
     396             :   assert(x_step_q4 <= 32);
     397             : 
     398             :   convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
     399             :                          temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
     400             :                          intermediate_height);
     401             :   convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
     402             :                         dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
     403             : }
     404             : 
     405             : void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
     406             :                                    uint8_t *dst, ptrdiff_t dst_stride,
     407             :                                    const int16_t *filter_x, int x_step_q4,
     408             :                                    const int16_t *filter_y, int y_step_q4,
     409             :                                    int w, int h) {
     410             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     411             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     412             : 
     413             :   (void)filter_y;
     414             :   (void)y_step_q4;
     415             : 
     416             :   convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     417             :                          x_step_q4, w, h);
     418             : }
     419             : 
     420             : void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
     421             :                                   uint8_t *dst, ptrdiff_t dst_stride,
     422             :                                   const int16_t *filter_x, int x_step_q4,
     423             :                                   const int16_t *filter_y, int y_step_q4, int w,
     424             :                                   int h) {
     425             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     426             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     427             : 
     428             :   (void)filter_x;
     429             :   (void)x_step_q4;
     430             : 
     431             :   convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
     432             :                         y_step_q4, w, h);
     433             : }
     434             : 
     435             : void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
     436             :                              uint8_t *dst, ptrdiff_t dst_stride,
     437             :                              const int16_t *filter_x, int x_step_q4,
     438             :                              const int16_t *filter_y, int y_step_q4, int w,
     439             :                              int h) {
     440             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     441             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     442             : 
     443             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     444             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     445             : 
     446             :   convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     447             :                    x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
     448             : }
     449             : 
     450             : static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
     451             :                                        uint16_t *dst, ptrdiff_t dst_stride,
     452             :                                        const InterpKernel *x_filters, int x0_q4,
     453             :                                        int x_step_q4, int w, int h) {
     454             :   const int bd = 8;
     455             :   int x, y, k;
     456             :   src -= SUBPEL_TAPS / 2 - 1;
     457             :   for (y = 0; y < h; ++y) {
     458             :     int x_q4 = x0_q4;
     459             :     for (x = 0; x < w; ++x) {
     460             :       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     461             :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     462             :       int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
     463             :                 (1 << (bd + FILTER_BITS - 1));
     464             :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
     465             :       dst[x] =
     466             :           (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
     467             :                           0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
     468             :       x_q4 += x_step_q4;
     469             :     }
     470             :     src += src_stride;
     471             :     dst += dst_stride;
     472             :   }
     473             : }
     474             : 
     475             : static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
     476             :                                       uint8_t *dst, ptrdiff_t dst_stride,
     477             :                                       const InterpKernel *y_filters, int y0_q4,
     478             :                                       int y_step_q4, int w, int h) {
     479             :   const int bd = 8;
     480             :   int x, y, k;
     481             :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     482             : 
     483             :   for (x = 0; x < w; ++x) {
     484             :     int y_q4 = y0_q4;
     485             :     for (y = 0; y < h; ++y) {
     486             :       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     487             :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     488             :       int sum =
     489             :           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
     490             :           (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
     491             :       for (k = 0; k < SUBPEL_TAPS; ++k)
     492             :         sum += src_y[k * src_stride] * y_filter[k];
     493             :       dst[y * dst_stride] =
     494             :           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
     495             :       y_q4 += y_step_q4;
     496             :     }
     497             :     ++src;
     498             :     ++dst;
     499             :   }
     500             : }
     501             : 
     502             : static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
     503             :                                  uint8_t *dst, ptrdiff_t dst_stride,
     504             :                                  const InterpKernel *const x_filters, int x0_q4,
     505             :                                  int x_step_q4,
     506             :                                  const InterpKernel *const y_filters, int y0_q4,
     507             :                                  int y_step_q4, int w, int h) {
     508             :   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
     509             :   int intermediate_height =
     510             :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
     511             : 
     512             :   assert(w <= MAX_SB_SIZE);
     513             :   assert(h <= MAX_SB_SIZE);
     514             : 
     515             :   assert(y_step_q4 <= 32);
     516             :   assert(x_step_q4 <= 32);
     517             : 
     518             :   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
     519             :                              src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
     520             :                              x_step_q4, w, intermediate_height);
     521             :   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
     522             :                             MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
     523             :                             y_step_q4, w, h);
     524             : }
     525             : 
     526             : void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
     527             :                                        uint16_t *dst, ptrdiff_t dst_stride,
     528             :                                        const int16_t *filter_x, int x_step_q4,
     529             :                                        const int16_t *filter_y, int y_step_q4,
     530             :                                        int w, int h) {
     531             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     532             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     533             : 
     534             :   (void)filter_y;
     535             :   (void)y_step_q4;
     536             : 
     537             :   convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     538             :                              x_step_q4, w, h);
     539             : }
     540             : 
     541             : void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
     542             :                                       uint8_t *dst, ptrdiff_t dst_stride,
     543             :                                       const int16_t *filter_x, int x_step_q4,
     544             :                                       const int16_t *filter_y, int y_step_q4,
     545             :                                       int w, int h) {
     546             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     547             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     548             : 
     549             :   (void)filter_x;
     550             :   (void)x_step_q4;
     551             : 
     552             :   convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
     553             :                             y_step_q4, w, h);
     554             : }
     555             : 
     556             : void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
     557             :                                  uint8_t *dst, ptrdiff_t dst_stride,
     558             :                                  const int16_t *filter_x, int x_step_q4,
     559             :                                  const int16_t *filter_y, int y_step_q4, int w,
     560             :                                  int h) {
     561             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     562             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     563             : 
     564             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     565             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     566             : 
     567             :   convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     568             :                        x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
     569             : }
     570             : #endif  // CONFIG_LOOP_RESTORATION
     571             : 
     572             : #if CONFIG_HIGHBITDEPTH
     573           0 : static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
     574             :                                   uint8_t *dst8, ptrdiff_t dst_stride,
     575             :                                   const InterpKernel *x_filters, int x0_q4,
     576             :                                   int x_step_q4, int w, int h, int bd) {
     577             :   int x, y;
     578           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     579           0 :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     580           0 :   src -= SUBPEL_TAPS / 2 - 1;
     581           0 :   for (y = 0; y < h; ++y) {
     582           0 :     int x_q4 = x0_q4;
     583           0 :     for (x = 0; x < w; ++x) {
     584           0 :       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     585           0 :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     586           0 :       int k, sum = 0;
     587           0 :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
     588           0 :       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     589           0 :       x_q4 += x_step_q4;
     590             :     }
     591           0 :     src += src_stride;
     592           0 :     dst += dst_stride;
     593             :   }
     594           0 : }
     595             : 
     596           0 : static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
     597             :                                       uint8_t *dst8, ptrdiff_t dst_stride,
     598             :                                       const InterpKernel *x_filters, int x0_q4,
     599             :                                       int x_step_q4, int w, int h, int bd) {
     600             :   int x, y;
     601           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     602           0 :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     603           0 :   src -= SUBPEL_TAPS / 2 - 1;
     604           0 :   for (y = 0; y < h; ++y) {
     605           0 :     int x_q4 = x0_q4;
     606           0 :     for (x = 0; x < w; ++x) {
     607           0 :       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     608           0 :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     609           0 :       int k, sum = 0;
     610           0 :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
     611           0 :       dst[x] = ROUND_POWER_OF_TWO(
     612             :           dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
     613             :           1);
     614           0 :       x_q4 += x_step_q4;
     615             :     }
     616           0 :     src += src_stride;
     617           0 :     dst += dst_stride;
     618             :   }
     619           0 : }
     620             : 
     621           0 : static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
     622             :                                  uint8_t *dst8, ptrdiff_t dst_stride,
     623             :                                  const InterpKernel *y_filters, int y0_q4,
     624             :                                  int y_step_q4, int w, int h, int bd) {
     625             :   int x, y;
     626           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     627           0 :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     628           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     629           0 :   for (x = 0; x < w; ++x) {
     630           0 :     int y_q4 = y0_q4;
     631           0 :     for (y = 0; y < h; ++y) {
     632           0 :       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     633           0 :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     634           0 :       int k, sum = 0;
     635           0 :       for (k = 0; k < SUBPEL_TAPS; ++k)
     636           0 :         sum += src_y[k * src_stride] * y_filter[k];
     637           0 :       dst[y * dst_stride] =
     638           0 :           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     639           0 :       y_q4 += y_step_q4;
     640             :     }
     641           0 :     ++src;
     642           0 :     ++dst;
     643             :   }
     644           0 : }
     645             : 
     646           0 : static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
     647             :                                      uint8_t *dst8, ptrdiff_t dst_stride,
     648             :                                      const InterpKernel *y_filters, int y0_q4,
     649             :                                      int y_step_q4, int w, int h, int bd) {
     650             :   int x, y;
     651           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     652           0 :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     653           0 :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     654           0 :   for (x = 0; x < w; ++x) {
     655           0 :     int y_q4 = y0_q4;
     656           0 :     for (y = 0; y < h; ++y) {
     657           0 :       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     658           0 :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     659           0 :       int k, sum = 0;
     660           0 :       for (k = 0; k < SUBPEL_TAPS; ++k)
     661           0 :         sum += src_y[k * src_stride] * y_filter[k];
     662           0 :       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
     663             :           dst[y * dst_stride] +
     664             :               clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
     665             :           1);
     666           0 :       y_q4 += y_step_q4;
     667             :     }
     668           0 :     ++src;
     669           0 :     ++dst;
     670             :   }
     671           0 : }
     672             : 
     673           0 : static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
     674             :                             uint8_t *dst, ptrdiff_t dst_stride,
     675             :                             const InterpKernel *const x_filters, int x0_q4,
     676             :                             int x_step_q4, const InterpKernel *const y_filters,
     677             :                             int y0_q4, int y_step_q4, int w, int h, int bd) {
     678             :   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
     679             :   // 2d filtering proceeds in 2 steps:
     680             :   //   (1) Interpolate horizontally into an intermediate buffer, temp.
     681             :   //   (2) Interpolate temp vertically to derive the sub-pixel result.
     682             :   // Deriving the maximum number of rows in the temp buffer (135):
     683             :   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
     684             :   // --Largest block size is 64x64 pixels.
     685             :   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
     686             :   //   original frame (in 1/16th pixel units).
     687             :   // --Must round-up because block may be located at sub-pixel position.
     688             :   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
     689             :   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
     690             :   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
     691           0 :   int intermediate_height =
     692           0 :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
     693             : 
     694           0 :   assert(w <= MAX_SB_SIZE);
     695           0 :   assert(h <= MAX_SB_SIZE);
     696           0 :   assert(y_step_q4 <= 32);
     697           0 :   assert(x_step_q4 <= 32);
     698             : 
     699           0 :   highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
     700           0 :                         CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
     701             :                         x_step_q4, w, intermediate_height, bd);
     702           0 :   highbd_convolve_vert(
     703           0 :       CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
     704             :       MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
     705           0 : }
     706             : 
     707           0 : void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
     708             :                                   uint8_t *dst, ptrdiff_t dst_stride,
     709             :                                   const int16_t *filter_x, int x_step_q4,
     710             :                                   const int16_t *filter_y, int y_step_q4, int w,
     711             :                                   int h, int bd) {
     712           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     713           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     714             :   (void)filter_y;
     715             :   (void)y_step_q4;
     716             : 
     717           0 :   highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     718             :                         x_step_q4, w, h, bd);
     719           0 : }
     720             : 
     721           0 : void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
     722             :                                       uint8_t *dst, ptrdiff_t dst_stride,
     723             :                                       const int16_t *filter_x, int x_step_q4,
     724             :                                       const int16_t *filter_y, int y_step_q4,
     725             :                                       int w, int h, int bd) {
     726           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     727           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     728             :   (void)filter_y;
     729             :   (void)y_step_q4;
     730             : 
     731           0 :   highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     732             :                             x_step_q4, w, h, bd);
     733           0 : }
     734             : 
     735           0 : void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
     736             :                                  uint8_t *dst, ptrdiff_t dst_stride,
     737             :                                  const int16_t *filter_x, int x_step_q4,
     738             :                                  const int16_t *filter_y, int y_step_q4, int w,
     739             :                                  int h, int bd) {
     740           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     741           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     742             :   (void)filter_x;
     743             :   (void)x_step_q4;
     744             : 
     745           0 :   highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
     746             :                        y_step_q4, w, h, bd);
     747           0 : }
     748             : 
     749           0 : void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
     750             :                                      uint8_t *dst, ptrdiff_t dst_stride,
     751             :                                      const int16_t *filter_x, int x_step_q4,
     752             :                                      const int16_t *filter_y, int y_step_q4,
     753             :                                      int w, int h, int bd) {
     754           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     755           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     756             :   (void)filter_x;
     757             :   (void)x_step_q4;
     758             : 
     759           0 :   highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
     760             :                            y_step_q4, w, h, bd);
     761           0 : }
     762             : 
     763           0 : void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
     764             :                             uint8_t *dst, ptrdiff_t dst_stride,
     765             :                             const int16_t *filter_x, int x_step_q4,
     766             :                             const int16_t *filter_y, int y_step_q4, int w,
     767             :                             int h, int bd) {
     768           0 :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     769           0 :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     770             : 
     771           0 :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     772           0 :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     773             : 
     774           0 :   highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
     775             :                   filters_y, y0_q4, y_step_q4, w, h, bd);
     776           0 : }
     777             : 
     778           0 : void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
     779             :                                 uint8_t *dst, ptrdiff_t dst_stride,
     780             :                                 const int16_t *filter_x, int x_step_q4,
     781             :                                 const int16_t *filter_y, int y_step_q4, int w,
     782             :                                 int h, int bd) {
     783             :   // Fixed size intermediate buffer places limits on parameters.
     784             :   DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
     785           0 :   assert(w <= MAX_SB_SIZE);
     786           0 :   assert(h <= MAX_SB_SIZE);
     787             : 
     788           0 :   aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
     789             :                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
     790           0 :   aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
     791             :                             dst_stride, NULL, 0, NULL, 0, w, h, bd);
     792           0 : }
     793             : 
     794           0 : void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
     795             :                                 uint8_t *dst8, ptrdiff_t dst_stride,
     796             :                                 const int16_t *filter_x, int filter_x_stride,
     797             :                                 const int16_t *filter_y, int filter_y_stride,
     798             :                                 int w, int h, int bd) {
     799             :   int r;
     800           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     801           0 :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     802             :   (void)filter_x;
     803             :   (void)filter_y;
     804             :   (void)filter_x_stride;
     805             :   (void)filter_y_stride;
     806             :   (void)bd;
     807             : 
     808           0 :   for (r = h; r > 0; --r) {
     809           0 :     memcpy(dst, src, w * sizeof(uint16_t));
     810           0 :     src += src_stride;
     811           0 :     dst += dst_stride;
     812             :   }
     813           0 : }
     814             : 
     815           0 : void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
     816             :                                uint8_t *dst8, ptrdiff_t dst_stride,
     817             :                                const int16_t *filter_x, int filter_x_stride,
     818             :                                const int16_t *filter_y, int filter_y_stride,
     819             :                                int w, int h, int bd) {
     820             :   int x, y;
     821           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     822           0 :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     823             :   (void)filter_x;
     824             :   (void)filter_y;
     825             :   (void)filter_x_stride;
     826             :   (void)filter_y_stride;
     827             :   (void)bd;
     828             : 
     829           0 :   for (y = 0; y < h; ++y) {
     830           0 :     for (x = 0; x < w; ++x) {
     831           0 :       dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
     832             :     }
     833           0 :     src += src_stride;
     834           0 :     dst += dst_stride;
     835             :   }
     836           0 : }
     837             : 
     838             : #if CONFIG_LOOP_RESTORATION
     839             : static void highbd_convolve_add_src_horiz(const uint8_t *src8,
     840             :                                           ptrdiff_t src_stride, uint8_t *dst8,
     841             :                                           ptrdiff_t dst_stride,
     842             :                                           const InterpKernel *x_filters,
     843             :                                           int x0_q4, int x_step_q4, int w,
     844             :                                           int h, int bd) {
     845             :   int x, y, k;
     846             :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     847             :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     848             :   src -= SUBPEL_TAPS / 2 - 1;
     849             :   for (y = 0; y < h; ++y) {
     850             :     int x_q4 = x0_q4;
     851             :     for (x = 0; x < w; ++x) {
     852             :       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     853             :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     854             :       int sum = 0;
     855             :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
     856             :       dst[x] = clip_pixel_highbd(
     857             :           ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
     858             :           bd);
     859             :       x_q4 += x_step_q4;
     860             :     }
     861             :     src += src_stride;
     862             :     dst += dst_stride;
     863             :   }
     864             : }
     865             : 
     866             : static void highbd_convolve_add_src_vert(const uint8_t *src8,
     867             :                                          ptrdiff_t src_stride, uint8_t *dst8,
     868             :                                          ptrdiff_t dst_stride,
     869             :                                          const InterpKernel *y_filters,
     870             :                                          int y0_q4, int y_step_q4, int w, int h,
     871             :                                          int bd) {
     872             :   int x, y, k;
     873             :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     874             :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     875             :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
     876             :   for (x = 0; x < w; ++x) {
     877             :     int y_q4 = y0_q4;
     878             :     for (y = 0; y < h; ++y) {
     879             :       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
     880             :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
     881             :       int sum = 0;
     882             :       for (k = 0; k < SUBPEL_TAPS; ++k)
     883             :         sum += src_y[k * src_stride] * y_filter[k];
     884             :       dst[y * dst_stride] =
     885             :           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
     886             :                                 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
     887             :                             bd);
     888             :       y_q4 += y_step_q4;
     889             :     }
     890             :     ++src;
     891             :     ++dst;
     892             :   }
     893             : }
     894             : 
     895             : static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
     896             :                                     uint8_t *dst, ptrdiff_t dst_stride,
     897             :                                     const InterpKernel *const x_filters,
     898             :                                     int x0_q4, int x_step_q4,
     899             :                                     const InterpKernel *const y_filters,
     900             :                                     int y0_q4, int y_step_q4, int w, int h,
     901             :                                     int bd) {
     902             :   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
     903             :   // 2d filtering proceeds in 2 steps:
     904             :   //   (1) Interpolate horizontally into an intermediate buffer, temp.
     905             :   //   (2) Interpolate temp vertically to derive the sub-pixel result.
     906             :   // Deriving the maximum number of rows in the temp buffer (135):
     907             :   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
     908             :   // --Largest block size is 64x64 pixels.
     909             :   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
     910             :   //   original frame (in 1/16th pixel units).
     911             :   // --Must round-up because block may be located at sub-pixel position.
     912             :   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
     913             :   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
     914             :   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
     915             :   int intermediate_height =
     916             :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
     917             : 
     918             :   assert(w <= MAX_SB_SIZE);
     919             :   assert(h <= MAX_SB_SIZE);
     920             :   assert(y_step_q4 <= 32);
     921             :   assert(x_step_q4 <= 32);
     922             : 
     923             :   highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
     924             :                                 src_stride, CONVERT_TO_BYTEPTR(temp),
     925             :                                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
     926             :                                 intermediate_height, bd);
     927             :   highbd_convolve_add_src_vert(
     928             :       CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
     929             :       MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
     930             : }
     931             : 
     932             : void aom_highbd_convolve8_add_src_horiz_c(
     933             :     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     934             :     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
     935             :     const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
     936             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     937             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     938             :   (void)filter_y;
     939             :   (void)y_step_q4;
     940             : 
     941             :   highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
     942             :                                 x0_q4, x_step_q4, w, h, bd);
     943             : }
     944             : 
     945             : void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
     946             :                                          ptrdiff_t src_stride, uint8_t *dst,
     947             :                                          ptrdiff_t dst_stride,
     948             :                                          const int16_t *filter_x, int x_step_q4,
     949             :                                          const int16_t *filter_y, int y_step_q4,
     950             :                                          int w, int h, int bd) {
     951             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     952             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     953             :   (void)filter_x;
     954             :   (void)x_step_q4;
     955             : 
     956             :   highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
     957             :                                y0_q4, y_step_q4, w, h, bd);
     958             : }
     959             : 
     960             : void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
     961             :                                     uint8_t *dst, ptrdiff_t dst_stride,
     962             :                                     const int16_t *filter_x, int x_step_q4,
     963             :                                     const int16_t *filter_y, int y_step_q4,
     964             :                                     int w, int h, int bd) {
     965             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
     966             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
     967             : 
     968             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
     969             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
     970             : 
     971             :   highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
     972             :                           x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
     973             : }
     974             : 
     975             : static void highbd_convolve_add_src_horiz_hip(
     976             :     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
     977             :     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
     978             :     int x_step_q4, int w, int h, int bd) {
     979             :   const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
     980             :   int x, y, k;
     981             :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     982             :   src -= SUBPEL_TAPS / 2 - 1;
     983             :   for (y = 0; y < h; ++y) {
     984             :     int x_q4 = x0_q4;
     985             :     for (x = 0; x < w; ++x) {
     986             :       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
     987             :       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
     988             :       int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
     989             :                 (1 << (bd + FILTER_BITS - 1));
     990             :       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
     991             :       dst[x] =
     992             :           (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
     993             :                           0, extraprec_clamp_limit - 1);
     994             :       x_q4 += x_step_q4;
     995             :     }
     996             :     src += src_stride;
     997             :     dst += dst_stride;
     998             :   }
     999             : }
    1000             : 
    1001             : static void highbd_convolve_add_src_vert_hip(
    1002             :     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
    1003             :     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
    1004             :     int y_step_q4, int w, int h, int bd) {
    1005             :   int x, y, k;
    1006             :   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
    1007             :   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
    1008             :   for (x = 0; x < w; ++x) {
    1009             :     int y_q4 = y0_q4;
    1010             :     for (y = 0; y < h; ++y) {
    1011             :       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    1012             :       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    1013             :       int sum =
    1014             :           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
    1015             :           (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
    1016             :       for (k = 0; k < SUBPEL_TAPS; ++k)
    1017             :         sum += src_y[k * src_stride] * y_filter[k];
    1018             :       dst[y * dst_stride] = clip_pixel_highbd(
    1019             :           ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
    1020             :       y_q4 += y_step_q4;
    1021             :     }
    1022             :     ++src;
    1023             :     ++dst;
    1024             :   }
    1025             : }
    1026             : 
    1027             : static void highbd_convolve_add_src_hip(
    1028             :     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    1029             :     ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
    1030             :     int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
    1031             :     int y_step_q4, int w, int h, int bd) {
    1032             :   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
    1033             :   // 2d filtering proceeds in 2 steps:
    1034             :   //   (1) Interpolate horizontally into an intermediate buffer, temp.
    1035             :   //   (2) Interpolate temp vertically to derive the sub-pixel result.
    1036             :   // Deriving the maximum number of rows in the temp buffer (135):
    1037             :   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
    1038             :   // --Largest block size is 64x64 pixels.
    1039             :   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
    1040             :   //   original frame (in 1/16th pixel units).
    1041             :   // --Must round-up because block may be located at sub-pixel position.
    1042             :   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
    1043             :   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
    1044             :   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
    1045             :   int intermediate_height =
    1046             :       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
    1047             : 
    1048             :   assert(w <= MAX_SB_SIZE);
    1049             :   assert(h <= MAX_SB_SIZE);
    1050             :   assert(y_step_q4 <= 32);
    1051             :   assert(x_step_q4 <= 32);
    1052             : 
    1053             :   highbd_convolve_add_src_horiz_hip(
    1054             :       src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
    1055             :       x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
    1056             :   highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
    1057             :                                    MAX_SB_SIZE, dst, dst_stride, y_filters,
    1058             :                                    y0_q4, y_step_q4, w, h, bd);
    1059             : }
    1060             : 
    1061             : void aom_highbd_convolve8_add_src_horiz_hip_c(
    1062             :     const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
    1063             :     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    1064             :     const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
    1065             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
    1066             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
    1067             :   (void)filter_y;
    1068             :   (void)y_step_q4;
    1069             : 
    1070             :   highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x,
    1071             :                                     x0_q4, x_step_q4, w, h, bd);
    1072             : }
    1073             : 
    1074             : void aom_highbd_convolve8_add_src_vert_hip_c(
    1075             :     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
    1076             :     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    1077             :     const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
    1078             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
    1079             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
    1080             :   (void)filter_x;
    1081             :   (void)x_step_q4;
    1082             : 
    1083             :   highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y,
    1084             :                                    y0_q4, y_step_q4, w, h, bd);
    1085             : }
    1086             : 
    1087             : void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
    1088             :                                         ptrdiff_t src_stride, uint8_t *dst,
    1089             :                                         ptrdiff_t dst_stride,
    1090             :                                         const int16_t *filter_x, int x_step_q4,
    1091             :                                         const int16_t *filter_y, int y_step_q4,
    1092             :                                         int w, int h, int bd) {
    1093             :   const InterpKernel *const filters_x = get_filter_base(filter_x);
    1094             :   const int x0_q4 = get_filter_offset(filter_x, filters_x);
    1095             : 
    1096             :   const InterpKernel *const filters_y = get_filter_base(filter_y);
    1097             :   const int y0_q4 = get_filter_offset(filter_y, filters_y);
    1098             : 
    1099             :   highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
    1100             :                               x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
    1101             :                               h, bd);
    1102             : }
    1103             : 
    1104             : #endif  // CONFIG_LOOP_RESTORATION
    1105             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13