LCOV - code coverage report
Current view: top level - third_party/aom/av1/common - warped_motion.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 623 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 42 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <stdio.h>
      13             : #include <stdlib.h>
      14             : #include <memory.h>
      15             : #include <math.h>
      16             : #include <assert.h>
      17             : 
      18             : #include "./av1_rtcd.h"
      19             : #include "av1/common/warped_motion.h"
      20             : 
      21             : /* clang-format off */
      22             : static const int error_measure_lut[512] = {
      23             :   // pow 0.7
      24             :   16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
      25             :   16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
      26             :   15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
      27             :   15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
      28             :   14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
      29             :   14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
      30             :   14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
      31             :   13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
      32             :   13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
      33             :   12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
      34             :   12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
      35             :   12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
      36             :   11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
      37             :   11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
      38             :   10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
      39             :   10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
      40             :   10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
      41             :   9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
      42             :   9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
      43             :   8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
      44             :   8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
      45             :   7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
      46             :   7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
      47             :   6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
      48             :   6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
      49             :   5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
      50             :   5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
      51             :   4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
      52             :   3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
      53             :   3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
      54             :   2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
      55             :   1323, 1187, 1045,  894,  731,  550,  339,    0,
      56             :   339,  550,  731,  894, 1045, 1187, 1323, 1452,
      57             :   1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
      58             :   2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
      59             :   3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
      60             :   3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
      61             :   4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
      62             :   5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
      63             :   5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
      64             :   6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
      65             :   6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
      66             :   7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
      67             :   7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
      68             :   8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
      69             :   8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
      70             :   9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
      71             :   9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
      72             :   10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
      73             :   10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
      74             :   11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
      75             :   11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
      76             :   11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
      77             :   12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
      78             :   12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
      79             :   13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
      80             :   13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
      81             :   13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
      82             :   14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
      83             :   14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
      84             :   15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
      85             :   15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
      86             :   15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
      87             :   16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
      88             : };
      89             : /* clang-format on */
      90             : 
      91           0 : static ProjectPointsFunc get_project_points_type(TransformationType type) {
      92           0 :   switch (type) {
      93           0 :     case HOMOGRAPHY: return project_points_homography;
      94           0 :     case AFFINE: return project_points_affine;
      95           0 :     case ROTZOOM: return project_points_rotzoom;
      96           0 :     case TRANSLATION: return project_points_translation;
      97           0 :     default: assert(0); return NULL;
      98             :   }
      99             : }
     100             : 
     101           0 : void project_points_translation(const int32_t *mat, int *points, int *proj,
     102             :                                 const int n, const int stride_points,
     103             :                                 const int stride_proj, const int subsampling_x,
     104             :                                 const int subsampling_y) {
     105             :   int i;
     106           0 :   for (i = 0; i < n; ++i) {
     107           0 :     const int x = *(points++), y = *(points++);
     108           0 :     if (subsampling_x)
     109           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     110             :           ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
     111             :           WARPEDDIFF_PREC_BITS + 1);
     112             :     else
     113           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     114             :           ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[0]), WARPEDDIFF_PREC_BITS);
     115           0 :     if (subsampling_y)
     116           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     117             :           ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
     118             :           WARPEDDIFF_PREC_BITS + 1);
     119             :     else
     120           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     121             :           ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[1], WARPEDDIFF_PREC_BITS);
     122           0 :     points += stride_points - 2;
     123           0 :     proj += stride_proj - 2;
     124             :   }
     125           0 : }
     126             : 
     127           0 : void project_points_rotzoom(const int32_t *mat, int *points, int *proj,
     128             :                             const int n, const int stride_points,
     129             :                             const int stride_proj, const int subsampling_x,
     130             :                             const int subsampling_y) {
     131             :   int i;
     132           0 :   for (i = 0; i < n; ++i) {
     133           0 :     const int x = *(points++), y = *(points++);
     134           0 :     if (subsampling_x)
     135           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     136             :           mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
     137             :               (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
     138             :           WARPEDDIFF_PREC_BITS + 1);
     139             :     else
     140           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
     141             :                                             WARPEDDIFF_PREC_BITS);
     142           0 :     if (subsampling_y)
     143           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     144             :           -mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
     145             :               (-mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
     146             :           WARPEDDIFF_PREC_BITS + 1);
     147             :     else
     148           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[3] * x + mat[2] * y + mat[1],
     149             :                                             WARPEDDIFF_PREC_BITS);
     150           0 :     points += stride_points - 2;
     151           0 :     proj += stride_proj - 2;
     152             :   }
     153           0 : }
     154             : 
     155           0 : void project_points_affine(const int32_t *mat, int *points, int *proj,
     156             :                            const int n, const int stride_points,
     157             :                            const int stride_proj, const int subsampling_x,
     158             :                            const int subsampling_y) {
     159             :   int i;
     160           0 :   for (i = 0; i < n; ++i) {
     161           0 :     const int x = *(points++), y = *(points++);
     162           0 :     if (subsampling_x)
     163           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     164             :           mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
     165             :               (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
     166             :           WARPEDDIFF_PREC_BITS + 1);
     167             :     else
     168           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
     169             :                                             WARPEDDIFF_PREC_BITS);
     170           0 :     if (subsampling_y)
     171           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
     172             :           mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
     173             :               (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
     174             :           WARPEDDIFF_PREC_BITS + 1);
     175             :     else
     176           0 :       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
     177             :                                             WARPEDDIFF_PREC_BITS);
     178           0 :     points += stride_points - 2;
     179           0 :     proj += stride_proj - 2;
     180             :   }
     181           0 : }
     182             : 
     183           0 : void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj,
     184             :                                  const int n, const int stride_points,
     185             :                                  const int stride_proj, const int subsampling_x,
     186             :                                  const int subsampling_y) {
     187             :   int i;
     188             :   int64_t x, y, Z;
     189             :   int64_t xp, yp;
     190           0 :   for (i = 0; i < n; ++i) {
     191           0 :     x = *(points++), y = *(points++);
     192           0 :     x = (subsampling_x ? 4 * x + 1 : 2 * x);
     193           0 :     y = (subsampling_y ? 4 * y + 1 : 2 * y);
     194             : 
     195           0 :     Z = (mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
     196           0 :     xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
     197             :          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
     198             :                 WARPEDMODEL_PREC_BITS));
     199           0 :     yp = (mat[5] * y + 2 * mat[1]) *
     200             :          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
     201             :                 WARPEDMODEL_PREC_BITS));
     202             : 
     203           0 :     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
     204           0 :     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
     205             : 
     206           0 :     if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
     207           0 :     if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
     208           0 :     *(proj++) = (int)xp;
     209           0 :     *(proj++) = (int)yp;
     210             : 
     211           0 :     points += stride_points - 2;
     212           0 :     proj += stride_proj - 2;
     213             :   }
     214           0 : }
     215             : 
     216           0 : void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj,
     217             :                                  const int n, const int stride_points,
     218             :                                  const int stride_proj, const int subsampling_x,
     219             :                                  const int subsampling_y) {
     220             :   int i;
     221             :   int64_t x, y, Z;
     222             :   int64_t xp, yp;
     223           0 :   for (i = 0; i < n; ++i) {
     224           0 :     x = *(points++), y = *(points++);
     225           0 :     x = (subsampling_x ? 4 * x + 1 : 2 * x);
     226           0 :     y = (subsampling_y ? 4 * y + 1 : 2 * y);
     227             : 
     228           0 :     Z = (mat[6] * x + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
     229           0 :     xp = (mat[2] * x + 2 * mat[0]) *
     230             :          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
     231             :                 WARPEDMODEL_PREC_BITS));
     232           0 :     yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
     233             :          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
     234             :                 WARPEDMODEL_PREC_BITS));
     235             : 
     236           0 :     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
     237           0 :     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
     238             : 
     239           0 :     if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
     240           0 :     if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
     241           0 :     *(proj++) = (int)xp;
     242           0 :     *(proj++) = (int)yp;
     243             : 
     244           0 :     points += stride_points - 2;
     245           0 :     proj += stride_proj - 2;
     246             :   }
     247           0 : }
     248             : 
     249           0 : void project_points_homography(const int32_t *mat, int *points, int *proj,
     250             :                                const int n, const int stride_points,
     251             :                                const int stride_proj, const int subsampling_x,
     252             :                                const int subsampling_y) {
     253             :   int i;
     254             :   int64_t x, y, Z;
     255             :   int64_t xp, yp;
     256           0 :   for (i = 0; i < n; ++i) {
     257           0 :     x = *(points++), y = *(points++);
     258           0 :     x = (subsampling_x ? 4 * x + 1 : 2 * x);
     259           0 :     y = (subsampling_y ? 4 * y + 1 : 2 * y);
     260             : 
     261           0 :     Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
     262           0 :     xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
     263             :          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
     264             :                 WARPEDMODEL_PREC_BITS));
     265           0 :     yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
     266             :          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
     267             :                 WARPEDMODEL_PREC_BITS));
     268             : 
     269           0 :     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
     270           0 :     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
     271             : 
     272           0 :     if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
     273           0 :     if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
     274           0 :     *(proj++) = (int)xp;
     275           0 :     *(proj++) = (int)yp;
     276             : 
     277           0 :     points += stride_points - 2;
     278           0 :     proj += stride_proj - 2;
     279             :   }
     280           0 : }
     281             : 
     282             : // 'points' are at original scale, output 'proj's are scaled up by
     283             : // 1 << WARPEDPIXEL_PREC_BITS
     284           0 : void project_points(const WarpedMotionParams *wm_params, int *points, int *proj,
     285             :                     const int n, const int stride_points, const int stride_proj,
     286             :                     const int subsampling_x, const int subsampling_y) {
     287           0 :   switch (wm_params->wmtype) {
     288             :     case AFFINE:
     289           0 :       project_points_affine(wm_params->wmmat, points, proj, n, stride_points,
     290             :                             stride_proj, subsampling_x, subsampling_y);
     291           0 :       break;
     292             :     case ROTZOOM:
     293           0 :       project_points_rotzoom(wm_params->wmmat, points, proj, n, stride_points,
     294             :                              stride_proj, subsampling_x, subsampling_y);
     295           0 :       break;
     296             :     case HOMOGRAPHY:
     297           0 :       project_points_homography(wm_params->wmmat, points, proj, n,
     298             :                                 stride_points, stride_proj, subsampling_x,
     299             :                                 subsampling_y);
     300           0 :       break;
     301           0 :     default: assert(0 && "Invalid warped motion type!"); return;
     302             :   }
     303           0 : }
     304             : 
     305             : static const int16_t
     306             :     filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
     307             : #if WARPEDPIXEL_PREC_BITS == 6
     308             :       { 0, 0, 128, 0, 0, 0 },      { 0, -1, 128, 2, -1, 0 },
     309             :       { 1, -3, 127, 4, -1, 0 },    { 1, -4, 126, 6, -2, 1 },
     310             :       { 1, -5, 126, 8, -3, 1 },    { 1, -6, 125, 11, -4, 1 },
     311             :       { 1, -7, 124, 13, -4, 1 },   { 2, -8, 123, 15, -5, 1 },
     312             :       { 2, -9, 122, 18, -6, 1 },   { 2, -10, 121, 20, -6, 1 },
     313             :       { 2, -11, 120, 22, -7, 2 },  { 2, -12, 119, 25, -8, 2 },
     314             :       { 3, -13, 117, 27, -8, 2 },  { 3, -13, 116, 29, -9, 2 },
     315             :       { 3, -14, 114, 32, -10, 3 }, { 3, -15, 113, 35, -10, 2 },
     316             :       { 3, -15, 111, 37, -11, 3 }, { 3, -16, 109, 40, -11, 3 },
     317             :       { 3, -16, 108, 42, -12, 3 }, { 4, -17, 106, 45, -13, 3 },
     318             :       { 4, -17, 104, 47, -13, 3 }, { 4, -17, 102, 50, -14, 3 },
     319             :       { 4, -17, 100, 52, -14, 3 }, { 4, -18, 98, 55, -15, 4 },
     320             :       { 4, -18, 96, 58, -15, 3 },  { 4, -18, 94, 60, -16, 4 },
     321             :       { 4, -18, 91, 63, -16, 4 },  { 4, -18, 89, 65, -16, 4 },
     322             :       { 4, -18, 87, 68, -17, 4 },  { 4, -18, 85, 70, -17, 4 },
     323             :       { 4, -18, 82, 73, -17, 4 },  { 4, -18, 80, 75, -17, 4 },
     324             :       { 4, -18, 78, 78, -18, 4 },  { 4, -17, 75, 80, -18, 4 },
     325             :       { 4, -17, 73, 82, -18, 4 },  { 4, -17, 70, 85, -18, 4 },
     326             :       { 4, -17, 68, 87, -18, 4 },  { 4, -16, 65, 89, -18, 4 },
     327             :       { 4, -16, 63, 91, -18, 4 },  { 4, -16, 60, 94, -18, 4 },
     328             :       { 3, -15, 58, 96, -18, 4 },  { 4, -15, 55, 98, -18, 4 },
     329             :       { 3, -14, 52, 100, -17, 4 }, { 3, -14, 50, 102, -17, 4 },
     330             :       { 3, -13, 47, 104, -17, 4 }, { 3, -13, 45, 106, -17, 4 },
     331             :       { 3, -12, 42, 108, -16, 3 }, { 3, -11, 40, 109, -16, 3 },
     332             :       { 3, -11, 37, 111, -15, 3 }, { 2, -10, 35, 113, -15, 3 },
     333             :       { 3, -10, 32, 114, -14, 3 }, { 2, -9, 29, 116, -13, 3 },
     334             :       { 2, -8, 27, 117, -13, 3 },  { 2, -8, 25, 119, -12, 2 },
     335             :       { 2, -7, 22, 120, -11, 2 },  { 1, -6, 20, 121, -10, 2 },
     336             :       { 1, -6, 18, 122, -9, 2 },   { 1, -5, 15, 123, -8, 2 },
     337             :       { 1, -4, 13, 124, -7, 1 },   { 1, -4, 11, 125, -6, 1 },
     338             :       { 1, -3, 8, 126, -5, 1 },    { 1, -2, 6, 126, -4, 1 },
     339             :       { 0, -1, 4, 127, -3, 1 },    { 0, -1, 2, 128, -1, 0 },
     340             : #elif WARPEDPIXEL_PREC_BITS == 5
     341             :       { 0, 0, 128, 0, 0, 0 },      { 1, -3, 127, 4, -1, 0 },
     342             :       { 1, -5, 126, 8, -3, 1 },    { 1, -7, 124, 13, -4, 1 },
     343             :       { 2, -9, 122, 18, -6, 1 },   { 2, -11, 120, 22, -7, 2 },
     344             :       { 3, -13, 117, 27, -8, 2 },  { 3, -14, 114, 32, -10, 3 },
     345             :       { 3, -15, 111, 37, -11, 3 }, { 3, -16, 108, 42, -12, 3 },
     346             :       { 4, -17, 104, 47, -13, 3 }, { 4, -17, 100, 52, -14, 3 },
     347             :       { 4, -18, 96, 58, -15, 3 },  { 4, -18, 91, 63, -16, 4 },
     348             :       { 4, -18, 87, 68, -17, 4 },  { 4, -18, 82, 73, -17, 4 },
     349             :       { 4, -18, 78, 78, -18, 4 },  { 4, -17, 73, 82, -18, 4 },
     350             :       { 4, -17, 68, 87, -18, 4 },  { 4, -16, 63, 91, -18, 4 },
     351             :       { 3, -15, 58, 96, -18, 4 },  { 3, -14, 52, 100, -17, 4 },
     352             :       { 3, -13, 47, 104, -17, 4 }, { 3, -12, 42, 108, -16, 3 },
     353             :       { 3, -11, 37, 111, -15, 3 }, { 3, -10, 32, 114, -14, 3 },
     354             :       { 2, -8, 27, 117, -13, 3 },  { 2, -7, 22, 120, -11, 2 },
     355             :       { 1, -6, 18, 122, -9, 2 },   { 1, -4, 13, 124, -7, 1 },
     356             :       { 1, -3, 8, 126, -5, 1 },    { 0, -1, 4, 127, -3, 1 },
     357             : #endif  // WARPEDPIXEL_PREC_BITS == 6
     358             :     };
     359             : 
     360           0 : static int32_t do_ntap_filter(const int32_t *const p, int x) {
     361             :   int i;
     362           0 :   int32_t sum = 0;
     363           0 :   for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
     364           0 :     sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
     365             :   }
     366           0 :   return sum;
     367             : }
     368             : 
     369           0 : static int32_t do_cubic_filter(const int32_t *const p, int x) {
     370           0 :   if (x == 0) {
     371           0 :     return p[0] * (1 << WARPEDPIXEL_FILTER_BITS);
     372           0 :   } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
     373           0 :     return p[1] * (1 << WARPEDPIXEL_FILTER_BITS);
     374             :   } else {
     375           0 :     const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
     376           0 :     const int64_t v2 =
     377           0 :         (int64_t)x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
     378           0 :     const int64_t v3 = x * (p[1] - p[-1]);
     379           0 :     const int64_t v4 = 2 * p[0];
     380           0 :     return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
     381             :         (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
     382             :             (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
     383             :             (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
     384             :         3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
     385             :   }
     386             : }
     387             : 
     388           0 : static INLINE void get_subcolumn(int taps, const uint8_t *const ref,
     389             :                                  int32_t *col, int stride, int x, int y_start) {
     390             :   int i;
     391           0 :   for (i = 0; i < taps; ++i) {
     392           0 :     col[i] = ref[(i + y_start) * stride + x];
     393             :   }
     394           0 : }
     395             : 
     396           0 : static uint8_t bi_ntap_filter(const uint8_t *const ref, int x, int y,
     397             :                               int stride) {
     398             :   int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
     399             :   int k;
     400           0 :   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
     401           0 :   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
     402           0 :   for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
     403             :     int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
     404           0 :     get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
     405           0 :                   i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
     406             :                   j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
     407           0 :     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
     408           0 :                             y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
     409             :   }
     410           0 :   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
     411           0 :                        x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
     412           0 :   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
     413           0 :   return (uint8_t)clip_pixel(val);
     414             : }
     415             : 
     416           0 : static uint8_t bi_cubic_filter(const uint8_t *const ref, int x, int y,
     417             :                                int stride) {
     418             :   int32_t val, arr[4];
     419             :   int k;
     420           0 :   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
     421           0 :   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
     422           0 :   for (k = 0; k < 4; ++k) {
     423             :     int32_t arr_temp[4];
     424           0 :     get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
     425           0 :     arr[k] =
     426           0 :         do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
     427             :   }
     428           0 :   val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
     429           0 :   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
     430           0 :   return (uint8_t)clip_pixel(val);
     431             : }
     432             : 
     433           0 : static uint8_t bi_linear_filter(const uint8_t *const ref, int x, int y,
     434             :                                 int stride) {
     435           0 :   const int ix = x >> WARPEDPIXEL_PREC_BITS;
     436           0 :   const int iy = y >> WARPEDPIXEL_PREC_BITS;
     437           0 :   const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
     438           0 :   const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
     439             :   int32_t val;
     440           0 :   val = ROUND_POWER_OF_TWO_SIGNED(
     441             :       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
     442             :               (WARPEDPIXEL_PREC_SHIFTS - sx) +
     443             :           ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
     444             :           ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
     445             :           ref[(iy + 1) * stride + ix + 1] * sy * sx,
     446             :       WARPEDPIXEL_PREC_BITS * 2);
     447           0 :   return (uint8_t)clip_pixel(val);
     448             : }
     449             : 
     450           0 : static uint8_t warp_interpolate(const uint8_t *const ref, int x, int y,
     451             :                                 int width, int height, int stride) {
     452           0 :   const int ix = x >> WARPEDPIXEL_PREC_BITS;
     453           0 :   const int iy = y >> WARPEDPIXEL_PREC_BITS;
     454           0 :   const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
     455           0 :   const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
     456             :   int32_t v;
     457             : 
     458           0 :   if (ix < 0 && iy < 0)
     459           0 :     return ref[0];
     460           0 :   else if (ix < 0 && iy >= height - 1)
     461           0 :     return ref[(height - 1) * stride];
     462           0 :   else if (ix >= width - 1 && iy < 0)
     463           0 :     return ref[width - 1];
     464           0 :   else if (ix >= width - 1 && iy >= height - 1)
     465           0 :     return ref[(height - 1) * stride + (width - 1)];
     466           0 :   else if (ix < 0) {
     467           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     468             :         ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
     469             :             ref[(iy + 1) * stride] * sy,
     470             :         WARPEDPIXEL_PREC_BITS);
     471           0 :     return clip_pixel(v);
     472           0 :   } else if (iy < 0) {
     473           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     474             :         ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
     475             :         WARPEDPIXEL_PREC_BITS);
     476           0 :     return clip_pixel(v);
     477           0 :   } else if (ix >= width - 1) {
     478           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     479             :         ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
     480             :             ref[(iy + 1) * stride + width - 1] * sy,
     481             :         WARPEDPIXEL_PREC_BITS);
     482           0 :     return clip_pixel(v);
     483           0 :   } else if (iy >= height - 1) {
     484           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     485             :         ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
     486             :             ref[(height - 1) * stride + ix + 1] * sx,
     487             :         WARPEDPIXEL_PREC_BITS);
     488           0 :     return clip_pixel(v);
     489           0 :   } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
     490           0 :              iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
     491           0 :              ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
     492           0 :              iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
     493           0 :     return bi_ntap_filter(ref, x, y, stride);
     494           0 :   } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
     495           0 :     return bi_cubic_filter(ref, x, y, stride);
     496             :   } else {
     497           0 :     return bi_linear_filter(ref, x, y, stride);
     498             :   }
     499             : }
     500             : 
     501             : // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
     502             : // at a time. The zoom/rotation/shear in the model are applied to the
     503             : // "fractional" position of each pixel, which therefore varies within
     504             : // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
     505             : // We need an extra 2 taps to fit this in, for a total of 8 taps.
     506             : /* clang-format off */
     507             : const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
     508             : #if WARPEDPIXEL_PREC_BITS == 6
     509             :   // [-1, 0)
     510             :   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
     511             :   { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
     512             :   { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
     513             :   { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
     514             :   { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
     515             :   { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
     516             :   { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
     517             :   { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
     518             :   { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
     519             :   { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
     520             :   { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
     521             :   { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
     522             :   { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
     523             :   { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
     524             :   { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
     525             :   { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
     526             :   { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
     527             :   { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
     528             :   { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
     529             :   { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
     530             :   { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
     531             :   { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
     532             :   { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
     533             :   { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
     534             :   { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
     535             :   { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
     536             :   { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
     537             :   { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
     538             :   { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
     539             :   { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
     540             :   { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
     541             :   { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
     542             : 
     543             :   // [0, 1)
     544             :   { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
     545             :   { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
     546             :   { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
     547             :   {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
     548             :   {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
     549             :   {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
     550             :   {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
     551             :   {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
     552             :   {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
     553             :   {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
     554             :   {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
     555             :   {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
     556             :   {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
     557             :   {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
     558             :   {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
     559             :   {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
     560             :   {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
     561             :   {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
     562             :   {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
     563             :   {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
     564             :   {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
     565             :   {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
     566             :   {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
     567             :   {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
     568             :   {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
     569             :   {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
     570             :   {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
     571             :   {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
     572             :   {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
     573             :   {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
     574             :   { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
     575             :   { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
     576             : 
     577             :   // [1, 2)
     578             :   { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
     579             :   { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
     580             :   { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
     581             :   { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
     582             :   { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
     583             :   { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
     584             :   { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
     585             :   { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
     586             :   { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
     587             :   { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
     588             :   { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
     589             :   { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
     590             :   { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
     591             :   { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
     592             :   { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
     593             :   { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
     594             :   { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
     595             :   { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
     596             :   { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
     597             :   { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
     598             :   { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
     599             :   { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
     600             :   { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
     601             :   { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
     602             :   { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
     603             :   { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
     604             :   { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
     605             :   { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
     606             :   { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
     607             :   { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
     608             :   { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
     609             :   { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
     610             :   // dummy (replicate row index 191)
     611             :   { 0, 0, 0,   0,   2, 127, - 1, 0 },
     612             : 
     613             : #elif WARPEDPIXEL_PREC_BITS == 5
     614             :   // [-1, 0)
     615             :   {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
     616             :   {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
     617             :   {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
     618             :   {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
     619             :   {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
     620             :   {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
     621             :   {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
     622             :   {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
     623             :   {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
     624             :   {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
     625             :   {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
     626             :   {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
     627             :   {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
     628             :   {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
     629             :   {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
     630             :   {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
     631             :   // [0, 1)
     632             :   { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
     633             :   { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
     634             :   {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
     635             :   {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
     636             :   {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
     637             :   {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
     638             :   {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
     639             :   {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
     640             :   {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
     641             :   {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
     642             :   {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
     643             :   {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
     644             :   {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
     645             :   {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
     646             :   {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
     647             :   { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
     648             :   // [1, 2)
     649             :   {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
     650             :   {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
     651             :   {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
     652             :   {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
     653             :   {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
     654             :   {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
     655             :   {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
     656             :   {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
     657             :   {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
     658             :   {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
     659             :   {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
     660             :   {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
     661             :   {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
     662             :   {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
     663             :   {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
     664             :   {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
     665             :   // dummy (replicate row index 95)
     666             :   {0, 0, 0,  -1,   4, 127,  -3, 1},
     667             : 
     668             : #endif  // WARPEDPIXEL_PREC_BITS == 6
     669             : };
     670             : 
     671             : /* clang-format on */
     672             : 
     673             : #define DIV_LUT_PREC_BITS 14
     674             : #define DIV_LUT_BITS 8
     675             : #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
     676             : 
     677             : static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
     678             :   16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
     679             :   15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
     680             :   15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
     681             :   14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
     682             :   13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
     683             :   13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
     684             :   13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
     685             :   12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
     686             :   12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
     687             :   11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
     688             :   11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
     689             :   11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
     690             :   10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
     691             :   10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
     692             :   10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
     693             :   9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
     694             :   9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
     695             :   9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
     696             :   9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
     697             :   9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
     698             :   8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
     699             :   8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
     700             :   8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
     701             :   8240,  8224,  8208,  8192,
     702             : };
     703             : 
     704             : #if CONFIG_WARPED_MOTION
     705             : // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
     706             : // at precision of DIV_LUT_PREC_BITS along with the shift.
     707           0 : static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
     708             :   int64_t e, f;
     709           0 :   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
     710           0 :                                : get_msb((unsigned int)D));
     711             :   // e is obtained from D after resetting the most significant 1 bit.
     712           0 :   e = D - ((uint64_t)1 << *shift);
     713             :   // Get the most significant DIV_LUT_BITS (8) bits of e into f
     714           0 :   if (*shift > DIV_LUT_BITS)
     715           0 :     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
     716             :   else
     717           0 :     f = e << (DIV_LUT_BITS - *shift);
     718           0 :   assert(f <= DIV_LUT_NUM);
     719           0 :   *shift += DIV_LUT_PREC_BITS;
     720             :   // Use f as lookup into the precomputed table of multipliers
     721           0 :   return div_lut[f];
     722             : }
     723             : #endif  // CONFIG_WARPED_MOTION
     724             : 
     725           0 : static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
     726             :   int32_t e, f;
     727           0 :   *shift = get_msb(D);
     728             :   // e is obtained from D after resetting the most significant 1 bit.
     729           0 :   e = D - ((uint32_t)1 << *shift);
     730             :   // Get the most significant DIV_LUT_BITS (8) bits of e into f
     731           0 :   if (*shift > DIV_LUT_BITS)
     732           0 :     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
     733             :   else
     734           0 :     f = e << (DIV_LUT_BITS - *shift);
     735           0 :   assert(f <= DIV_LUT_NUM);
     736           0 :   *shift += DIV_LUT_PREC_BITS;
     737             :   // Use f as lookup into the precomputed table of multipliers
     738           0 :   return div_lut[f];
     739             : }
     740             : 
     741           0 : static int is_affine_valid(const WarpedMotionParams *const wm) {
     742           0 :   const int32_t *mat = wm->wmmat;
     743           0 :   return (mat[2] > 0);
     744             : }
     745             : 
     746           0 : static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
     747             :                                    int16_t delta) {
     748           0 :   if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
     749           0 :       (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
     750           0 :     return 0;
     751             :   else
     752           0 :     return 1;
     753             : }
     754             : 
     755             : // Returns 1 on success or 0 on an invalid affine set
     756           0 : int get_shear_params(WarpedMotionParams *wm) {
     757           0 :   const int32_t *mat = wm->wmmat;
     758           0 :   if (!is_affine_valid(wm)) return 0;
     759           0 :   wm->alpha =
     760           0 :       clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
     761           0 :   wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
     762             :   int16_t shift;
     763           0 :   int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
     764             :   int64_t v;
     765           0 :   v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
     766           0 :   wm->gamma =
     767           0 :       clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
     768           0 :   v = ((int64_t)mat[3] * mat[4]) * y;
     769           0 :   wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
     770             :                         (1 << WARPEDMODEL_PREC_BITS),
     771             :                     INT16_MIN, INT16_MAX);
     772           0 :   if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
     773           0 :     return 0;
     774             : 
     775           0 :   wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
     776             :               (1 << WARP_PARAM_REDUCE_BITS);
     777           0 :   wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
     778             :              (1 << WARP_PARAM_REDUCE_BITS);
     779           0 :   wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
     780             :               (1 << WARP_PARAM_REDUCE_BITS);
     781           0 :   wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
     782             :               (1 << WARP_PARAM_REDUCE_BITS);
     783           0 :   return 1;
     784             : }
     785             : 
     786             : #if CONFIG_HIGHBITDEPTH
     787           0 : static INLINE void highbd_get_subcolumn(int taps, const uint16_t *const ref,
     788             :                                         int32_t *col, int stride, int x,
     789             :                                         int y_start) {
     790             :   int i;
     791           0 :   for (i = 0; i < taps; ++i) {
     792           0 :     col[i] = ref[(i + y_start) * stride + x];
     793             :   }
     794           0 : }
     795             : 
     796           0 : static uint16_t highbd_bi_ntap_filter(const uint16_t *const ref, int x, int y,
     797             :                                       int stride, int bd) {
     798             :   int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
     799             :   int k;
     800           0 :   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
     801           0 :   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
     802           0 :   for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
     803             :     int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
     804           0 :     highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
     805           0 :                          i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
     806             :                          j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
     807           0 :     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
     808           0 :                             y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
     809             :   }
     810           0 :   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
     811           0 :                        x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
     812           0 :   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
     813           0 :   return (uint16_t)clip_pixel_highbd(val, bd);
     814             : }
     815             : 
     816           0 : static uint16_t highbd_bi_cubic_filter(const uint16_t *const ref, int x, int y,
     817             :                                        int stride, int bd) {
     818             :   int32_t val, arr[4];
     819             :   int k;
     820           0 :   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
     821           0 :   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
     822           0 :   for (k = 0; k < 4; ++k) {
     823             :     int32_t arr_temp[4];
     824           0 :     highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
     825           0 :     arr[k] =
     826           0 :         do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
     827             :   }
     828           0 :   val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
     829           0 :   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
     830           0 :   return (uint16_t)clip_pixel_highbd(val, bd);
     831             : }
     832             : 
     833           0 : static uint16_t highbd_bi_linear_filter(const uint16_t *const ref, int x, int y,
     834             :                                         int stride, int bd) {
     835           0 :   const int ix = x >> WARPEDPIXEL_PREC_BITS;
     836           0 :   const int iy = y >> WARPEDPIXEL_PREC_BITS;
     837           0 :   const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
     838           0 :   const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
     839             :   int32_t val;
     840           0 :   val = ROUND_POWER_OF_TWO_SIGNED(
     841             :       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
     842             :               (WARPEDPIXEL_PREC_SHIFTS - sx) +
     843             :           ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
     844             :           ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
     845             :           ref[(iy + 1) * stride + ix + 1] * sy * sx,
     846             :       WARPEDPIXEL_PREC_BITS * 2);
     847           0 :   return (uint16_t)clip_pixel_highbd(val, bd);
     848             : }
     849             : 
     850           0 : static uint16_t highbd_warp_interpolate(const uint16_t *const ref, int x, int y,
     851             :                                         int width, int height, int stride,
     852             :                                         int bd) {
     853           0 :   const int ix = x >> WARPEDPIXEL_PREC_BITS;
     854           0 :   const int iy = y >> WARPEDPIXEL_PREC_BITS;
     855           0 :   const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
     856           0 :   const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
     857             :   int32_t v;
     858             : 
     859           0 :   if (ix < 0 && iy < 0)
     860           0 :     return ref[0];
     861           0 :   else if (ix < 0 && iy > height - 1)
     862           0 :     return ref[(height - 1) * stride];
     863           0 :   else if (ix > width - 1 && iy < 0)
     864           0 :     return ref[width - 1];
     865           0 :   else if (ix > width - 1 && iy > height - 1)
     866           0 :     return ref[(height - 1) * stride + (width - 1)];
     867           0 :   else if (ix < 0) {
     868           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     869             :         ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
     870             :             ref[(iy + 1) * stride] * sy,
     871             :         WARPEDPIXEL_PREC_BITS);
     872           0 :     return clip_pixel_highbd(v, bd);
     873           0 :   } else if (iy < 0) {
     874           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     875             :         ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
     876             :         WARPEDPIXEL_PREC_BITS);
     877           0 :     return clip_pixel_highbd(v, bd);
     878           0 :   } else if (ix > width - 1) {
     879           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     880             :         ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
     881             :             ref[(iy + 1) * stride + width - 1] * sy,
     882             :         WARPEDPIXEL_PREC_BITS);
     883           0 :     return clip_pixel_highbd(v, bd);
     884           0 :   } else if (iy > height - 1) {
     885           0 :     v = ROUND_POWER_OF_TWO_SIGNED(
     886             :         ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
     887             :             ref[(height - 1) * stride + ix + 1] * sx,
     888             :         WARPEDPIXEL_PREC_BITS);
     889           0 :     return clip_pixel_highbd(v, bd);
     890           0 :   } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
     891           0 :              iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
     892           0 :              ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
     893           0 :              iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
     894           0 :     return highbd_bi_ntap_filter(ref, x, y, stride, bd);
     895           0 :   } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
     896           0 :     return highbd_bi_cubic_filter(ref, x, y, stride, bd);
     897             :   } else {
     898           0 :     return highbd_bi_linear_filter(ref, x, y, stride, bd);
     899             :   }
     900             : }
     901             : 
     902           0 : static INLINE int highbd_error_measure(int err, int bd) {
     903           0 :   const int b = bd - 8;
     904           0 :   const int bmask = (1 << b) - 1;
     905           0 :   const int v = (1 << b);
     906             :   int e1, e2;
     907           0 :   err = abs(err);
     908           0 :   e1 = err >> b;
     909           0 :   e2 = err & bmask;
     910           0 :   return error_measure_lut[255 + e1] * (v - e2) +
     911           0 :          error_measure_lut[256 + e1] * e2;
     912             : }
     913             : 
     914           0 : static void highbd_warp_plane_old(
     915             :     const WarpedMotionParams *const wm, const uint8_t *const ref8, int width,
     916             :     int height, int stride, const uint8_t *const pred8, int p_col, int p_row,
     917             :     int p_width, int p_height, int p_stride, int subsampling_x,
     918             :     int subsampling_y, int x_scale, int y_scale, int bd, int comp_avg) {
     919             :   int i, j;
     920           0 :   ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
     921           0 :   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
     922           0 :   const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
     923           0 :   if (projectpoints == NULL) return;
     924           0 :   for (i = p_row; i < p_row + p_height; ++i) {
     925           0 :     for (j = p_col; j < p_col + p_width; ++j) {
     926             :       int in[2], out[2];
     927           0 :       in[0] = j;
     928           0 :       in[1] = i;
     929           0 :       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
     930           0 :       out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
     931           0 :       out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
     932           0 :       if (comp_avg)
     933           0 :         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
     934             :             pred[(j - p_col) + (i - p_row) * p_stride] +
     935             :                 highbd_warp_interpolate(ref, out[0], out[1], width, height,
     936             :                                         stride, bd),
     937             :             1);
     938             :       else
     939           0 :         pred[(j - p_col) + (i - p_row) * p_stride] = highbd_warp_interpolate(
     940             :             ref, out[0], out[1], width, height, stride, bd);
     941             :     }
     942             :   }
     943             : }
     944             : 
     945             : /* Note: For an explanation of the warp algorithm, and some notes on bit widths
     946             :     for hardware implementations, see the comments above av1_warp_affine_c
     947             : */
     948           0 : void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
     949             :                               int width, int height, int stride, uint16_t *pred,
     950             :                               int p_col, int p_row, int p_width, int p_height,
     951             :                               int p_stride, int subsampling_x,
     952             :                               int subsampling_y, int bd, int comp_avg,
     953             :                               int16_t alpha, int16_t beta, int16_t gamma,
     954             :                               int16_t delta) {
     955             :   uint32_t tmp[15 * 8];
     956             :   int i, j, k, l, m;
     957             : 
     958           0 :   for (i = p_row; i < p_row + p_height; i += 8) {
     959           0 :     for (j = p_col; j < p_col + p_width; j += 8) {
     960             :       int32_t x4, y4, ix4, sx4, iy4, sy4;
     961           0 :       if (subsampling_x)
     962           0 :         x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
     963           0 :               (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
     964             :              4;
     965             :       else
     966           0 :         x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
     967             : 
     968           0 :       if (subsampling_y)
     969           0 :         y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
     970           0 :               (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
     971             :              4;
     972             :       else
     973           0 :         y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
     974             : 
     975           0 :       ix4 = x4 >> WARPEDMODEL_PREC_BITS;
     976           0 :       sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
     977           0 :       iy4 = y4 >> WARPEDMODEL_PREC_BITS;
     978           0 :       sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
     979             : 
     980           0 :       sx4 += alpha * (-4) + beta * (-4);
     981           0 :       sy4 += gamma * (-4) + delta * (-4);
     982             : 
     983           0 :       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
     984           0 :       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
     985             : 
     986             :       // Horizontal filter
     987           0 :       for (k = -7; k < 8; ++k) {
     988           0 :         int iy = iy4 + k;
     989           0 :         if (iy < 0)
     990           0 :           iy = 0;
     991           0 :         else if (iy > height - 1)
     992           0 :           iy = height - 1;
     993             : 
     994           0 :         int sx = sx4 + beta * (k + 4);
     995           0 :         for (l = -4; l < 4; ++l) {
     996           0 :           int ix = ix4 + l - 3;
     997           0 :           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
     998             :                            WARPEDPIXEL_PREC_SHIFTS;
     999           0 :           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
    1000           0 :           const int16_t *coeffs = warped_filter[offs];
    1001             : 
    1002           0 :           int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
    1003           0 :           for (m = 0; m < 8; ++m) {
    1004           0 :             int sample_x = ix + m;
    1005           0 :             if (sample_x < 0)
    1006           0 :               sample_x = 0;
    1007           0 :             else if (sample_x > width - 1)
    1008           0 :               sample_x = width - 1;
    1009           0 :             sum += ref[iy * stride + sample_x] * coeffs[m];
    1010             :           }
    1011           0 :           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
    1012           0 :           assert(0 <= sum &&
    1013             :                  sum < (1 << (bd + WARPEDPIXEL_FILTER_BITS + 1 -
    1014             :                               HORSHEAR_REDUCE_PREC_BITS)));
    1015           0 :           tmp[(k + 7) * 8 + (l + 4)] = sum;
    1016           0 :           sx += alpha;
    1017             :         }
    1018             :       }
    1019             : 
    1020             :       // Vertical filter
    1021           0 :       for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
    1022           0 :         int sy = sy4 + delta * (k + 4);
    1023           0 :         for (l = -4; l < 4; ++l) {
    1024           0 :           uint16_t *p =
    1025           0 :               &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
    1026           0 :           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
    1027             :                            WARPEDPIXEL_PREC_SHIFTS;
    1028           0 :           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
    1029           0 :           const int16_t *coeffs = warped_filter[offs];
    1030             : 
    1031           0 :           int32_t sum = 1 << (bd + 2 * WARPEDPIXEL_FILTER_BITS -
    1032             :                               HORSHEAR_REDUCE_PREC_BITS);
    1033           0 :           for (m = 0; m < 8; ++m) {
    1034           0 :             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
    1035             :           }
    1036           0 :           sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
    1037           0 :           assert(0 <= sum && sum < (1 << (bd + 2)));
    1038           0 :           uint16_t px =
    1039           0 :               clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
    1040           0 :           if (comp_avg)
    1041           0 :             *p = ROUND_POWER_OF_TWO(*p + px, 1);
    1042             :           else
    1043           0 :             *p = px;
    1044           0 :           sy += gamma;
    1045             :         }
    1046             :       }
    1047             :     }
    1048             :   }
    1049           0 : }
    1050             : 
    1051           0 : static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
    1052             :                               int width, int height, int stride,
    1053             :                               const uint8_t *const pred8, int p_col, int p_row,
    1054             :                               int p_width, int p_height, int p_stride,
    1055             :                               int subsampling_x, int subsampling_y, int x_scale,
    1056             :                               int y_scale, int bd, int comp_avg) {
    1057           0 :   if (wm->wmtype == ROTZOOM) {
    1058           0 :     wm->wmmat[5] = wm->wmmat[2];
    1059           0 :     wm->wmmat[4] = -wm->wmmat[3];
    1060             :   }
    1061           0 :   if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
    1062           0 :       y_scale == 16) {
    1063           0 :     const int32_t *const mat = wm->wmmat;
    1064           0 :     const int16_t alpha = wm->alpha;
    1065           0 :     const int16_t beta = wm->beta;
    1066           0 :     const int16_t gamma = wm->gamma;
    1067           0 :     const int16_t delta = wm->delta;
    1068             : 
    1069           0 :     const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
    1070           0 :     uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
    1071           0 :     av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
    1072             :                            p_width, p_height, p_stride, subsampling_x,
    1073             :                            subsampling_y, bd, comp_avg, alpha, beta, gamma,
    1074             :                            delta);
    1075             :   } else {
    1076           0 :     highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
    1077             :                           p_width, p_height, p_stride, subsampling_x,
    1078             :                           subsampling_y, x_scale, y_scale, bd, comp_avg);
    1079             :   }
    1080           0 : }
    1081             : 
    1082           0 : static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
    1083             :                                   const uint16_t *const dst, int p_col,
    1084             :                                   int p_row, int p_width, int p_height,
    1085             :                                   int p_stride, int bd) {
    1086           0 :   int64_t sum_error = 0;
    1087           0 :   for (int i = 0; i < p_height; ++i) {
    1088           0 :     for (int j = 0; j < p_width; ++j) {
    1089           0 :       sum_error += highbd_error_measure(
    1090           0 :           dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride], bd);
    1091             :     }
    1092             :   }
    1093           0 :   return sum_error;
    1094             : }
    1095             : 
    1096           0 : static int64_t highbd_warp_error(
    1097             :     WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
    1098             :     int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
    1099             :     int p_height, int p_stride, int subsampling_x, int subsampling_y,
    1100             :     int x_scale, int y_scale, int bd) {
    1101           0 :   int64_t gm_sumerr = 0;
    1102           0 :   uint16_t *tmp = aom_malloc(p_width * p_height * sizeof(*tmp));
    1103           0 :   if (!tmp) return INT64_MAX;
    1104             : 
    1105           0 :   highbd_warp_plane(wm, ref8, width, height, stride, CONVERT_TO_BYTEPTR(tmp),
    1106             :                     p_col, p_row, p_width, p_height, p_width, subsampling_x,
    1107             :                     subsampling_y, x_scale, y_scale, bd, 0);
    1108             : 
    1109           0 :   gm_sumerr = highbd_frame_error(tmp, p_width, CONVERT_TO_SHORTPTR(dst8), p_col,
    1110             :                                  p_row, p_width, p_height, p_stride, bd);
    1111             : 
    1112           0 :   aom_free(tmp);
    1113           0 :   return gm_sumerr;
    1114             : }
    1115             : #endif  // CONFIG_HIGHBITDEPTH
    1116             : 
    1117           0 : static INLINE int error_measure(int err) {
    1118           0 :   return error_measure_lut[255 + err];
    1119             : }
    1120             : 
    1121           0 : static void warp_plane_old(const WarpedMotionParams *const wm,
    1122             :                            const uint8_t *const ref, int width, int height,
    1123             :                            int stride, uint8_t *pred, int p_col, int p_row,
    1124             :                            int p_width, int p_height, int p_stride,
    1125             :                            int subsampling_x, int subsampling_y, int x_scale,
    1126             :                            int y_scale, int comp_avg) {
    1127             :   int i, j;
    1128           0 :   ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
    1129           0 :   if (projectpoints == NULL) return;
    1130           0 :   for (i = p_row; i < p_row + p_height; ++i) {
    1131           0 :     for (j = p_col; j < p_col + p_width; ++j) {
    1132             :       int in[2], out[2];
    1133           0 :       in[0] = j;
    1134           0 :       in[1] = i;
    1135           0 :       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
    1136           0 :       out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
    1137           0 :       out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
    1138           0 :       if (comp_avg)
    1139           0 :         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
    1140             :             pred[(j - p_col) + (i - p_row) * p_stride] +
    1141             :                 warp_interpolate(ref, out[0], out[1], width, height, stride),
    1142             :             1);
    1143             :       else
    1144           0 :         pred[(j - p_col) + (i - p_row) * p_stride] =
    1145           0 :             warp_interpolate(ref, out[0], out[1], width, height, stride);
    1146             :     }
    1147             :   }
    1148             : }
    1149             : 
    1150             : /* The warp filter for ROTZOOM and AFFINE models works as follows:
    1151             :    * Split the input into 8x8 blocks
    1152             :    * For each block, project the point (4, 4) within the block, to get the
    1153             :      overall block position. Split into integer and fractional coordinates,
    1154             :      maintaining full WARPEDMODEL precision
    1155             :    * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
    1156             :      variable horizontal offset. This means that, while the rows of the
    1157             :      intermediate buffer align with the rows of the *reference* image, the
    1158             :      columns align with the columns of the *destination* image.
    1159             :    * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
    1160             :      destination is too small we crop the output at this stage). Each pixel has
    1161             :      a variable vertical offset, so that the resulting rows are aligned with
    1162             :      the rows of the destination image.
    1163             : 
    1164             :    To accomplish these alignments, we factor the warp matrix as a
    1165             :    product of two shear / asymmetric zoom matrices:
    1166             :    / a b \  = /   1       0    \ * / 1+alpha  beta \
    1167             :    \ c d /    \ gamma  1+delta /   \    0      1   /
    1168             :    where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
    1169             :    The horizontal shear (with alpha and beta) is applied first,
    1170             :    then the vertical shear (with gamma and delta) is applied second.
    1171             : 
    1172             :    The only limitation is that, to fit this in a fixed 8-tap filter size,
    1173             :    the fractional pixel offsets must be at most +-1. Since the horizontal filter
    1174             :    generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
    1175             :    within the block, the parameters must satisfy
    1176             :    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
    1177             :    for this filter to be applicable.
    1178             : 
    1179             :    Note: This function assumes that the caller has done all of the relevant
    1180             :    checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
    1181             :    are set appropriately (if using a ROTZOOM model), and that alpha, beta,
    1182             :    gamma, delta are all in range.
    1183             : 
    1184             :    TODO(david.barker): Maybe support scaled references?
    1185             : */
    1186             : /* A note on hardware implementation:
    1187             :     The warp filter is intended to be implementable using the same hardware as
    1188             :     the high-precision convolve filters from the loop-restoration and
    1189             :     convolve-round experiments.
    1190             : 
    1191             :     For a single filter stage, considering all of the coefficient sets for the
    1192             :     warp filter and the regular convolution filter, an input in the range
    1193             :     [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
    1194             :     before rounding.
    1195             : 
    1196             :     Allowing for some changes to the filter coefficient sets, call the range
    1197             :     [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
    1198             :     we can replace this by the range [0, 256 * 2^k], which can be stored in an
    1199             :     unsigned value with 8 + k bits.
    1200             : 
    1201             :     This allows the derivation of the appropriate bit widths and offsets for
    1202             :     the various intermediate values: If
    1203             : 
    1204             :     F := WARPEDPIXEL_FILTER_BITS = 7 (or else the above ranges need adjusting)
    1205             :          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
    1206             :          intermediate value.
    1207             :     H := HORSHEAR_REDUCE_PREC_BITS
    1208             :     V := VERSHEAR_REDUCE_PREC_BITS
    1209             :     (and note that we must have H + V = 2*F for the output to have the same
    1210             :      scale as the input)
    1211             : 
    1212             :     then we end up with the following offsets and ranges:
    1213             :     Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
    1214             :                        uint{bd + F + 1}
    1215             :     After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
    1216             :     Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
    1217             :                      uint{bd + 2*F + 2 - H}
    1218             :     After rounding: The final value, before undoing the offset, fits into a
    1219             :                     uint{bd + 2}.
    1220             : 
    1221             :     Then we need to undo the offsets before clamping to a pixel. Note that,
    1222             :     if we do this at the end, the amount to subtract is actually independent
    1223             :     of H and V:
    1224             : 
    1225             :     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
    1226             :                          (1 << ((bd + 2*F - H) - V))
    1227             :                       == (1 << (bd - 1)) + (1 << bd)
    1228             : 
    1229             :     This allows us to entirely avoid clamping in both the warp filter and
    1230             :     the convolve-round experiment. As of the time of writing, the Wiener filter
    1231             :     from loop-restoration can encode a central coefficient up to 216, which
    1232             :     leads to a maximum value of about 282 * 2^k after applying the offset.
    1233             :     So in that case we still need to clamp.
    1234             : */
    1235           0 : void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
    1236             :                        int height, int stride, uint8_t *pred, int p_col,
    1237             :                        int p_row, int p_width, int p_height, int p_stride,
    1238             :                        int subsampling_x, int subsampling_y, int comp_avg,
    1239             :                        int16_t alpha, int16_t beta, int16_t gamma,
    1240             :                        int16_t delta) {
    1241             :   uint16_t tmp[15 * 8];
    1242             :   int i, j, k, l, m;
    1243           0 :   const int bd = 8;
    1244             : 
    1245           0 :   for (i = p_row; i < p_row + p_height; i += 8) {
    1246           0 :     for (j = p_col; j < p_col + p_width; j += 8) {
    1247             :       int32_t x4, y4, ix4, sx4, iy4, sy4;
    1248           0 :       if (subsampling_x)
    1249           0 :         x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
    1250           0 :               (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
    1251             :              4;
    1252             :       else
    1253           0 :         x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
    1254             : 
    1255           0 :       if (subsampling_y)
    1256           0 :         y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
    1257           0 :               (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
    1258             :              4;
    1259             :       else
    1260           0 :         y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
    1261             : 
    1262           0 :       ix4 = x4 >> WARPEDMODEL_PREC_BITS;
    1263           0 :       sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
    1264           0 :       iy4 = y4 >> WARPEDMODEL_PREC_BITS;
    1265           0 :       sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
    1266             : 
    1267           0 :       sx4 += alpha * (-4) + beta * (-4);
    1268           0 :       sy4 += gamma * (-4) + delta * (-4);
    1269             : 
    1270           0 :       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
    1271           0 :       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
    1272             : 
    1273             :       // Horizontal filter
    1274           0 :       for (k = -7; k < 8; ++k) {
    1275             :         // Clamp to top/bottom edge of the frame
    1276           0 :         int iy = iy4 + k;
    1277           0 :         if (iy < 0)
    1278           0 :           iy = 0;
    1279           0 :         else if (iy > height - 1)
    1280           0 :           iy = height - 1;
    1281             : 
    1282           0 :         int sx = sx4 + beta * (k + 4);
    1283             : 
    1284           0 :         for (l = -4; l < 4; ++l) {
    1285           0 :           int ix = ix4 + l - 3;
    1286             :           // At this point, sx = sx4 + alpha * l + beta * k
    1287           0 :           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
    1288             :                            WARPEDPIXEL_PREC_SHIFTS;
    1289           0 :           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
    1290           0 :           const int16_t *coeffs = warped_filter[offs];
    1291             : 
    1292           0 :           int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
    1293           0 :           for (m = 0; m < 8; ++m) {
    1294             :             // Clamp to left/right edge of the frame
    1295           0 :             int sample_x = ix + m;
    1296           0 :             if (sample_x < 0)
    1297           0 :               sample_x = 0;
    1298           0 :             else if (sample_x > width - 1)
    1299           0 :               sample_x = width - 1;
    1300             : 
    1301           0 :             sum += ref[iy * stride + sample_x] * coeffs[m];
    1302             :           }
    1303           0 :           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
    1304           0 :           assert(0 <= sum &&
    1305             :                  sum < (1 << (bd + WARPEDPIXEL_FILTER_BITS + 1 -
    1306             :                               HORSHEAR_REDUCE_PREC_BITS)));
    1307           0 :           tmp[(k + 7) * 8 + (l + 4)] = sum;
    1308           0 :           sx += alpha;
    1309             :         }
    1310             :       }
    1311             : 
    1312             :       // Vertical filter
    1313           0 :       for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
    1314           0 :         int sy = sy4 + delta * (k + 4);
    1315           0 :         for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
    1316           0 :           uint8_t *p =
    1317           0 :               &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
    1318             :           // At this point, sy = sy4 + gamma * l + delta * k
    1319           0 :           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
    1320             :                            WARPEDPIXEL_PREC_SHIFTS;
    1321           0 :           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
    1322           0 :           const int16_t *coeffs = warped_filter[offs];
    1323             : 
    1324           0 :           int32_t sum = 1 << (bd + 2 * WARPEDPIXEL_FILTER_BITS -
    1325             :                               HORSHEAR_REDUCE_PREC_BITS);
    1326           0 :           for (m = 0; m < 8; ++m) {
    1327           0 :             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
    1328             :           }
    1329           0 :           sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
    1330           0 :           assert(0 <= sum && sum < (1 << (bd + 2)));
    1331           0 :           uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
    1332           0 :           if (comp_avg)
    1333           0 :             *p = ROUND_POWER_OF_TWO(*p + px, 1);
    1334             :           else
    1335           0 :             *p = px;
    1336           0 :           sy += gamma;
    1337             :         }
    1338             :       }
    1339             :     }
    1340             :   }
    1341           0 : }
    1342             : 
    1343           0 : static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
    1344             :                        int width, int height, int stride, uint8_t *pred,
    1345             :                        int p_col, int p_row, int p_width, int p_height,
    1346             :                        int p_stride, int subsampling_x, int subsampling_y,
    1347             :                        int x_scale, int y_scale, int comp_avg) {
    1348           0 :   if (wm->wmtype == ROTZOOM) {
    1349           0 :     wm->wmmat[5] = wm->wmmat[2];
    1350           0 :     wm->wmmat[4] = -wm->wmmat[3];
    1351             :   }
    1352           0 :   if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
    1353           0 :       y_scale == 16) {
    1354           0 :     const int32_t *const mat = wm->wmmat;
    1355           0 :     const int16_t alpha = wm->alpha;
    1356           0 :     const int16_t beta = wm->beta;
    1357           0 :     const int16_t gamma = wm->gamma;
    1358           0 :     const int16_t delta = wm->delta;
    1359             : 
    1360           0 :     av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
    1361             :                     p_width, p_height, p_stride, subsampling_x, subsampling_y,
    1362             :                     comp_avg, alpha, beta, gamma, delta);
    1363             :   } else {
    1364           0 :     warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
    1365             :                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,
    1366             :                    y_scale, comp_avg);
    1367             :   }
    1368           0 : }
    1369             : 
    1370           0 : static int64_t frame_error(const uint8_t *const ref, int stride,
    1371             :                            const uint8_t *const dst, int p_col, int p_row,
    1372             :                            int p_width, int p_height, int p_stride) {
    1373           0 :   int64_t sum_error = 0;
    1374           0 :   for (int i = 0; i < p_height; ++i) {
    1375           0 :     for (int j = 0; j < p_width; ++j) {
    1376           0 :       sum_error += (int64_t)error_measure(
    1377           0 :           dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride]);
    1378             :     }
    1379             :   }
    1380           0 :   return sum_error;
    1381             : }
    1382             : 
    1383           0 : static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
    1384             :                           int width, int height, int stride,
    1385             :                           const uint8_t *const dst, int p_col, int p_row,
    1386             :                           int p_width, int p_height, int p_stride,
    1387             :                           int subsampling_x, int subsampling_y, int x_scale,
    1388             :                           int y_scale) {
    1389           0 :   int64_t gm_sumerr = 0;
    1390           0 :   uint8_t *tmp = aom_malloc(p_width * p_height);
    1391           0 :   if (!tmp) return INT64_MAX;
    1392             : 
    1393           0 :   warp_plane(wm, ref, width, height, stride, tmp, p_col, p_row, p_width,
    1394             :              p_height, p_width, subsampling_x, subsampling_y, x_scale, y_scale,
    1395             :              0);
    1396             : 
    1397           0 :   gm_sumerr =
    1398             :       frame_error(tmp, p_width, dst, p_col, p_row, p_width, p_height, p_stride);
    1399             : 
    1400           0 :   aom_free(tmp);
    1401           0 :   return gm_sumerr;
    1402             : }
    1403             : 
    1404           0 : int64_t av1_frame_error(
    1405             : #if CONFIG_HIGHBITDEPTH
    1406             :     int use_hbd, int bd,
    1407             : #endif  // CONFIG_HIGHBITDEPTH
    1408             :     const uint8_t *ref, int stride, uint8_t *dst, int p_col, int p_row,
    1409             :     int p_width, int p_height, int p_stride) {
    1410             : #if CONFIG_HIGHBITDEPTH
    1411           0 :   if (use_hbd) {
    1412           0 :     return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
    1413           0 :                               CONVERT_TO_SHORTPTR(dst), p_col, p_row, p_width,
    1414             :                               p_height, p_stride, bd);
    1415             :   }
    1416             : #endif  // CONFIG_HIGHBITDEPTH
    1417           0 :   return frame_error(ref, stride, dst, p_col, p_row, p_width, p_height,
    1418             :                      p_stride);
    1419             : }
    1420             : 
    1421           0 : int64_t av1_warp_error(WarpedMotionParams *wm,
    1422             : #if CONFIG_HIGHBITDEPTH
    1423             :                        int use_hbd, int bd,
    1424             : #endif  // CONFIG_HIGHBITDEPTH
    1425             :                        const uint8_t *ref, int width, int height, int stride,
    1426             :                        uint8_t *dst, int p_col, int p_row, int p_width,
    1427             :                        int p_height, int p_stride, int subsampling_x,
    1428             :                        int subsampling_y, int x_scale, int y_scale) {
    1429           0 :   if (wm->wmtype <= AFFINE)
    1430           0 :     if (!get_shear_params(wm)) return 1;
    1431             : #if CONFIG_HIGHBITDEPTH
    1432           0 :   if (use_hbd)
    1433           0 :     return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
    1434             :                              p_width, p_height, p_stride, subsampling_x,
    1435             :                              subsampling_y, x_scale, y_scale, bd);
    1436             : #endif  // CONFIG_HIGHBITDEPTH
    1437           0 :   return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
    1438             :                     p_height, p_stride, subsampling_x, subsampling_y, x_scale,
    1439             :                     y_scale);
    1440             : }
    1441             : 
    1442           0 : void av1_warp_plane(WarpedMotionParams *wm,
    1443             : #if CONFIG_HIGHBITDEPTH
    1444             :                     int use_hbd, int bd,
    1445             : #endif  // CONFIG_HIGHBITDEPTH
    1446             :                     const uint8_t *ref, int width, int height, int stride,
    1447             :                     uint8_t *pred, int p_col, int p_row, int p_width,
    1448             :                     int p_height, int p_stride, int subsampling_x,
    1449             :                     int subsampling_y, int x_scale, int y_scale, int comp_avg) {
    1450             : #if CONFIG_HIGHBITDEPTH
    1451           0 :   if (use_hbd)
    1452           0 :     highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
    1453             :                       p_width, p_height, p_stride, subsampling_x, subsampling_y,
    1454             :                       x_scale, y_scale, bd, comp_avg);
    1455             :   else
    1456             : #endif  // CONFIG_HIGHBITDEPTH
    1457           0 :     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
    1458             :                p_height, p_stride, subsampling_x, subsampling_y, x_scale,
    1459             :                y_scale, comp_avg);
    1460           0 : }
    1461             : 
    1462             : #if CONFIG_WARPED_MOTION
    1463             : #define LEAST_SQUARES_ORDER 2
    1464             : 
    1465             : #define LS_MV_MAX 256  // max mv in 1/8-pel
    1466             : #define LS_STEP 2
    1467             : 
    1468             : // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
    1469             : // the precision needed is:
    1470             : //   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
    1471             : //   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
    1472             : //   1 [for sign] +
    1473             : //   LEAST_SQUARES_SAMPLES_MAX_BITS
    1474             : //        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
    1475             : // The value is 23
    1476             : #define LS_MAT_RANGE_BITS \
    1477             :   ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
    1478             : 
    1479             : // Bit-depth reduction from the full-range
    1480             : #define LS_MAT_DOWN_BITS 2
    1481             : 
    1482             : // bits range of A, Bx and By after downshifting
    1483             : #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
    1484             : #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
    1485             : #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
    1486             : 
    1487             : #define LS_SUM(a) ((a)*4 + LS_STEP * 2)
    1488             : #define LS_SQUARE(a) \
    1489             :   (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
    1490             : #define LS_PRODUCT1(a, b) \
    1491             :   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2)
    1492             : #define LS_PRODUCT2(a, b) \
    1493             :   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
    1494             : 
    1495           0 : static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
    1496             :                            int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
    1497             :                            int mi_col) {
    1498           0 :   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
    1499           0 :   int32_t Bx[2] = { 0, 0 };
    1500           0 :   int32_t By[2] = { 0, 0 };
    1501           0 :   int i, n = 0;
    1502             : 
    1503           0 :   const int bw = block_size_wide[bsize];
    1504           0 :   const int bh = block_size_high[bsize];
    1505           0 :   const int suy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1) * 8;
    1506           0 :   const int sux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1) * 8;
    1507           0 :   const int duy = suy + mvy;
    1508           0 :   const int dux = sux + mvx;
    1509             : 
    1510             :   // Assume the center pixel of the block has exactly the same motion vector
    1511             :   // as transmitted for the block. First shift the origin of the source
    1512             :   // points to the block center, and the origin of the destination points to
    1513             :   // the block center added to the motion vector transmitted.
    1514             :   // Let (xi, yi) denote the source points and (xi', yi') denote destination
    1515             :   // points after origin shfifting, for i = 0, 1, 2, .... n-1.
    1516             :   // Then if  P = [x0, y0,
    1517             :   //               x1, y1
    1518             :   //               x2, y1,
    1519             :   //                ....
    1520             :   //              ]
    1521             :   //          q = [x0', x1', x2', ... ]'
    1522             :   //          r = [y0', y1', y2', ... ]'
    1523             :   // the least squares problems that need to be solved are:
    1524             :   //          [h1, h2]' = inv(P'P)P'q and
    1525             :   //          [h3, h4]' = inv(P'P)P'r
    1526             :   // where the affine transformation is given by:
    1527             :   //          x' = h1.x + h2.y
    1528             :   //          y' = h3.x + h4.y
    1529             :   //
    1530             :   // The loop below computes: A = P'P, Bx = P'q, By = P'r
    1531             :   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
    1532             :   int sx, sy, dx, dy;
    1533             :   // Contribution from neighbor block
    1534           0 :   for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
    1535           0 :     dx = pts2[i * 2] - dux;
    1536           0 :     dy = pts2[i * 2 + 1] - duy;
    1537           0 :     sx = pts1[i * 2] - sux;
    1538           0 :     sy = pts1[i * 2 + 1] - suy;
    1539           0 :     if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
    1540           0 :       A[0][0] += LS_SQUARE(sx);
    1541           0 :       A[0][1] += LS_PRODUCT1(sx, sy);
    1542           0 :       A[1][1] += LS_SQUARE(sy);
    1543           0 :       Bx[0] += LS_PRODUCT2(sx, dx);
    1544           0 :       Bx[1] += LS_PRODUCT1(sy, dx);
    1545           0 :       By[0] += LS_PRODUCT1(sx, dy);
    1546           0 :       By[1] += LS_PRODUCT2(sy, dy);
    1547           0 :       n++;
    1548             :     }
    1549             :   }
    1550             :   int downshift;
    1551           0 :   if (n >= 4)
    1552           0 :     downshift = LS_MAT_DOWN_BITS;
    1553           0 :   else if (n >= 2)
    1554           0 :     downshift = LS_MAT_DOWN_BITS - 1;
    1555             :   else
    1556           0 :     downshift = LS_MAT_DOWN_BITS - 2;
    1557             : 
    1558             :   // Reduce precision by downshift bits
    1559           0 :   A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
    1560             :                   LS_MAT_MAX);
    1561           0 :   A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
    1562             :                   LS_MAT_MAX);
    1563           0 :   A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
    1564             :                   LS_MAT_MAX);
    1565           0 :   Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
    1566             :                 LS_MAT_MAX);
    1567           0 :   Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
    1568             :                 LS_MAT_MAX);
    1569           0 :   By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
    1570             :                 LS_MAT_MAX);
    1571           0 :   By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
    1572             :                 LS_MAT_MAX);
    1573             : 
    1574             :   int64_t Px[2], Py[2], Det;
    1575             :   int16_t iDet, shift;
    1576             : 
    1577             :   // These divided by the Det, are the least squares solutions
    1578           0 :   Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
    1579           0 :   Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
    1580           0 :   Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
    1581           0 :   Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
    1582             : 
    1583             :   // Compute Determinant of A
    1584           0 :   Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
    1585           0 :   if (Det == 0) return 1;
    1586           0 :   iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
    1587           0 :   shift -= WARPEDMODEL_PREC_BITS;
    1588           0 :   if (shift < 0) {
    1589           0 :     iDet <<= (-shift);
    1590           0 :     shift = 0;
    1591             :   }
    1592             : 
    1593             :   int64_t v;
    1594           0 :   v = Px[0] * (int64_t)iDet;
    1595           0 :   wm->wmmat[2] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
    1596           0 :   v = Px[1] * (int64_t)iDet;
    1597           0 :   wm->wmmat[3] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
    1598           0 :   v = ((int64_t)dux * (1 << WARPEDMODEL_PREC_BITS)) -
    1599           0 :       (int64_t)sux * wm->wmmat[2] - (int64_t)suy * wm->wmmat[3];
    1600           0 :   wm->wmmat[0] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
    1601             : 
    1602           0 :   v = Py[0] * (int64_t)iDet;
    1603           0 :   wm->wmmat[4] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
    1604           0 :   v = Py[1] * (int64_t)iDet;
    1605           0 :   wm->wmmat[5] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
    1606           0 :   v = ((int64_t)duy * (1 << WARPEDMODEL_PREC_BITS)) -
    1607           0 :       (int64_t)sux * wm->wmmat[4] - (int64_t)suy * wm->wmmat[5];
    1608           0 :   wm->wmmat[1] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
    1609             : 
    1610           0 :   wm->wmmat[6] = wm->wmmat[7] = 0;
    1611             : 
    1612             :   // Clamp values
    1613           0 :   wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP,
    1614             :                        WARPEDMODEL_TRANS_CLAMP - 1);
    1615           0 :   wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP,
    1616             :                        WARPEDMODEL_TRANS_CLAMP - 1);
    1617           0 :   wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP,
    1618             :                        WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
    1619           0 :   wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP,
    1620             :                        WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
    1621           0 :   wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
    1622             :                        WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    1623           0 :   wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
    1624             :                        WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    1625           0 :   return 0;
    1626             : }
    1627             : 
    1628           0 : int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
    1629             :                     int mvx, WarpedMotionParams *wm_params, int mi_row,
    1630             :                     int mi_col) {
    1631           0 :   int result = 1;
    1632           0 :   switch (wm_params->wmtype) {
    1633             :     case AFFINE:
    1634           0 :       result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
    1635             :                                mi_row, mi_col);
    1636           0 :       break;
    1637           0 :     default: assert(0 && "Invalid warped motion type!"); return 1;
    1638             :   }
    1639           0 :   if (result == 0) {
    1640           0 :     if (wm_params->wmtype == ROTZOOM) {
    1641           0 :       wm_params->wmmat[5] = wm_params->wmmat[2];
    1642           0 :       wm_params->wmmat[4] = -wm_params->wmmat[3];
    1643             :     }
    1644           0 :     if (wm_params->wmtype == AFFINE || wm_params->wmtype == ROTZOOM) {
    1645             :       // check compatibility with the fast warp filter
    1646           0 :       if (!get_shear_params(wm_params)) return 1;
    1647             :     }
    1648             :   }
    1649             : 
    1650           0 :   return result;
    1651             : }
    1652             : #endif  // CONFIG_WARPED_MOTION

Generated by: LCOV version 1.13