LCOV - output.info - third_party/aom/aom

LCOV - code coverage report

Current view:	top level - third_party/aom/aom_dsp - loopfilter.c (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	405	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	44	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <stdlib.h>
      13             : 
      14             : #include "./aom_config.h"
      15             : #include "./aom_dsp_rtcd.h"
      16             : #include "aom_dsp/aom_dsp_common.h"
      17             : #include "aom_ports/mem.h"
      18             : 
      19           0 : static INLINE int8_t signed_char_clamp(int t) {
      20           0 :   return (int8_t)clamp(t, -128, 127);
      21             : }
      22             : 
      23             : #define PARALLEL_DEBLOCKING_11_TAP 0
      24             : #define PARALLEL_DEBLOCKING_9_TAP 0
      25             : 
      26             : #if CONFIG_HIGHBITDEPTH
      27           0 : static INLINE int16_t signed_char_clamp_high(int t, int bd) {
      28           0 :   switch (bd) {
      29           0 :     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
      30           0 :     case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
      31             :     case 8:
      32           0 :     default: return (int16_t)clamp(t, -128, 128 - 1);
      33             :   }
      34             : }
      35             : #endif
      36             : #if CONFIG_PARALLEL_DEBLOCKING
      37             : // should we apply any filter at all: 11111111 yes, 00000000 no
      38             : static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
      39             :                                   uint8_t p0, uint8_t q0, uint8_t q1) {
      40             :   int8_t mask = 0;
      41             :   mask |= (abs(p1 - p0) > limit) * -1;
      42             :   mask |= (abs(q1 - q0) > limit) * -1;
      43             :   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      44             :   return ~mask;
      45             : }
      46             : #endif  // CONFIG_PARALLEL_DEBLOCKING
      47           0 : static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
      48             :                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
      49             :                                  uint8_t q1, uint8_t q2, uint8_t q3) {
      50           0 :   int8_t mask = 0;
      51           0 :   mask |= (abs(p3 - p2) > limit) * -1;
      52           0 :   mask |= (abs(p2 - p1) > limit) * -1;
      53           0 :   mask |= (abs(p1 - p0) > limit) * -1;
      54           0 :   mask |= (abs(q1 - q0) > limit) * -1;
      55           0 :   mask |= (abs(q2 - q1) > limit) * -1;
      56           0 :   mask |= (abs(q3 - q2) > limit) * -1;
      57           0 :   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      58           0 :   return ~mask;
      59             : }
      60             : 
      61           0 : static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
      62             :                                 uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
      63             :                                 uint8_t q2, uint8_t q3) {
      64           0 :   int8_t mask = 0;
      65           0 :   mask |= (abs(p1 - p0) > thresh) * -1;
      66           0 :   mask |= (abs(q1 - q0) > thresh) * -1;
      67           0 :   mask |= (abs(p2 - p0) > thresh) * -1;
      68           0 :   mask |= (abs(q2 - q0) > thresh) * -1;
      69           0 :   mask |= (abs(p3 - p0) > thresh) * -1;
      70           0 :   mask |= (abs(q3 - q0) > thresh) * -1;
      71           0 :   return ~mask;
      72             : }
      73             : 
      74             : #if PARALLEL_DEBLOCKING_9_TAP
      75             : static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0,
      76             :                                 uint8_t q0, uint8_t q4) {
      77             :   int8_t mask = 0;
      78             :   mask |= (abs(p4 - p0) > thresh) * -1;
      79             :   mask |= (abs(q4 - q0) > thresh) * -1;
      80             :   return ~mask;
      81             : }
      82             : #endif
      83             : 
      84             : #if PARALLEL_DEBLOCKING_11_TAP
      85             : static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4,
      86             :                                 uint8_t p0, uint8_t q0, uint8_t q4,
      87             :                                 uint8_t q5) {
      88             :   int8_t mask = 0;
      89             :   mask |= (abs(p4 - p0) > thresh) * -1;
      90             :   mask |= (abs(q4 - q0) > thresh) * -1;
      91             :   mask |= (abs(p5 - p0) > thresh) * -1;
      92             :   mask |= (abs(q5 - q0) > thresh) * -1;
      93             :   return ~mask;
      94             : }
      95             : #endif
      96             : 
      97           0 : static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
      98             :                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
      99             :                                 uint8_t q1, uint8_t q2, uint8_t q3,
     100             :                                 uint8_t q4) {
     101           0 :   int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
     102           0 :   mask |= (abs(p4 - p0) > thresh) * -1;
     103           0 :   mask |= (abs(q4 - q0) > thresh) * -1;
     104           0 :   return ~mask;
     105             : }
     106             : 
     107             : // is there high edge variance internal edge: 11111111 yes, 00000000 no
     108           0 : static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
     109             :                               uint8_t q0, uint8_t q1) {
     110           0 :   int8_t hev = 0;
     111           0 :   hev |= (abs(p1 - p0) > thresh) * -1;
     112           0 :   hev |= (abs(q1 - q0) > thresh) * -1;
     113           0 :   return hev;
     114             : }
     115             : 
     116           0 : static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
     117             :                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
     118             :   int8_t filter1, filter2;
     119             : 
     120           0 :   const int8_t ps1 = (int8_t)*op1 ^ 0x80;
     121           0 :   const int8_t ps0 = (int8_t)*op0 ^ 0x80;
     122           0 :   const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
     123           0 :   const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
     124           0 :   const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
     125             : 
     126             :   // add outer taps if we have high edge variance
     127           0 :   int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
     128             : 
     129             :   // inner taps
     130           0 :   filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
     131             : 
     132             :   // save bottom 3 bits so that we round one side +4 and the other +3
     133             :   // if it equals 4 we'll set to adjust by -1 to account for the fact
     134             :   // we'd round 3 the other way
     135           0 :   filter1 = signed_char_clamp(filter + 4) >> 3;
     136           0 :   filter2 = signed_char_clamp(filter + 3) >> 3;
     137             : 
     138           0 :   *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
     139           0 :   *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
     140             : 
     141             :   // outer tap adjustments
     142           0 :   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
     143             : 
     144           0 :   *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
     145           0 :   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
     146           0 : }
     147             : 
     148           0 : void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
     149             :                             const uint8_t *blimit, const uint8_t *limit,
     150             :                             const uint8_t *thresh) {
     151             :   int i;
     152             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     153             :   int count = 4;
     154             : #else
     155           0 :   int count = 8;
     156             : #endif
     157             : 
     158             :   // loop filter designed to work using chars so that we can make maximum use
     159             :   // of 8 bit simd instructions.
     160           0 :   for (i = 0; i < count; ++i) {
     161             : #if !CONFIG_PARALLEL_DEBLOCKING
     162           0 :     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     163           0 :     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     164           0 :     const int8_t mask =
     165           0 :         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     166             : #else   // CONFIG_PARALLEL_DEBLOCKING
     167             :     const uint8_t p1 = s[-2 * p], p0 = s[-p];
     168             :     const uint8_t q0 = s[0 * p], q1 = s[1 * p];
     169             :     const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
     170             : #endif  // !CONFIG_PARALLEL_DEBLOCKING
     171           0 :     filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
     172           0 :     ++s;
     173             :   }
     174           0 : }
     175             : 
     176           0 : void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
     177             :                                  const uint8_t *limit0, const uint8_t *thresh0,
     178             :                                  const uint8_t *blimit1, const uint8_t *limit1,
     179             :                                  const uint8_t *thresh1) {
     180           0 :   aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
     181           0 :   aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
     182           0 : }
     183             : 
     184           0 : void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
     185             :                           const uint8_t *limit, const uint8_t *thresh) {
     186             :   int i;
     187             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     188             :   int count = 4;
     189             : #else
     190           0 :   int count = 8;
     191             : #endif
     192             : 
     193             :   // loop filter designed to work using chars so that we can make maximum use
     194             :   // of 8 bit simd instructions.
     195           0 :   for (i = 0; i < count; ++i) {
     196             : #if !CONFIG_PARALLEL_DEBLOCKING
     197           0 :     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     198           0 :     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     199           0 :     const int8_t mask =
     200           0 :         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     201             : #else   // CONFIG_PARALLEL_DEBLOCKING
     202             :     const uint8_t p1 = s[-2], p0 = s[-1];
     203             :     const uint8_t q0 = s[0], q1 = s[1];
     204             :     const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
     205             : #endif  // !CONFIG_PARALLEL_DEBLOCKING
     206           0 :     filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
     207           0 :     s += pitch;
     208             :   }
     209           0 : }
     210             : 
     211           0 : void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
     212             :                                const uint8_t *limit0, const uint8_t *thresh0,
     213             :                                const uint8_t *blimit1, const uint8_t *limit1,
     214             :                                const uint8_t *thresh1) {
     215           0 :   aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
     216           0 :   aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
     217           0 : }
     218             : 
     219           0 : static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
     220             :                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
     221             :                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
     222             :                            uint8_t *oq2, uint8_t *oq3) {
     223           0 :   if (flat && mask) {
     224           0 :     const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
     225           0 :     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
     226             : 
     227             :     // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
     228           0 :     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
     229           0 :     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
     230           0 :     *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
     231           0 :     *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
     232           0 :     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
     233           0 :     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
     234             :   } else {
     235           0 :     filter4(mask, thresh, op1, op0, oq0, oq1);
     236             :   }
     237           0 : }
     238             : 
     239           0 : void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
     240             :                             const uint8_t *limit, const uint8_t *thresh) {
     241             :   int i;
     242             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     243             :   int count = 4;
     244             : #else
     245           0 :   int count = 8;
     246             : #endif
     247             : 
     248             :   // loop filter designed to work using chars so that we can make maximum use
     249             :   // of 8 bit simd instructions.
     250           0 :   for (i = 0; i < count; ++i) {
     251           0 :     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     252           0 :     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     253             : 
     254           0 :     const int8_t mask =
     255           0 :         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     256           0 :     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     257           0 :     filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
     258           0 :             s + 1 * p, s + 2 * p, s + 3 * p);
     259           0 :     ++s;
     260             :   }
     261           0 : }
     262             : 
     263           0 : void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
     264             :                                  const uint8_t *limit0, const uint8_t *thresh0,
     265             :                                  const uint8_t *blimit1, const uint8_t *limit1,
     266             :                                  const uint8_t *thresh1) {
     267           0 :   aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
     268           0 :   aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
     269           0 : }
     270             : 
     271           0 : void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
     272             :                           const uint8_t *limit, const uint8_t *thresh) {
     273             :   int i;
     274             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     275             :   int count = 4;
     276             : #else
     277           0 :   int count = 8;
     278             : #endif
     279             : 
     280           0 :   for (i = 0; i < count; ++i) {
     281           0 :     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     282           0 :     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     283           0 :     const int8_t mask =
     284           0 :         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     285           0 :     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     286           0 :     filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
     287             :             s + 3);
     288           0 :     s += pitch;
     289             :   }
     290           0 : }
     291             : 
     292           0 : void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
     293             :                                const uint8_t *limit0, const uint8_t *thresh0,
     294             :                                const uint8_t *blimit1, const uint8_t *limit1,
     295             :                                const uint8_t *thresh1) {
     296           0 :   aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
     297           0 :   aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
     298           0 : }
     299             : 
     300             : #if PARALLEL_DEBLOCKING_11_TAP
     301             : static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat,
     302             :                             int8_t flat2, uint8_t *op5, uint8_t *op4,
     303             :                             uint8_t *op3, uint8_t *op2, uint8_t *op1,
     304             :                             uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
     305             :                             uint8_t *oq2, uint8_t *oq3, uint8_t *oq4,
     306             :                             uint8_t *oq5) {
     307             :   if (flat2 && flat && mask) {
     308             :     const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1,
     309             :                   p0 = *op0;
     310             :     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
     311             :                   q5 = *oq5;
     312             : 
     313             :     // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
     314             :     *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12;
     315             :     *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12;
     316             :     *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12;
     317             :     *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12;
     318             :     *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12;
     319             :     *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12;
     320             :     *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12;
     321             :     *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12;
     322             :     *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12;
     323             :     *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12;
     324             :   } else {
     325             :     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
     326             :   }
     327             : }
     328             : #endif
     329             : 
     330             : #if PARALLEL_DEBLOCKING_9_TAP
     331             : static INLINE void filter10(int8_t mask, uint8_t thresh, int8_t flat,
     332             :                             int8_t flat2, uint8_t *op4, uint8_t *op3,
     333             :                             uint8_t *op2, uint8_t *op1, uint8_t *op0,
     334             :                             uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
     335             :                             uint8_t *oq3, uint8_t *oq4) {
     336             :   if (flat2 && flat && mask) {
     337             :     const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
     338             :     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4;
     339             : 
     340             :     // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1]
     341             :     *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10;
     342             :     *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10;
     343             :     *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10;
     344             :     *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10;
     345             :     *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10;
     346             :     *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10;
     347             :     *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10;
     348             :     *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10;
     349             :   } else {
     350             :     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
     351             :   }
     352             : }
     353             : #endif
     354             : 
     355           0 : static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
     356             :                             int8_t flat2, uint8_t *op7, uint8_t *op6,
     357             :                             uint8_t *op5, uint8_t *op4, uint8_t *op3,
     358             :                             uint8_t *op2, uint8_t *op1, uint8_t *op0,
     359             :                             uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
     360             :                             uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
     361             :                             uint8_t *oq6, uint8_t *oq7) {
     362           0 :   if (flat2 && flat && mask) {
     363           0 :     const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
     364           0 :                   p2 = *op2, p1 = *op1, p0 = *op0;
     365             : 
     366           0 :     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
     367           0 :                   q5 = *oq5, q6 = *oq6, q7 = *oq7;
     368             : 
     369             :     // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
     370           0 :     *op6 = ROUND_POWER_OF_TWO(
     371             :         p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
     372           0 :     *op5 = ROUND_POWER_OF_TWO(
     373             :         p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
     374           0 :     *op4 = ROUND_POWER_OF_TWO(
     375             :         p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
     376           0 :     *op3 = ROUND_POWER_OF_TWO(
     377             :         p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
     378           0 :     *op2 = ROUND_POWER_OF_TWO(
     379             :         p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
     380             :         4);
     381           0 :     *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
     382             :                                   q0 + q1 + q2 + q3 + q4 + q5,
     383             :                               4);
     384           0 :     *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
     385             :                                   q1 + q2 + q3 + q4 + q5 + q6,
     386             :                               4);
     387           0 :     *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
     388             :                                   q2 + q3 + q4 + q5 + q6 + q7,
     389             :                               4);
     390           0 :     *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
     391             :                                   q3 + q4 + q5 + q6 + q7 * 2,
     392             :                               4);
     393           0 :     *oq2 = ROUND_POWER_OF_TWO(
     394             :         p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
     395             :         4);
     396           0 :     *oq3 = ROUND_POWER_OF_TWO(
     397             :         p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
     398           0 :     *oq4 = ROUND_POWER_OF_TWO(
     399             :         p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
     400           0 :     *oq5 = ROUND_POWER_OF_TWO(
     401             :         p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
     402           0 :     *oq6 = ROUND_POWER_OF_TWO(
     403             :         p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
     404             :   } else {
     405           0 :     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
     406             :   }
     407           0 : }
     408             : 
     409           0 : static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
     410             :                                      const uint8_t *limit,
     411             :                                      const uint8_t *thresh, int count) {
     412             :   int i;
     413             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     414             :   int step = 4;
     415             : #else
     416           0 :   int step = 8;
     417             : #endif
     418             : 
     419             :   // loop filter designed to work using chars so that we can make maximum use
     420             :   // of 8 bit simd instructions.
     421           0 :   for (i = 0; i < step * count; ++i) {
     422           0 :     const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
     423           0 :                   p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
     424           0 :                   p1 = s[-2 * p], p0 = s[-p];
     425           0 :     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
     426           0 :                   q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p];
     427           0 :     const int8_t mask =
     428           0 :         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     429           0 :     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     430             : 
     431             : #if PARALLEL_DEBLOCKING_11_TAP
     432             :     const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
     433             : 
     434             :     filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
     435             :              s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p,
     436             :              s + 3 * p, s + 4 * p, s + 5 * p);
     437             : 
     438             : #elif PARALLEL_DEBLOCKING_9_TAP
     439             :     const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
     440             : 
     441             :     filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p,
     442             :              s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p,
     443             :              s + 4 * p);
     444             : #else
     445           0 :     const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
     446             : 
     447           0 :     filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
     448           0 :              s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
     449           0 :              s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
     450           0 :              s + 7 * p);
     451             : #endif
     452             : 
     453           0 :     ++s;
     454             :   }
     455           0 : }
     456             : 
     457           0 : void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
     458             :                                  const uint8_t *limit, const uint8_t *thresh) {
     459           0 :   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
     460           0 : }
     461             : 
     462           0 : void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
     463             :                                   const uint8_t *limit, const uint8_t *thresh) {
     464             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     465             :   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
     466             : #else
     467           0 :   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
     468             : #endif
     469           0 : }
     470             : 
     //                Filters `count` rows across a vertical edge, one row per iteration.
     //                Within a row s[-1] is p0 and s[0] is q0; samples s[-8]..s[7] are read.
     //                `p` is added to `s` after every row. Only the first byte behind
     //                `blimit`, `limit` and `thresh` is used. filter_mask(), flat_mask4(),
     //                the other flat_mask helpers and the filterNN() routines are defined
     //                outside this view.
     471           0 : static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
     472             :                                    const uint8_t *limit, const uint8_t *thresh,
     473             :                                    int count) {
     474             :   int i;
     475             : 
     476           0 :   for (i = 0; i < count; ++i) {
     477           0 :     const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4],
     478           0 :                   p2 = s[-3], p1 = s[-2], p0 = s[-1];
     479           0 :     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
     480           0 :                   q5 = s[5], q6 = s[6], q7 = s[7];
     481           0 :     const int8_t mask =
     482           0 :         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     483           0 :     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     484             : 
     //                    PARALLEL_DEBLOCKING_11_TAP and PARALLEL_DEBLOCKING_9_TAP are both
     //                    defined as 0 near the top of this file, so the #else branch (16
     //                    sample pointers, s - 8 .. s + 7) is the one that gets compiled.
     485             : #if PARALLEL_DEBLOCKING_11_TAP
     486             :     const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
     487             : 
     488             :     filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
     489             :              s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5);
     490             : #elif PARALLEL_DEBLOCKING_9_TAP
     491             :     const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
     492             : 
     493             :     filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s,
     494             :              s + 1, s + 2, s + 3, s + 4);
     495             : 
     496             : #else
     497           0 :     const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
     498             : 
     499           0 :     filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
     500             :              s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
     501             :              s + 7);
     502             : #endif
     503             : 
     //                    Step to the same edge position in the next row.
     504           0 :     s += p;
     505             :   }
     506           0 : }
     507             : 
     //                Non-static wrapper around mb_lpf_vertical_edge_w(): filters 4 rows when
     //                both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled, and 8 rows
     //                otherwise. All arguments are forwarded unchanged.
     508           0 : void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
     509             :                            const uint8_t *limit, const uint8_t *thresh) {
     510             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     511             :   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
     512             : #else
     513           0 :   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
     514             : #endif
     515           0 : }
     516             : 
     //                Non-static wrapper around mb_lpf_vertical_edge_w() that always filters
     //                16 rows with one set of blimit/limit/thresh values.
     //                NOTE(review): unlike aom_highbd_lpf_vertical_16_dual_c in this file, the
     //                row count here does not depend on CONFIG_PARALLEL_DEBLOCKING /
     //                CONFIG_CB4X4 -- confirm that this is intended.
     517           0 : void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
     518             :                                 const uint8_t *limit, const uint8_t *thresh) {
     519           0 :   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
     520           0 : }
     521             : 
     522             : #if CONFIG_HIGHBITDEPTH
     523             : #if CONFIG_PARALLEL_DEBLOCKING
     524             : // Filter on/off decision using only the four samples nearest the edge.
     //                Returns all ones (-1) when |p1 - p0| and |q1 - q0| do not exceed `limit`
     //                and |p0 - q0| * 2 + |p1 - q1| / 2 does not exceed `blimit`; returns 0 as
     //                soon as any of the three tests fails. Both limits are shifted left by
     //                (bd - 8) bits before the comparisons.
     525             : static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
     526             :                                          uint16_t p1, uint16_t p0, uint16_t q0,
     527             :                                          uint16_t q1, int bd) {
     //                  `mask` collects failures: each failed test ORs in -1 (all bits set).
     528             :   int8_t mask = 0;
     529             :   int16_t limit16 = (uint16_t)limit << (bd - 8);
     530             :   int16_t blimit16 = (uint16_t)blimit << (bd - 8);
     531             :   mask |= (abs(p1 - p0) > limit16) * -1;
     532             :   mask |= (abs(q1 - q0) > limit16) * -1;
     533             :   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
     //                  Invert so that "no failure" becomes all ones.
     534             :   return ~mask;
     535             : }
     536             : #endif  // CONFIG_PARALLEL_DEBLOCKING
     537             : 
     538             : // Filter on/off decision using four samples on each side of the edge.
     //                Returns all ones (-1) when every neighbouring difference on both sides
     //                (|p3 - p2|, |p2 - p1|, |p1 - p0|, |q1 - q0|, |q2 - q1|, |q3 - q2|) is
     //                within `limit` and |p0 - q0| * 2 + |p1 - q1| / 2 is within `blimit`;
     //                returns 0 otherwise. Both limits are shifted left by (bd - 8) bits
     //                before the comparisons.
     539           0 : static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
     540             :                                         uint16_t p3, uint16_t p2, uint16_t p1,
     541             :                                         uint16_t p0, uint16_t q0, uint16_t q1,
     542             :                                         uint16_t q2, uint16_t q3, int bd) {
     //                  `mask` collects failures: each failed test ORs in -1 (all bits set).
     543           0 :   int8_t mask = 0;
     544           0 :   int16_t limit16 = (uint16_t)limit << (bd - 8);
     545           0 :   int16_t blimit16 = (uint16_t)blimit << (bd - 8);
     546           0 :   mask |= (abs(p3 - p2) > limit16) * -1;
     547           0 :   mask |= (abs(p2 - p1) > limit16) * -1;
     548           0 :   mask |= (abs(p1 - p0) > limit16) * -1;
     549           0 :   mask |= (abs(q1 - q0) > limit16) * -1;
     550           0 :   mask |= (abs(q2 - q1) > limit16) * -1;
     551           0 :   mask |= (abs(q3 - q2) > limit16) * -1;
     552           0 :   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
     //                  Invert so that "no failure" becomes all ones.
     553           0 :   return ~mask;
     554             : }
     555             : 
     //                Flatness test over three samples per side, each measured against the
     //                sample nearest the edge on its own side. Returns all ones (-1) when
     //                |p1 - p0|, |p2 - p0|, |p3 - p0|, |q1 - q0|, |q2 - q0| and |q3 - q0| are
     //                all within thresh << (bd - 8); returns 0 otherwise.
     556           0 : static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
     557             :                                        uint16_t p1, uint16_t p0, uint16_t q0,
     558             :                                        uint16_t q1, uint16_t q2, uint16_t q3,
     559             :                                        int bd) {
     //                  `mask` collects failures and is inverted on return.
     560           0 :   int8_t mask = 0;
     561           0 :   int16_t thresh16 = (uint16_t)thresh << (bd - 8);
     562           0 :   mask |= (abs(p1 - p0) > thresh16) * -1;
     563           0 :   mask |= (abs(q1 - q0) > thresh16) * -1;
     564           0 :   mask |= (abs(p2 - p0) > thresh16) * -1;
     565           0 :   mask |= (abs(q2 - q0) > thresh16) * -1;
     566           0 :   mask |= (abs(p3 - p0) > thresh16) * -1;
     567           0 :   mask |= (abs(q3 - q0) > thresh16) * -1;
     568           0 :   return ~mask;
     569             : }
     570             : 
     //                Extends highbd_flat_mask4() by one more sample per side: returns all
     //                ones (-1) when the flat_mask4 test passes and |p4 - p0| and |q4 - q0|
     //                are also within thresh << (bd - 8); returns 0 otherwise.
     571           0 : static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
     572             :                                        uint16_t p2, uint16_t p1, uint16_t p0,
     573             :                                        uint16_t q0, uint16_t q1, uint16_t q2,
     574             :                                        uint16_t q3, uint16_t q4, int bd) {
     //                  Re-invert the flat_mask4 result to get back its failure bits, add
     //                  the two extra checks, then invert again on return.
     575           0 :   int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     576           0 :   int16_t thresh16 = (uint16_t)thresh << (bd - 8);
     577           0 :   mask |= (abs(p4 - p0) > thresh16) * -1;
     578           0 :   mask |= (abs(q4 - q0) > thresh16) * -1;
     579           0 :   return ~mask;
     580             : }
     581             : 
     582             : // High edge variance test: returns -1 (all 16 bits set) when |p1 - p0| or
     583             : // |q1 - q0| exceeds thresh << (bd - 8), and 0 otherwise.
     584           0 : static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
     585             :                                       uint16_t q0, uint16_t q1, int bd) {
     586           0 :   int16_t hev = 0;
     587           0 :   int16_t thresh16 = (uint16_t)thresh << (bd - 8);
     //                  Unlike the filter/flat masks, the result is not inverted: a set
     //                  value means the threshold was exceeded.
     588           0 :   hev |= (abs(p1 - p0) > thresh16) * -1;
     589           0 :   hev |= (abs(q1 - q0) > thresh16) * -1;
     590           0 :   return hev;
     591             : }
     592             : 
     //                Narrow filter: adjusts the two samples on each side of the edge in
     //                place through op1, op0, oq0 and oq1. `mask` (all ones or 0) gates the
     //                whole adjustment; the high-edge-variance flag computed from `thresh`
     //                decides whether the outer samples contribute to the correction or are
     //                themselves corrected. Intermediate values are clamped with
     //                signed_char_clamp_high(), whose range depends on `bd`.
     593           0 : static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
     594             :                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
     595             :                                   int bd) {
     596             :   int16_t filter1, filter2;
     597             :   // Remove the mid-level offset (0x80 shifted left by bd - 8 bits) so the four
     598             :   // samples are handled as signed values; it is added back when storing.
     599           0 :   int shift = bd - 8;
     600           0 :   const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
     601           0 :   const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
     602           0 :   const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
     603           0 :   const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
     //                  NOTE(review): highbd_hev_mask() returns int16_t but the value is kept
     //                  in a uint16_t here, so `& hev` / `& ~hev` below are evaluated on the
     //                  promoted unsigned pattern and then narrowed back to int16_t --
     //                  confirm this is intended.
     604           0 :   const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
     605             : 
     606             :   // Outer-tap term (ps1 - qs1), kept only when hev is set.
     607           0 :   int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
     608             : 
     609             :   // Add the inner-tap term 3 * (qs0 - ps0); the result is zeroed unless mask is set.
     610           0 :   filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
     611             : 
     612             :   // Two differently rounded versions of filter / 8: +4 before the shift for
     613             :   // the q0 side and +3 before the shift for the p0 side. Each sum is clamped
     614             :   // before it is shifted right by 3.
     615           0 :   filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
     616           0 :   filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
     617             : 
     //                  q0 moves by -filter1 and p0 by +filter2; the offset is restored.
     618           0 :   *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
     619           0 :   *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
     620             : 
     621             :   // Outer samples move by ROUND_POWER_OF_TWO(filter1, 1), and only when hev is clear.
     622           0 :   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
     623             : 
     624           0 :   *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
     625           0 :   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
     626           0 : }
     627             : 
     //                Applies highbd_filter4() across a horizontal edge. `s` addresses the
     //                first q0 sample and `p` is the pitch used to reach the rows above
     //                (s - k * p) and below (s + k * p). 4 columns are processed when both
     //                CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled, 8 otherwise.
     //                Only the first byte behind `blimit`, `limit` and `thresh` is used.
     628           0 : void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
     629             :                                    const uint8_t *blimit, const uint8_t *limit,
     630             :                                    const uint8_t *thresh, int bd) {
     631             :   int i;
     632             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     633             :   int count = 4;
     634             : #else
     635           0 :   int count = 8;
     636             : #endif
     637             : 
     638             :   // One column per iteration: taps are read at multiples of the pitch p above
     639             :   // and below s, and s advances by one sample each time.
     640           0 :   for (i = 0; i < count; ++i) {
     //                    Without CONFIG_PARALLEL_DEBLOCKING the on/off mask looks at four
     //                    samples per side; with it, only at two per side.
     641             : #if !CONFIG_PARALLEL_DEBLOCKING
     642           0 :     const uint16_t p3 = s[-4 * p];
     643           0 :     const uint16_t p2 = s[-3 * p];
     644           0 :     const uint16_t p1 = s[-2 * p];
     645           0 :     const uint16_t p0 = s[-p];
     646           0 :     const uint16_t q0 = s[0 * p];
     647           0 :     const uint16_t q1 = s[1 * p];
     648           0 :     const uint16_t q2 = s[2 * p];
     649           0 :     const uint16_t q3 = s[3 * p];
     650           0 :     const int8_t mask =
     651           0 :         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     652             : #else   // CONFIG_PARALLEL_DEBLOCKING
     653             :     const uint16_t p1 = s[-2 * p];
     654             :     const uint16_t p0 = s[-p];
     655             :     const uint16_t q0 = s[0 * p];
     656             :     const uint16_t q1 = s[1 * p];
     657             :     const int8_t mask =
     658             :         highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
     659             : #endif  // !CONFIG_PARALLEL_DEBLOCKING
     660           0 :     highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
     661           0 :     ++s;
     662             :   }
     663           0 : }
     664             : 
     //                Runs aom_highbd_lpf_horizontal_4_c() twice: at `s` with the *0
     //                parameters and at `s + 8` with the *1 parameters.
     //                NOTE(review): the second call always starts 8 samples along, even in
     //                the configuration where the callee processes only 4 columns --
     //                confirm that the gap is intended.
     665           0 : void aom_highbd_lpf_horizontal_4_dual_c(
     666             :     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     667             :     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     668             :     const uint8_t *thresh1, int bd) {
     669           0 :   aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
     670           0 :   aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
     671           0 : }
     672             : 
     //                Applies highbd_filter4() across a vertical edge. In each row s[-1] is
     //                p0 and s[0] is q0; `pitch` is added to `s` to reach the next row.
     //                4 rows are processed when both CONFIG_PARALLEL_DEBLOCKING and
     //                CONFIG_CB4X4 are enabled, 8 otherwise. Only the first byte behind
     //                `blimit`, `limit` and `thresh` is used.
     673           0 : void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
     674             :                                  const uint8_t *limit, const uint8_t *thresh,
     675             :                                  int bd) {
     676             :   int i;
     677             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     678             :   int count = 4;
     679             : #else
     680           0 :   int count = 8;
     681             : #endif
     682             : 
     683             :   // One row per iteration: taps are read from adjacent samples around s, and
     684             :   // s advances by pitch each time.
     685           0 :   for (i = 0; i < count; ++i) {
     //                    Without CONFIG_PARALLEL_DEBLOCKING the on/off mask looks at four
     //                    samples per side; with it, only at two per side.
     686             : #if !CONFIG_PARALLEL_DEBLOCKING
     687           0 :     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     688           0 :     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     689           0 :     const int8_t mask =
     690           0 :         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     691             : #else   // CONFIG_PARALLEL_DEBLOCKING
     692             :     const uint16_t p1 = s[-2], p0 = s[-1];
     693             :     const uint16_t q0 = s[0], q1 = s[1];
     694             :     const int8_t mask =
     695             :         highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
     696             : #endif  // !CONFIG_PARALLEL_DEBLOCKING
     697           0 :     highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
     698           0 :     s += pitch;
     699             :   }
     700           0 : }
     701             : 
     //                Runs aom_highbd_lpf_vertical_4_c() twice: at `s` with the *0
     //                parameters and 8 rows further down (s + 8 * pitch) with the *1
     //                parameters.
     //                NOTE(review): the second call always starts 8 rows down, even in the
     //                configuration where the callee processes only 4 rows -- confirm that
     //                the gap is intended.
     702           0 : void aom_highbd_lpf_vertical_4_dual_c(
     703             :     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     704             :     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     705             :     const uint8_t *thresh1, int bd) {
     706           0 :   aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
     707           0 :   aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
     708             :                               bd);
     709           0 : }
     710             : 
     //                Medium filter. When both `flat` and `mask` are non-zero, the three
     //                samples nearest the edge on each side (op2..oq2) are replaced by a
     //                7-tap weighted average; op3 and oq3 are read but never written.
     //                Otherwise the work is delegated to highbd_filter4().
     711           0 : static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
     712             :                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
     713             :                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
     714             :                                   uint16_t *oq2, uint16_t *oq3, int bd) {
     715           0 :   if (flat && mask) {
     //                    Snapshot all eight inputs first so every output is computed from
     //                    unmodified samples.
     716           0 :     const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
     717           0 :     const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
     718             : 
     719             :     // 7-tap filter [1, 1, 1, 2, 1, 1, 1]; the weights sum to 8, hence the
     //                    rounding shift by 3. Where the window would reach past p3 or q3,
     //                    that outermost sample is counted again instead.
     720           0 :     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
     721           0 :     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
     722           0 :     *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
     723           0 :     *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
     724           0 :     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
     725           0 :     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
     726             :   } else {
     727           0 :     highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
     728             :   }
     729           0 : }
     730             : 
     //                Applies highbd_filter8() across a horizontal edge. `s` addresses the
     //                first q0 sample and `p` is the pitch used to reach the four rows above
     //                and the three rows below it. 4 columns are processed when both
     //                CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled, 8 otherwise.
     //                Only the first byte behind `blimit`, `limit` and `thresh` is used.
     731           0 : void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
     732             :                                    const uint8_t *limit, const uint8_t *thresh,
     733             :                                    int bd) {
     734             :   int i;
     735             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     736             :   int count = 4;
     737             : #else
     738           0 :   int count = 8;
     739             : #endif
     740             : 
     741             :   // One column per iteration: taps are read at multiples of the pitch p above
     742             :   // and below s, and s advances by one sample each time.
     743           0 :   for (i = 0; i < count; ++i) {
     744           0 :     const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     745           0 :     const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     746             : 
     747           0 :     const int8_t mask =
     748           0 :         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     //                    The flatness test uses a fixed threshold of 1 (scaled by bit depth
     //                    inside highbd_flat_mask4), not the caller's `thresh`.
     749           0 :     const int8_t flat =
     750           0 :         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     751           0 :     highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
     752           0 :                    s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
     753           0 :     ++s;
     754             :   }
     755           0 : }
     756             : 
     //                Runs aom_highbd_lpf_horizontal_8_c() twice: at `s` with the *0
     //                parameters and at `s + 8` with the *1 parameters.
     757           0 : void aom_highbd_lpf_horizontal_8_dual_c(
     758             :     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     759             :     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     760             :     const uint8_t *thresh1, int bd) {
     761           0 :   aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
     762           0 :   aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
     763           0 : }
     764             : 
     //                Applies highbd_filter8() across a vertical edge. In each row s[-1] is
     //                p0 and s[0] is q0, with samples s[-4]..s[3] read; `pitch` is added to
     //                `s` to reach the next row. 4 rows are processed when both
     //                CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled, 8 otherwise.
     765           0 : void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
     766             :                                  const uint8_t *limit, const uint8_t *thresh,
     767             :                                  int bd) {
     768             :   int i;
     769             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     770             :   int count = 4;
     771             : #else
     772           0 :   int count = 8;
     773             : #endif
     774             : 
     775           0 :   for (i = 0; i < count; ++i) {
     776           0 :     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     777           0 :     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     778           0 :     const int8_t mask =
     779           0 :         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     //                    The flatness test uses a fixed threshold of 1 (scaled by bit depth
     //                    inside highbd_flat_mask4), not the caller's `thresh`.
     780           0 :     const int8_t flat =
     781           0 :         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     782           0 :     highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
     783             :                    s + 2, s + 3, bd);
     784           0 :     s += pitch;
     785             :   }
     786           0 : }
     787             : 
     //                Runs aom_highbd_lpf_vertical_8_c() twice: at `s` with the *0
     //                parameters and 8 rows further down (s + 8 * pitch) with the *1
     //                parameters.
     788           0 : void aom_highbd_lpf_vertical_8_dual_c(
     789             :     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     790             :     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     791             :     const uint8_t *thresh1, int bd) {
     792           0 :   aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
     793           0 :   aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
     794             :                               bd);
     795           0 : }
     796             : 
     //                Wide filter. When `flat2`, `flat` and `mask` are all non-zero, the
     //                seven samples nearest the edge on each side (op6..oq6) are replaced
     //                by a 15-tap weighted average; op7 and oq7 are read but never written.
     //                Otherwise the work is delegated to highbd_filter8().
     797           0 : static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
     798             :                                    int8_t flat2, uint16_t *op7, uint16_t *op6,
     799             :                                    uint16_t *op5, uint16_t *op4, uint16_t *op3,
     800             :                                    uint16_t *op2, uint16_t *op1, uint16_t *op0,
     801             :                                    uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
     802             :                                    uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
     803             :                                    uint16_t *oq6, uint16_t *oq7, int bd) {
     804           0 :   if (flat2 && flat && mask) {
     //                    Snapshot all sixteen inputs first so every output is computed
     //                    from unmodified samples.
     805           0 :     const uint16_t p7 = *op7;
     806           0 :     const uint16_t p6 = *op6;
     807           0 :     const uint16_t p5 = *op5;
     808           0 :     const uint16_t p4 = *op4;
     809           0 :     const uint16_t p3 = *op3;
     810           0 :     const uint16_t p2 = *op2;
     811           0 :     const uint16_t p1 = *op1;
     812           0 :     const uint16_t p0 = *op0;
     813           0 :     const uint16_t q0 = *oq0;
     814           0 :     const uint16_t q1 = *oq1;
     815           0 :     const uint16_t q2 = *oq2;
     816           0 :     const uint16_t q3 = *oq3;
     817           0 :     const uint16_t q4 = *oq4;
     818           0 :     const uint16_t q5 = *oq5;
     819           0 :     const uint16_t q6 = *oq6;
     820           0 :     const uint16_t q7 = *oq7;
     821             : 
     822             :     // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]; the weights
     //                    sum to 16, hence the rounding shift by 4. Where the window would
     //                    reach past p7 or q7, that outermost sample is counted again
     //                    instead (the p7 * k and q7 * k terms).
     823           0 :     *op6 = ROUND_POWER_OF_TWO(
     824             :         p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
     825           0 :     *op5 = ROUND_POWER_OF_TWO(
     826             :         p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
     827           0 :     *op4 = ROUND_POWER_OF_TWO(
     828             :         p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
     829           0 :     *op3 = ROUND_POWER_OF_TWO(
     830             :         p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
     831           0 :     *op2 = ROUND_POWER_OF_TWO(
     832             :         p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
     833             :         4);
     834           0 :     *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
     835             :                                   q0 + q1 + q2 + q3 + q4 + q5,
     836             :                               4);
     837           0 :     *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
     838             :                                   q1 + q2 + q3 + q4 + q5 + q6,
     839             :                               4);
     840           0 :     *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
     841             :                                   q2 + q3 + q4 + q5 + q6 + q7,
     842             :                               4);
     843           0 :     *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
     844             :                                   q3 + q4 + q5 + q6 + q7 * 2,
     845             :                               4);
     846           0 :     *oq2 = ROUND_POWER_OF_TWO(
     847             :         p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
     848             :         4);
     849           0 :     *oq3 = ROUND_POWER_OF_TWO(
     850             :         p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
     851           0 :     *oq4 = ROUND_POWER_OF_TWO(
     852             :         p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
     853           0 :     *oq5 = ROUND_POWER_OF_TWO(
     854             :         p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
     855           0 :     *oq6 = ROUND_POWER_OF_TWO(
     856             :         p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
     857             :   } else {
     858           0 :     highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
     859             :                    bd);
     860             :   }
     861           0 : }
     862             : 
     //                Applies highbd_filter16() across a horizontal edge for step * count
     //                columns, where step is 4 when both CONFIG_PARALLEL_DEBLOCKING and
     //                CONFIG_CB4X4 are enabled and 8 otherwise. `s` addresses the first q0
     //                sample; rows s - 8 * p .. s + 7 * p are read. Only the first byte
     //                behind `blimit`, `limit` and `thresh` is used.
     863           0 : static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
     864             :                                             const uint8_t *blimit,
     865             :                                             const uint8_t *limit,
     866             :                                             const uint8_t *thresh, int count,
     867             :                                             int bd) {
     868             :   int i;
     869             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     870             :   int step = 4;
     871             : #else
     872           0 :   int step = 8;
     873             : #endif
     874             : 
     875             :   // One column per iteration: taps are read at multiples of the pitch p above
     876             :   // and below s, and s advances by one sample each time.
     877           0 :   for (i = 0; i < step * count; ++i) {
     878           0 :     const uint16_t p3 = s[-4 * p];
     879           0 :     const uint16_t p2 = s[-3 * p];
     880           0 :     const uint16_t p1 = s[-2 * p];
     881           0 :     const uint16_t p0 = s[-p];
     882           0 :     const uint16_t q0 = s[0 * p];
     883           0 :     const uint16_t q1 = s[1 * p];
     884           0 :     const uint16_t q2 = s[2 * p];
     885           0 :     const uint16_t q3 = s[3 * p];
     886           0 :     const int8_t mask =
     887           0 :         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     888           0 :     const int8_t flat =
     889           0 :         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     //                    flat2 checks the four outer samples on each side (rows 5..8 above
     //                    the edge, rows 4..7 below) against p0 / q0 with a threshold of 1.
     890           0 :     const int8_t flat2 =
     891           0 :         highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
     892           0 :                           s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
     893             : 
     894           0 :     highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
     895           0 :                     s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
     896             :                     s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
     897             :                     s + 6 * p, s + 7 * p, bd);
     898           0 :     ++s;
     899             :   }
     900           0 : }
     901             : 
     //                Non-static wrapper: calls highbd_mb_lpf_horizontal_edge_w() with a
     //                count of 1, i.e. one group of `step` columns as defined in the callee.
     902           0 : void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
     903             :                                         const uint8_t *blimit,
     904             :                                         const uint8_t *limit,
     905             :                                         const uint8_t *thresh, int bd) {
     906           0 :   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
     907           0 : }
     908             : 
     //                Non-static wrapper: calls highbd_mb_lpf_horizontal_edge_w() with a
     //                count of 1 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are
     //                enabled, and a count of 2 otherwise.
     909           0 : void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
     910             :                                          const uint8_t *blimit,
     911             :                                          const uint8_t *limit,
     912             :                                          const uint8_t *thresh, int bd) {
     913             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     914             :   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
     915             : #else
     916           0 :   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
     917             : #endif
     918           0 : }
     919             : 
     //                Applies highbd_filter16() across a vertical edge for `count` rows.
     //                In each row s[-1] is p0 and s[0] is q0, with samples s[-8]..s[7] read;
     //                `p` is added to `s` to reach the next row. Only the first byte behind
     //                `blimit`, `limit` and `thresh` is used.
     920           0 : static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
     921             :                                           const uint8_t *blimit,
     922             :                                           const uint8_t *limit,
     923             :                                           const uint8_t *thresh, int count,
     924             :                                           int bd) {
     925             :   int i;
     926             : 
     927           0 :   for (i = 0; i < count; ++i) {
     928           0 :     const uint16_t p3 = s[-4];
     929           0 :     const uint16_t p2 = s[-3];
     930           0 :     const uint16_t p1 = s[-2];
     931           0 :     const uint16_t p0 = s[-1];
     932           0 :     const uint16_t q0 = s[0];
     933           0 :     const uint16_t q1 = s[1];
     934           0 :     const uint16_t q2 = s[2];
     935           0 :     const uint16_t q3 = s[3];
     936           0 :     const int8_t mask =
     937           0 :         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     938           0 :     const int8_t flat =
     939           0 :         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     //                    flat2 checks the four outer samples on each side (s[-8]..s[-5] and
     //                    s[4]..s[7]) against p0 / q0 with a threshold of 1.
     940           0 :     const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
     941           0 :                                            q0, s[4], s[5], s[6], s[7], bd);
     942             : 
     943           0 :     highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
     944             :                     s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
     945             :                     s + 5, s + 6, s + 7, bd);
     946           0 :     s += p;
     947             :   }
     948           0 : }
     949             : 
     //                Non-static wrapper around highbd_mb_lpf_vertical_edge_w(): filters
     //                4 rows when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are
     //                enabled, and 8 rows otherwise.
     950           0 : void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
     951             :                                   const uint8_t *limit, const uint8_t *thresh,
     952             :                                   int bd) {
     953             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     954             :   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
     955             : #else
     956           0 :   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
     957             : #endif
     958           0 : }
     959             : 
     //                Non-static wrapper around highbd_mb_lpf_vertical_edge_w() covering
     //                twice as many rows as aom_highbd_lpf_vertical_16_c: 8 rows when both
     //                CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled, 16 otherwise.
     //                A single set of blimit/limit/thresh values is used for all rows.
     960           0 : void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
     961             :                                        const uint8_t *blimit,
     962             :                                        const uint8_t *limit,
     963             :                                        const uint8_t *thresh, int bd) {
     964             : #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
     965             :   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
     966             : #else
     967           0 :   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
     968             : #endif
     969           0 : }
     970             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13