LCOV - code coverage report
Current view: top level - third_party/aom/av1/common - clpf_simd.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 226 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 40 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "./av1_rtcd.h"
      13             : #include "./cdef_simd.h"
      14             : #include "aom_ports/bitops.h"
      15             : #include "aom_ports/mem.h"
      16             : 
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                           unsigned int adjdamp) {
  // 16-bit differences, saturating-packed down to signed 8-bit lanes.
  const v256 diff16 = v256_sub_16(a, b);
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
  // All-ones (-1) lane mask where the difference is negative.
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
  diff = v128_abs_s8(diff);
  // Clamp |diff| to max(0, strength - (|diff| >> adjdamp)); the saturating
  // subtract provides the max(0, ...).  Then (x + sign) ^ sign negates x in
  // exactly the lanes where sign == -1, restoring the sign of the result.
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
}
      30             : 
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
                            v256 f, v256 g, v256 h, unsigned int s,
                            unsigned int dmp) {
  // Sum of the four weight-3 taps (b, d, e, g).
  const v128 bdeg =
      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
  // Weight-1 taps (a, c, f, h) plus 3 * bdeg; total weight is 16.
  const v128 delta = v128_add_8(
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
  // x + (8 + delta - (delta < 0)) >> 4: rounded divide by 16, with the extra
  // -1 (v128_cmplt_s8 yields -1 when true) biasing negative deltas so the
  // rounding is symmetric about zero.
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(8),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          4));
}
      52             : 
// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
//         3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d)
SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
                             unsigned int s, unsigned int dmp) {
  // Sum of the two weight-3 taps (b, c).
  const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
  // Weight-1 taps (a, d) plus 3 * bc; total weight is 8.
  const v128 delta =
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(v128_add_8(bc, bc), bc));
  // x + (4 + delta - (delta < 0)) >> 3: rounded divide by 8, biased by -1
  // for negative deltas so rounding is symmetric about zero.
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(4),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          3));
}
      68             : 
// Process blocks of width 8, two lines at a time, 8 bit.
static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // l1/l2 are the two rows being filtered; l3/l4 are the rows directly
    // above l1 and below l2.
    const v128 l1 = v128_load_aligned(src);
    const v128 l2 = v128_load_aligned(src + sstride);
    const v128 l3 = v128_load_aligned(src - sstride);
    const v128 l4 = v128_load_aligned(src + 2 * sstride);
    // Vertical taps: each v256 stacks the neighbour for row l1 (high half)
    // on top of the neighbour for row l2 (low half).
    const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
    const v256 b = v256_from_v128(l3, l1);
    const v256 g = v256_from_v128(l2, l4);
    const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
    // Horizontal taps: the same two rows shifted left by 2 and 1 (c, d) and
    // right by 1 and 2 (e, f) pixels.
    const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
                              strength, adjdamp);

    // High 64 bits hold the first filtered row, low 64 bits the second.
    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}
     102             : 
// Process blocks of width 4, four lines at a time, 8 bit.
static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    // l0..l7 are rows y-2 .. y+5; the four rows being filtered are l2..l5.
    const v64 l0 = v64_load_aligned(src - 2 * sstride);
    const v64 l1 = v64_load_aligned(src - sstride);
    const v64 l2 = v64_load_aligned(src);
    const v64 l3 = v64_load_aligned(src + sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v64 l5 = v64_load_aligned(src + 3 * sstride);
    const v64 l6 = v64_load_aligned(src + 4 * sstride);
    const v64 l7 = v64_load_aligned(src + 5 * sstride);
    // Arguments to calc_delta: x = the four filtered rows, then the two rows
    // above (a, b), columns shifted left by 2 and 1 (c, d), shifted right by
    // 1 and 2 (e, f), and the two rows below (g, h).
    const v128 o =
        calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
                   v256_from_v64(l1, l2, l3, l4),
                   v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2)),
                   v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1)),
                   v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1)),
                   v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2)),
                   v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
                   strength, adjdamp);

    // The four filtered 4-pixel rows are packed high-to-low in o; shift each
    // into the low 32 bits before storing.
    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}
     150             : 
// Process blocks of width 8, horizontal filter only, two lines at a time,
// 8 bit.
static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // x holds the two rows being filtered (first row in the high half).
    const v256 x = v256_from_v128(v128_load_aligned(src),
                                  v128_load_aligned(src + sstride));
    // Horizontal taps only: columns shifted left by 2 and 1 (a, b) and
    // right by 1 and 2 (c, d).
    const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);

    // High 64 bits hold the first filtered row, low 64 bits the second.
    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}
     176             : 
// Process blocks of width 4, horizontal filter only, four lines at a time,
// 8 bit.
static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    // Horizontal taps: the four rows shifted left by 2 and 1 (a, b) and
    // right by 1 and 2 (c, d) pixels.
    const v256 a = v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2));
    const v256 b = v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1));
    const v256 c = v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1));
    const v256 d = v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2));

    const v128 o = calc_hdelta(
        v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride),
                      v64_load_aligned(src + 2 * sstride),
                      v64_load_aligned(src + 3 * sstride)),
        a, b, c, d, strength, adjdamp);

    // The four filtered 4-pixel rows are packed high-to-low in o; shift each
    // into the low 32 bits before storing.
    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}
     217             : 
     218           0 : void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride,
     219             :                                int sstride, int sizex, int sizey,
     220             :                                unsigned int strength, unsigned int dmp) {
     221           0 :   if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
     222             :     // Fallback to C for odd sizes:
     223             :     // * block widths not 4 or 8
     224             :     // * block heights not a multiple of 4 if the block width is 4
     225           0 :     aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
     226             :   } else {
     227           0 :     (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))(
     228           0 :         dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
     229             :   }
     230           0 : }
     231             : 
     232           0 : void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
     233             :                                 int sstride, int sizex, int sizey,
     234             :                                 unsigned int strength, unsigned int dmp) {
     235           0 :   if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
     236             :     // Fallback to C for odd sizes:
     237             :     // * block widths not 4 or 8
     238             :     // * block heights not a multiple of 4 if the block width is 4
     239           0 :     aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
     240             :   } else {
     241           0 :     (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))(
     242           0 :         dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
     243             :   }
     244           0 : }
     245             : 
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
// High-bitdepth variant: works on 16-bit lanes throughout.
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                v128 f, v128 g, v128 h, unsigned int s,
                                unsigned int dmp) {
  // Sum of the four weight-3 taps (b, d, e, g).
  const v128 bdeg = v128_add_16(
      v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
  // Weight-1 taps (a, c, f, h) plus 3 * bdeg; total weight is 16.
  const v128 delta = v128_add_16(
      v128_add_16(
          v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
          v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
  // x + (8 + delta - (delta < 0)) >> 4: rounded divide by 16, biased by -1
  // for negative deltas so rounding is symmetric about zero.
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(8),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          4));
}
     268             : 
     269           0 : static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
     270             :                             v128 f, v128 g, v128 h, uint16_t *dst,
     271             :                             unsigned int s, unsigned int dmp, int dstride) {
     272           0 :   o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
     273           0 :   v64_store_aligned(dst, v128_high_v64(o));
     274           0 :   v64_store_aligned(dst + dstride, v128_low_v64(o));
     275           0 : }
     276             : 
     277           0 : static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
     278             :                             v128 f, v128 g, v128 h, uint16_t *dst,
     279             :                             unsigned int s, unsigned int adjdamp) {
     280           0 :   v128_store_aligned(dst,
     281             :                      calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp));
     282           0 : }
     283             : 
// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
//         3/16 * constrain(c, x, s, dmp) + 1/16 * constrain(d, x, s, dmp)
// High-bitdepth horizontal variant: works on 16-bit lanes throughout.
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
                                 unsigned int s, unsigned int dmp) {
  // Sum of the two weight-3 taps (b, c).
  const v128 bc =
      v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
  // Weight-1 taps (a, d) plus 3 * bc; total weight is 8.
  const v128 delta = v128_add_16(
      v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(v128_add_16(bc, bc), bc));
  // x + (4 + delta - (delta < 0)) >> 3: rounded divide by 8, biased by -1
  // for negative deltas so rounding is symmetric about zero.
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(4),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          3));
}
     300             : 
     301           0 : static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
     302             :                              uint16_t *dst, unsigned int s,
     303             :                              unsigned int adjdamp, int dstride) {
     304           0 :   o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp);
     305           0 :   v64_store_aligned(dst, v128_high_v64(o));
     306           0 :   v64_store_aligned(dst + dstride, v128_low_v64(o));
     307           0 : }
     308             : 
     309           0 : static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
     310             :                              uint16_t *dst, unsigned int s,
     311             :                              unsigned int adjdamp) {
     312           0 :   v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp));
     313           0 : }
     314             : 
// Process blocks of width 4, two lines at a time, high bitdepth.
static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // l1/l2 are the two rows being filtered; l3/l4 are the rows directly
    // above l1 and below l2.
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
    const v64 l3 = v64_load_aligned(src - sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    // Vertical taps: each v128 stacks the neighbour for row l1 (high half)
    // on top of the neighbour for row l2 (low half).
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    // Horizontal taps: the same two rows shifted left by 2 and 1 (c, d) and
    // right by 1 and 2 (e, f) pixels.
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
                    strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}
     346             : 
     347             : // The most simple case.  Start here if you need to understand the functions.
     348           0 : static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
     349             :                                       int dstride, int sstride, int sizey,
     350             :                                       unsigned int strength,
     351             :                                       unsigned int adjdamp) {
     352             :   int y;
     353             : 
     354           0 :   for (y = 0; y < sizey; y++) {
     355           0 :     const v128 o = v128_load_aligned(src);
     356           0 :     const v128 a = v128_load_aligned(src - 2 * sstride);
     357           0 :     const v128 b = v128_load_aligned(src - 1 * sstride);
     358           0 :     const v128 g = v128_load_aligned(src + sstride);
     359           0 :     const v128 h = v128_load_aligned(src + 2 * sstride);
     360           0 :     const v128 c = v128_load_unaligned(src - 2);
     361           0 :     const v128 d = v128_load_unaligned(src - 1);
     362           0 :     const v128 e = v128_load_unaligned(src + 1);
     363           0 :     const v128 f = v128_load_unaligned(src + 2);
     364             : 
     365           0 :     calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
     366           0 :     src += sstride;
     367           0 :     dst += dstride;
     368             :   }
     369           0 : }
     370             : 
// Process blocks of width 4, horizontal filter only, two lines at a time,
// high bitdepth.
static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
                                        int dstride, int sstride, int sizey,
                                        unsigned int strength,
                                        unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // Horizontal taps: the two rows shifted left by 2 and 1 (a, b) and
    // right by 1 and 2 (c, d) pixels (first row in the high half).
    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
                                   v64_load_unaligned(src + sstride)),
                     a, b, c, d, dst, strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}
     395             : 
     396             : // Process blocks of width 8, horizontal filter, two lines at time.
     397           0 : static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
     398             :                                        int dstride, int sstride, int sizey,
     399             :                                        unsigned int strength,
     400             :                                        unsigned int adjdamp) {
     401             :   int y;
     402             : 
     403           0 :   for (y = 0; y < sizey; y++) {
     404           0 :     const v128 o = v128_load_aligned(src);
     405           0 :     const v128 a = v128_load_unaligned(src - 2);
     406           0 :     const v128 b = v128_load_unaligned(src - 1);
     407           0 :     const v128 c = v128_load_unaligned(src + 1);
     408           0 :     const v128 d = v128_load_unaligned(src + 2);
     409             : 
     410           0 :     calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp);
     411           0 :     src += sstride;
     412           0 :     dst += dstride;
     413             :   }
     414           0 : }
     415             : 
     416           0 : void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
     417             :                                    int dstride, int sstride, int sizex,
     418             :                                    int sizey, unsigned int strength,
     419             :                                    unsigned int dmp) {
     420           0 :   if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
     421             :     // Fallback to C for odd sizes:
     422             :     // * block width not 4 or 8
     423             :     // * block heights not a multiple of 2 if the block width is 4
     424           0 :     aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
     425             :                          dmp);
     426             :   } else {
     427           0 :     (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))(
     428           0 :         dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
     429             :   }
     430           0 : }
     431             : 
     432           0 : void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
     433             :                                     int dstride, int sstride, int sizex,
     434             :                                     int sizey, unsigned int strength,
     435             :                                     unsigned int dmp) {
     436           0 :   if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
     437             :     // Fallback to C for odd sizes:
     438             :     // * block width not 4 or 8
     439             :     // * block heights not a multiple of 2 if the block width is 4
     440           0 :     aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
     441             :                           dmp);
     442             :   } else {
     443           0 :     (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))(
     444           0 :         dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
     445             :   }
     446           0 : }

Generated by: LCOV version 1.13