LCOV - code coverage report
Current view: top level - third_party/aom/av1/common - od_dering_simd.h (source / functions)
Test:     output.info
Date:     2017-07-14 16:53:18
Coverage: Lines:     0 / 223 hit (0.0 %)
          Functions: 0 /  31 hit (0.0 %)
Legend:   Lines: hit | not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "./av1_rtcd.h"
      13             : #include "./cdef_simd.h"
      14             : #include "./od_dering.h"
      15             : 
      16             : /* partial A is a 16-bit vector of the form:
      17             :    [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
      18             :    [0  y1 y2 y3 y4 y5 y6 y7].
      19             :    This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
      20             :    (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
      21             :    and const2. */
      22           0 : static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
      23             :                                     v128 const2) {
      24             :   v128 tmp;
      25             :   /* Reverse partial B. */
      26           0 :   partialb = v128_shuffle_8(
      27             :       partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
      28             :   /* Interleave the x and y values of identical indices and pair x8 with 0. */
      29           0 :   tmp = partiala;
      30           0 :   partiala = v128_ziplo_16(partialb, partiala);
      31           0 :   partialb = v128_ziphi_16(partialb, tmp);
      32             :   /* Square and add the corresponding x and y values. */
      33           0 :   partiala = v128_madd_s16(partiala, partiala);
      34           0 :   partialb = v128_madd_s16(partialb, partialb);
      35             :   /* Multiply by constant. */
      36           0 :   partiala = v128_mullo_s32(partiala, const1);
      37           0 :   partialb = v128_mullo_s32(partialb, const2);
      38             :   /* Sum all results. */
      39           0 :   partiala = v128_add_32(partiala, partialb);
      40           0 :   return partiala;
      41             : }
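
For reference, a scalar sketch of the formula in the comment above (an illustrative assumption, not part of the instrumented source): x holds x1..x8 from partial A, y holds y1..y7 from partial B (y8 is implicitly 0), and C holds the eight constants packed into const1/const2. The SIMD version keeps the result spread across four 32-bit lanes; the final horizontal reduction happens later in hsum4().

#include <stdint.h>

static int32_t fold_mul_and_sum_scalar(const int16_t x[8], const int16_t y[7],
                                       const int32_t C[8]) {
  int32_t sum = 0;
  int k;
  for (k = 0; k < 8; k++) {
    const int32_t yk = (k < 7) ? y[k] : 0; /* x8 is paired with 0 */
    sum += (x[k] * x[k] + yk * yk) * C[k];
  }
  return sum;
}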
      42             : 
      43           0 : static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
      44             :   v128 t0, t1, t2, t3;
      45           0 :   t0 = v128_ziplo_32(x1, x0);
      46           0 :   t1 = v128_ziplo_32(x3, x2);
      47           0 :   t2 = v128_ziphi_32(x1, x0);
      48           0 :   t3 = v128_ziphi_32(x3, x2);
      49           0 :   x0 = v128_ziplo_64(t1, t0);
      50           0 :   x1 = v128_ziphi_64(t1, t0);
      51           0 :   x2 = v128_ziplo_64(t3, t2);
      52           0 :   x3 = v128_ziphi_64(t3, t2);
      53           0 :   return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
      54             : }
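
A note on hsum4() above: 32-bit lane i of the return value holds the horizontal sum of the four lanes of argument xi, i.e. the result is [sum(x0), sum(x1), sum(x2), sum(x3)]. compute_directions() below relies on this ordering to place the four direction costs side by side before storing them with v128_store_unaligned().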
      55             : 
      56             : /* Computes cost for directions 4, 5, 6 and 7. We can call this function again
      57             :    to compute the remaining directions. */
      58           0 : static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
      59             :   v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
      60             :   v128 partial6;
      61             :   v128 tmp;
      62             :   /* Partial sums for lines 0 and 1. */
      63           0 :   partial4a = v128_shl_n_byte(lines[0], 14);
      64           0 :   partial4b = v128_shr_n_byte(lines[0], 2);
      65           0 :   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
      66           0 :   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
      67           0 :   tmp = v128_add_16(lines[0], lines[1]);
      68           0 :   partial5a = v128_shl_n_byte(tmp, 10);
      69           0 :   partial5b = v128_shr_n_byte(tmp, 6);
      70           0 :   partial7a = v128_shl_n_byte(tmp, 4);
      71           0 :   partial7b = v128_shr_n_byte(tmp, 12);
      72           0 :   partial6 = tmp;
      73             : 
      74             :   /* Partial sums for lines 2 and 3. */
      75           0 :   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
      76           0 :   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
      77           0 :   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
      78           0 :   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
      79           0 :   tmp = v128_add_16(lines[2], lines[3]);
      80           0 :   partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
      81           0 :   partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
      82           0 :   partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
      83           0 :   partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
      84           0 :   partial6 = v128_add_16(partial6, tmp);
      85             : 
      86             :   /* Partial sums for lines 4 and 5. */
      87           0 :   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
      88           0 :   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
      89           0 :   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
      90           0 :   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
      91           0 :   tmp = v128_add_16(lines[4], lines[5]);
      92           0 :   partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
      93           0 :   partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
      94           0 :   partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
      95           0 :   partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
      96           0 :   partial6 = v128_add_16(partial6, tmp);
      97             : 
      98             :   /* Partial sums for lines 6 and 7. */
      99           0 :   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
     100           0 :   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
     101           0 :   partial4a = v128_add_16(partial4a, lines[7]);
     102           0 :   tmp = v128_add_16(lines[6], lines[7]);
     103           0 :   partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
     104           0 :   partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
     105           0 :   partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
     106           0 :   partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
     107           0 :   partial6 = v128_add_16(partial6, tmp);
     108             : 
     109             :   /* Compute costs in terms of partial sums. */
     110           0 :   partial4a =
     111           0 :       fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
     112             :                        v128_from_32(105, 120, 140, 168));
     113           0 :   partial7a =
     114           0 :       fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
     115             :                        v128_from_32(105, 105, 105, 140));
     116           0 :   partial5a =
     117           0 :       fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
     118             :                        v128_from_32(105, 105, 105, 140));
     119           0 :   partial6 = v128_madd_s16(partial6, partial6);
     120           0 :   partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
     121             : 
     122           0 :   partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
     123             :   v128_store_unaligned(tmp_cost1, partial4a);
     124           0 :   return partial4a;
     125             : }
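
The constants fed to fold_mul_and_sum() above are 840/N, where N is the number of pixels in each line of the direction being scored (840 = lcm(1..8), so every division is exact): a direction's cost is the sum over its lines of (line sum)^2 * 840/N, and the byte shifts align pixels belonging to the same line into the same 16-bit lane so the line sums build up with plain vector adds. As a reference, a scalar sketch of this cost for one of the two diagonal directions of a zero-mean 8x8 block (an illustration under that assumption, not part of the instrumented source):

#include <stdint.h>

static int32_t diagonal_cost_scalar(const int16_t p[8][8]) {
  int32_t line_sum[15] = { 0 };
  int32_t cost = 0;
  int i, j, k;
  /* Group pixels into the 15 diagonal lines with constant i + j. */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++) line_sum[i + j] += p[i][j];
  for (k = 0; k < 15; k++) {
    const int n = (k < 8) ? k + 1 : 15 - k; /* line lengths 1..8..1 */
    cost += line_sum[k] * line_sum[k] * (840 / n);
  }
  return cost;
}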
     126             : 
     127             : /* Transpose and reverse the order of the lines -- equivalent to a 90-degree
     128             :    counter-clockwise rotation of the pixels. */
     129           0 : static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
     130           0 :   const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
     131           0 :   const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
     132           0 :   const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
     133           0 :   const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
     134           0 :   const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
     135           0 :   const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
     136           0 :   const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
     137           0 :   const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
     138             : 
     139           0 :   const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
     140           0 :   const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
     141           0 :   const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
     142           0 :   const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
     143           0 :   const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
     144           0 :   const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
     145           0 :   const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
     146           0 :   const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
     147             : 
     148           0 :   res[7] = v128_ziplo_64(tr1_1, tr1_0);
     149           0 :   res[6] = v128_ziphi_64(tr1_1, tr1_0);
     150           0 :   res[5] = v128_ziplo_64(tr1_3, tr1_2);
     151           0 :   res[4] = v128_ziphi_64(tr1_3, tr1_2);
     152           0 :   res[3] = v128_ziplo_64(tr1_5, tr1_4);
     153           0 :   res[2] = v128_ziphi_64(tr1_5, tr1_4);
     154           0 :   res[1] = v128_ziplo_64(tr1_7, tr1_6);
     155           0 :   res[0] = v128_ziphi_64(tr1_7, tr1_6);
     156           0 : }
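
In index terms the routine above gives res[i][j] = in[j][7 - i]: column 0 of the input becomes the last output row, which is the 90-degree rotation the comment describes.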
     157             : 
     158           0 : int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
     159             :                             int coeff_shift) {
     160             :   int i;
     161             :   int32_t cost[8];
     162           0 :   int32_t best_cost = 0;
     163           0 :   int best_dir = 0;
     164             :   v128 lines[8];
     165           0 :   for (i = 0; i < 8; i++) {
     166           0 :     lines[i] = v128_load_unaligned(&img[i * stride]);
     167           0 :     lines[i] =
     168           0 :         v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
     169             :   }
     170             : 
     171             : #if defined(__SSE4_1__)
     172             :   /* Compute "mostly vertical" directions. */
     173           0 :   __m128i dir47 = compute_directions(lines, cost + 4);
     174             : 
     175           0 :   array_reverse_transpose_8x8(lines, lines);
     176             : 
     177             :   /* Compute "mostly horizontal" directions. */
     178           0 :   __m128i dir03 = compute_directions(lines, cost);
     179             : 
     180           0 :   __m128i max = _mm_max_epi32(dir03, dir47);
     181           0 :   max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
     182           0 :   max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
     183           0 :   best_cost = _mm_cvtsi128_si32(max);
     184           0 :   __m128i t =
     185           0 :       _mm_packs_epi32(_mm_cmpeq_epi32(max, dir03), _mm_cmpeq_epi32(max, dir47));
     186           0 :   best_dir = _mm_movemask_epi8(_mm_packs_epi16(t, t));
     187           0 :   best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
     188             : #else
     189             :   /* Compute "mostly vertical" directions. */
     190           0 :   compute_directions(lines, cost + 4);
     191             : 
     192           0 :   array_reverse_transpose_8x8(lines, lines);
     193             : 
     194             :   /* Compute "mostly horizontal" directions. */
     195           0 :   compute_directions(lines, cost);
     196             : 
     197           0 :   for (i = 0; i < 8; i++) {
     198           0 :     if (cost[i] > best_cost) {
     199           0 :       best_cost = cost[i];
     200           0 :       best_dir = i;
     201             :     }
     202             :   }
     203             : #endif
     204             : 
     205             :   /* Difference between the optimal variance and the variance along the
     206             :      orthogonal direction. Again, the sum(x^2) terms cancel out. */
     207           0 :   *var = best_cost - cost[(best_dir + 4) & 7];
     208             :   /* We'd normally divide by 840, but dividing by 1024 is close enough
     209             :      for what we're going to do with this. */
     210           0 :   *var >>= 10;
     211           0 :   return best_dir;
     212             : }
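
A worked example of the trailing-zero trick in the SSE4.1 branch above (the mask value below is hypothetical): _mm_movemask_epi8() of the packed comparisons produces a mask whose low eight bits have bit i set when cost[i] equals the maximum (the upper eight bits are a duplicate), best_dir ^ (best_dir - 1) keeps exactly the bits up to and including the lowest set bit, and get_msb() of that value is therefore the index of the smallest direction whose cost reached the maximum.

/* Hypothetical mask where cost[2] and cost[4] both reach the maximum:
 *   mask              = 0b00010100
 *   mask - 1          = 0b00010011
 *   mask ^ (mask - 1) = 0b00000111   (bits 0..2 set)
 *   get_msb(0b00000111) = 2          -> best_dir = 2
 */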
     213             : 
     214           0 : void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
     215             :                                                const uint16_t *in,
     216             :                                                int threshold, int dir,
     217             :                                                int damping) {
     218             :   int i;
     219             :   v128 p0, p1, sum, row, res;
     220           0 :   int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
     221           0 :   int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
     222             : 
     223           0 :   if (threshold) damping -= get_msb(threshold);
     224           0 :   for (i = 0; i < 4; i += 2) {
     225           0 :     sum = v128_zero();
     226           0 :     row = v128_from_v64(v64_load_aligned(&in[i * OD_FILT_BSTRIDE]),
     227           0 :                         v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]));
     228             : 
     229             :     // p0 = constrain16(in[i*OD_FILT_BSTRIDE + o1], row, threshold, damping)
     230           0 :     p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]),
     231           0 :                        v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o1]));
     232           0 :     p0 = constrain16(p0, row, threshold, damping);
     233             : 
     234             :     // p1 = constrain16(in[i*OD_FILT_BSTRIDE - o1], row, threshold, damping)
     235           0 :     p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]),
     236           0 :                        v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o1]));
     237           0 :     p1 = constrain16(p1, row, threshold, damping);
     238             : 
     239             :     // sum += 4 * (p0 + p1)
     240           0 :     sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2));
     241             : 
     242             :     // p0 = constrain16(in[i*OD_FILT_BSTRIDE + o2], row, threshold, damping)
     243           0 :     p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]),
     244           0 :                        v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o2]));
     245           0 :     p0 = constrain16(p0, row, threshold, damping);
     246             : 
     247             :     // p1 = constrain16(in[i*OD_FILT_BSTRIDE - o2], row, threshold, damping)
     248           0 :     p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]),
     249           0 :                        v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o2]));
     250           0 :     p1 = constrain16(p1, row, threshold, damping);
     251             : 
     252             :     // sum += 1 * (p0 + p1)
     253           0 :     sum = v128_add_16(sum, v128_add_16(p0, p1));
     254             : 
     255             :     // res = row + ((sum + 8) >> 4)
     256           0 :     res = v128_add_16(sum, v128_dup_16(8));
     257           0 :     res = v128_shr_n_s16(res, 4);
     258           0 :     res = v128_add_16(row, res);
     259           0 :     v64_store_aligned(&y[i * ystride], v128_high_v64(res));
     260           0 :     v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res));
     261             :   }
     262           0 : }
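
A scalar sketch of one output pixel of the 4x4 directional filter above. The vector constrain16() is defined in cdef_simd.h; the scalar constraint below is an assumption about its behaviour (differences larger than the threshold/damping pair allows contribute progressively less), and adjdamp stands for damping already reduced by get_msb(threshold) as done at the top of the function. This is an illustration, not part of the instrumented source.

#include <stdint.h>
#include <stdlib.h>

static int constrain(int diff, int threshold, int adjdamp) {
  int mag, s;
  if (!threshold) return 0;
  mag = abs(diff);
  s = threshold - (mag >> adjdamp);
  if (s < 0) s = 0;
  if (mag > s) mag = s;
  return diff < 0 ? -mag : mag;
}

/* One output pixel: in points at the centre pixel inside a padded buffer,
   taps at +/-o1 are weighted 4, taps at +/-o2 are weighted 1, and the sum is
   rounded with (sum + 8) >> 4 before being added back to the centre pixel. */
static uint16_t filter_pixel_4x4(const uint16_t *in, int o1, int o2,
                                 int threshold, int adjdamp) {
  const int x = in[0];
  int sum = 0;
  sum += 4 * constrain(in[o1] - x, threshold, adjdamp);
  sum += 4 * constrain(in[-o1] - x, threshold, adjdamp);
  sum += constrain(in[o2] - x, threshold, adjdamp);
  sum += constrain(in[-o2] - x, threshold, adjdamp);
  return (uint16_t)(x + ((8 + sum) >> 4));
}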
     263             : 
     264           0 : void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
     265             :                                                const uint16_t *in,
     266             :                                                int threshold, int dir,
     267             :                                                int damping) {
     268             :   int i;
     269             :   v128 sum, p0, p1, row, res;
     270           0 :   int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
     271           0 :   int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
     272           0 :   int o3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
     273             : 
     274           0 :   if (threshold) damping -= get_msb(threshold);
     275           0 :   for (i = 0; i < 8; i++) {
     276           0 :     sum = v128_zero();
     277           0 :     row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]);
     278             : 
     279             :     // p0 = constrain16(in[i*OD_FILT_BSTRIDE + o1], row, threshold, damping)
     280           0 :     p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]);
     281           0 :     p0 = constrain16(p0, row, threshold, damping);
     282             : 
     283             :     // p1 = constrain16(in[i*OD_FILT_BSTRIDE - o1], row, threshold, damping)
     284           0 :     p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]);
     285           0 :     p1 = constrain16(p1, row, threshold, damping);
     286             : 
     287             :     // sum += 3 * (p0 + p1)
     288           0 :     p0 = v128_add_16(p0, p1);
     289           0 :     p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
     290           0 :     sum = v128_add_16(sum, p0);
     291             : 
     292             :     // p0 = constrain16(in[i*OD_FILT_BSTRIDE + o2], row, threshold, damping)
     293           0 :     p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]);
     294           0 :     p0 = constrain16(p0, row, threshold, damping);
     295             : 
     296             :     // p1 = constrain16(in[i*OD_FILT_BSTRIDE - o2], row, threshold, damping)
     297           0 :     p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]);
     298           0 :     p1 = constrain16(p1, row, threshold, damping);
     299             : 
     300             :     // sum += 2 * (p0 + p1)
     301           0 :     p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
     302           0 :     sum = v128_add_16(sum, p0);
     303             : 
     304             :     // p0 = constrain16(in[i*OD_FILT_BSTRIDE + o3], row, threshold, damping)
     305           0 :     p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o3]);
     306           0 :     p0 = constrain16(p0, row, threshold, damping);
     307             : 
     308             :     // p1 = constrain16(in[i*OD_FILT_BSTRIDE - o3], row, threshold, damping)
     309           0 :     p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o3]);
     310           0 :     p1 = constrain16(p1, row, threshold, damping);
     311             : 
     312             :     // sum += (p0 + p1)
     313           0 :     p0 = v128_add_16(p0, p1);
     314           0 :     sum = v128_add_16(sum, p0);
     315             : 
     316             :     // res = row + ((sum + 8) >> 4)
     317           0 :     res = v128_add_16(sum, v128_dup_16(8));
     318           0 :     res = v128_shr_n_s16(res, 4);
     319           0 :     res = v128_add_16(row, res);
     320           0 :     v128_store_unaligned(&y[i * ystride], res);
     321             :   }
     322           0 : }
     323             : 
     324           0 : void SIMD_FUNC(copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride,
     325             :                                        const uint16_t *src, int sstride) {
     326             :   int i;
     327           0 :   for (i = 0; i < 8; i++) {
     328           0 :     v128 row = v128_load_unaligned(&src[i * sstride]);
     329           0 :     row = v128_pack_s16_u8(row, row);
     330           0 :     v64_store_unaligned(&dst[i * dstride], v128_low_v64(row));
     331             :   }
     332           0 : }
     333             : 
     334           0 : void SIMD_FUNC(copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride,
     335             :                                        const uint16_t *src, int sstride) {
     336             :   int i;
     337           0 :   for (i = 0; i < 4; i++) {
     338           0 :     v128 row = v128_load_unaligned(&src[i * sstride]);
     339           0 :     row = v128_pack_s16_u8(row, row);
     340           0 :     u32_store_unaligned(&dst[i * dstride], v128_low_u32(row));
     341             :   }
     342           0 : }
     343             : 
     344           0 : void SIMD_FUNC(copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride,
     345             :                                         const uint16_t *src, int sstride) {
     346             :   int i;
     347           0 :   for (i = 0; i < 8; i++) {
     348           0 :     v128 row = v128_load_unaligned(&src[i * sstride]);
     349           0 :     v128_store_unaligned(&dst[i * dstride], row);
     350             :   }
     351           0 : }
     352             : 
     353           0 : void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride,
     354             :                                         const uint16_t *src, int sstride) {
     355             :   int i;
     356           0 :   for (i = 0; i < 4; i++) {
     357           0 :     v64 row = v64_load_unaligned(&src[i * sstride]);
     358           0 :     v64_store_unaligned(&dst[i * dstride], row);
     359             :   }
     360           0 : }
     361             : 
     362           0 : void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
     363             :                                          const uint8_t *src, int sstride, int v,
     364             :                                          int h) {
     365             :   int i, j;
     366           0 :   for (i = 0; i < v; i++) {
     367           0 :     for (j = 0; j < (h & ~0x7); j += 8) {
     368           0 :       v64 row = v64_load_unaligned(&src[i * sstride + j]);
     369           0 :       v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
     370             :     }
     371           0 :     for (; j < h; j++) {
     372           0 :       dst[i * dstride + j] = src[i * sstride + j];
     373             :     }
     374             :   }
     375           0 : }
     376             : 
     377           0 : void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
     378             :                                           const uint16_t *src, int sstride,
     379             :                                           int v, int h) {
     380             :   int i, j;
     381           0 :   for (i = 0; i < v; i++) {
     382           0 :     for (j = 0; j < (h & ~0x7); j += 8) {
     383           0 :       v128 row = v128_load_unaligned(&src[i * sstride + j]);
     384           0 :       v128_store_unaligned(&dst[i * dstride + j], row);
     385             :     }
     386           0 :     for (; j < h; j++) {
     387           0 :       dst[i * dstride + j] = src[i * sstride + j];
     388             :     }
     389             :   }
     390           0 : }

Generated by: LCOV version 1.13