LCOV - output.info - gfx/skia/skia/src/opts/SkBitmapFilter

LCOV - code coverage report

Current view:	top level - gfx/skia/skia/src/opts - SkBitmapFilter_opts.h (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	188	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	5	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  * Copyright 2016 Google Inc.
       3             :  *
       4             :  * Use of this source code is governed by a BSD-style license that can be
       5             :  * found in the LICENSE file.
       6             :  */
       7             : 
       8             : #ifndef SkBitmapFilter_opts_DEFINED
       9             : #define SkBitmapFilter_opts_DEFINED
      10             : 
      11             : #include "SkConvolver.h"
      12             : 
      13             : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
      14             :     #include <immintrin.h>
      15             : #elif defined(SK_ARM_HAS_NEON)
      16             :     #include <arm_neon.h>
      17             : #endif
      18             : 
      19             : namespace SK_OPTS_NS {
      20             : 
      21             : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
      22             : 
      23             :     static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
      24             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum, int r) {
      25           0 :         int remainder[4] = {0};
      26           0 :         for (int i = 0; i < r; i++) {
      27           0 :             SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
      28           0 :             remainder[0] += coeff * pixelsLeft[i * 4 + 0];
      29           0 :             remainder[1] += coeff * pixelsLeft[i * 4 + 1];
      30           0 :             remainder[2] += coeff * pixelsLeft[i * 4 + 2];
      31           0 :             remainder[3] += coeff * pixelsLeft[i * 4 + 3];
      32             :         }
      33           0 :         __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
      34           0 :         accum = _mm_add_epi32(accum, t);
      35             :     }
      36             : 
      37             :     // Convolves horizontally along a single row. The row data is given in
      38             :     // |srcData| and continues for the numValues() of the filter.
      39           0 :     void convolve_horizontally(const unsigned char* srcData,
      40             :                                const SkConvolutionFilter1D& filter,
      41             :                                unsigned char* outRow,
      42             :                                bool /*hasAlpha*/) {
      43             :         // Output one pixel each iteration, calculating all channels (RGBA) together.
      44           0 :         int numValues = filter.numValues();
      45           0 :         for (int outX = 0; outX < numValues; outX++) {
      46             :             // Get the filter that determines the current output pixel.
      47             :             int filterOffset, filterLength;
      48             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
      49           0 :                 filter.FilterForValue(outX, &filterOffset, &filterLength);
      50             : 
      51             :             // Compute the first pixel in this row that the filter affects. It will
      52             :             // touch |filterLength| pixels (4 bytes each) after this.
      53           0 :             const unsigned char* rowToFilter = &srcData[filterOffset * 4];
      54             : 
      55           0 :             __m128i zero = _mm_setzero_si128();
      56           0 :             __m128i accum = _mm_setzero_si128();
      57             : 
      58             :             // We will load and accumulate with four coefficients per iteration.
      59           0 :             for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
      60             :                 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
      61             :                 __m128i coeff, coeff16;
      62             :                 // [16] xx xx xx xx c3 c2 c1 c0
      63           0 :                 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
      64             :                 // [16] xx xx xx xx c1 c1 c0 c0
      65           0 :                 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      66             :                 // [16] c1 c1 c1 c1 c0 c0 c0 c0
      67           0 :                 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      68             : 
      69             :                 // Load four pixels => unpack the first two pixels to 16 bits =>
      70             :                 // multiply with coefficients => accumulate the convolution result.
      71             :                 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      72           0 :                 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter));
      73             :                 // [16] a1 b1 g1 r1 a0 b0 g0 r0
      74           0 :                 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      75           0 :                 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      76           0 :                 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      77             :                 // [32]  a0*c0 b0*c0 g0*c0 r0*c0
      78           0 :                 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      79           0 :                 accum = _mm_add_epi32(accum, t);
      80             :                 // [32]  a1*c1 b1*c1 g1*c1 r1*c1
      81           0 :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      82           0 :                 accum = _mm_add_epi32(accum, t);
      83             : 
      84             :                 // Duplicate 3rd and 4th coefficients for all channels =>
      85             :                 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
      86             :                 // => accumulate the convolution results.
      87             :                 // [16] xx xx xx xx c3 c3 c2 c2
      88           0 :                 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      89             :                 // [16] c3 c3 c3 c3 c2 c2 c2 c2
      90           0 :                 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      91             :                 // [16] a3 g3 b3 r3 a2 g2 b2 r2
      92           0 :                 src16 = _mm_unpackhi_epi8(src8, zero);
      93           0 :                 mul_hi = _mm_mulhi_epi16(src16, coeff16);
      94           0 :                 mul_lo = _mm_mullo_epi16(src16, coeff16);
      95             :                 // [32]  a2*c2 b2*c2 g2*c2 r2*c2
      96           0 :                 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      97           0 :                 accum = _mm_add_epi32(accum, t);
      98             :                 // [32]  a3*c3 b3*c3 g3*c3 r3*c3
      99           0 :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     100           0 :                 accum = _mm_add_epi32(accum, t);
     101             : 
     102             :                 // Advance the pixel and coefficients pointers.
     103           0 :                 rowToFilter += 16;
     104           0 :                 filterValues += 4;
     105             :             }
     106             : 
     107             :             // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
     108             :             // coefficients one at a time.
     109           0 :             int r = filterLength & 3;
     110           0 :             if (r) {
     111           0 :                 int remainderOffset = (filterOffset + filterLength - r) * 4;
     112           0 :                 AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
     113             :             }
     114             : 
     115             :             // Shift right for fixed point implementation.
     116           0 :             accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
     117             : 
     118             :             // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
     119           0 :             accum = _mm_packs_epi32(accum, zero);
     120             :             // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
     121           0 :             accum = _mm_packus_epi16(accum, zero);
     122             : 
     123             :             // Store the pixel value of 32 bits.
     124           0 :             *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
     125           0 :             outRow += 4;
     126             :         }
     127           0 :     }
     128             : 
     129             :     // Convolves horizontally along four rows. The row data is given in
     130             :     // |srcData| and continues for the numValues() of the filter.
     131             :     // The algorithm is almost same as |convolve_horizontally|. Please
     132             :     // refer to that function for detailed comments.
     133           0 :     void convolve_4_rows_horizontally(const unsigned char* srcData[4],
     134             :                                       const SkConvolutionFilter1D& filter,
     135             :                                       unsigned char* outRow[4],
     136             :                                       size_t outRowBytes) {
     137           0 :         SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];)
     138             : 
     139             :         // Output one pixel each iteration, calculating all channels (RGBA) together.
     140           0 :         int numValues = filter.numValues();
     141           0 :         for (int outX = 0; outX < numValues; outX++) {
     142             :             int filterOffset, filterLength;
     143             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
     144           0 :                 filter.FilterForValue(outX, &filterOffset, &filterLength);
     145             : 
     146           0 :             __m128i zero = _mm_setzero_si128();
     147             : 
     148             :             // four pixels in a column per iteration.
     149           0 :             __m128i accum0 = _mm_setzero_si128();
     150           0 :             __m128i accum1 = _mm_setzero_si128();
     151           0 :             __m128i accum2 = _mm_setzero_si128();
     152           0 :             __m128i accum3 = _mm_setzero_si128();
     153             : 
     154           0 :             int start = filterOffset * 4;
     155             :             // We will load and accumulate with four coefficients per iteration.
     156           0 :             for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
     157             :                 __m128i coeff, coeff16lo, coeff16hi;
     158             :                 // [16] xx xx xx xx c3 c2 c1 c0
     159           0 :                 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
     160             :                 // [16] xx xx xx xx c1 c1 c0 c0
     161           0 :                 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
     162             :                 // [16] c1 c1 c1 c1 c0 c0 c0 c0
     163           0 :                 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
     164             :                 // [16] xx xx xx xx c3 c3 c2 c2
     165           0 :                 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
     166             :                 // [16] c3 c3 c3 c3 c2 c2 c2 c2
     167           0 :                 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
     168             : 
     169             :                 __m128i src8, src16, mul_hi, mul_lo, t;
     170             : 
     171             : #define ITERATION(src, accum)                                                    \
     172             :                 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
     173             :                 src16 = _mm_unpacklo_epi8(src8, zero);                           \
     174             :                 mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
     175             :                 mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
     176             :                 t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
     177             :                 accum = _mm_add_epi32(accum, t);                                 \
     178             :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
     179             :                 accum = _mm_add_epi32(accum, t);                                 \
     180             :                 src16 = _mm_unpackhi_epi8(src8, zero);                           \
     181             :                 mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
     182             :                 mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
     183             :                 t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
     184             :                 accum = _mm_add_epi32(accum, t);                                 \
     185             :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
     186             :                 accum = _mm_add_epi32(accum, t)
     187             : 
     188           0 :                 ITERATION(srcData[0] + start, accum0);
     189           0 :                 ITERATION(srcData[1] + start, accum1);
     190           0 :                 ITERATION(srcData[2] + start, accum2);
     191           0 :                 ITERATION(srcData[3] + start, accum3);
     192             : 
     193           0 :                 start += 16;
     194           0 :                 filterValues += 4;
     195             :             }
     196             : 
     197           0 :             int r = filterLength & 3;
     198           0 :             if (r) {
     199           0 :                 int remainderOffset = (filterOffset + filterLength - r) * 4;
     200           0 :                 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum0, r);
     201           0 :                 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum1, r);
     202           0 :                 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum2, r);
     203           0 :                 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum3, r);
     204             :             }
     205             : 
     206           0 :             accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
     207           0 :             accum0 = _mm_packs_epi32(accum0, zero);
     208           0 :             accum0 = _mm_packus_epi16(accum0, zero);
     209           0 :             accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
     210           0 :             accum1 = _mm_packs_epi32(accum1, zero);
     211           0 :             accum1 = _mm_packus_epi16(accum1, zero);
     212           0 :             accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
     213           0 :             accum2 = _mm_packs_epi32(accum2, zero);
     214           0 :             accum2 = _mm_packus_epi16(accum2, zero);
     215           0 :             accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
     216           0 :             accum3 = _mm_packs_epi32(accum3, zero);
     217           0 :             accum3 = _mm_packus_epi16(accum3, zero);
     218             : 
     219             :             // We seem to be running off the edge here (chromium:491660).
     220           0 :             SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes);
     221             : 
     222           0 :             *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0);
     223           0 :             *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1);
     224           0 :             *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2);
     225           0 :             *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3);
     226             : 
     227           0 :             outRow[0] += 4;
     228           0 :             outRow[1] += 4;
     229           0 :             outRow[2] += 4;
     230           0 :             outRow[3] += 4;
     231             :         }
     232           0 :     }
     233             : 
     234             :     // Does vertical convolution to produce one output row. The filter values and
     235             :     // length are given in the first two parameters. These are applied to each
     236             :     // of the rows pointed to in the |sourceDataRows| array, with each row
     237             :     // being |pixelWidth| wide.
     238             :     //
     239             :     // The output must have room for |pixelWidth * 4| bytes.
     240             :     template<bool hasAlpha>
     241           0 :     void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
     242             :                             int filterLength,
     243             :                             unsigned char* const* sourceDataRows,
     244             :                             int pixelWidth,
     245             :                             unsigned char* outRow) {
     246             :         // Output four pixels per iteration (16 bytes).
     247           0 :         int width = pixelWidth & ~3;
     248           0 :         __m128i zero = _mm_setzero_si128();
     249           0 :         for (int outX = 0; outX < width; outX += 4) {
     250             :             // Accumulated result for each pixel. 32 bits per RGBA channel.
     251           0 :             __m128i accum0 = _mm_setzero_si128();
     252           0 :             __m128i accum1 = _mm_setzero_si128();
     253           0 :             __m128i accum2 = _mm_setzero_si128();
     254           0 :             __m128i accum3 = _mm_setzero_si128();
     255             : 
     256             :             // Convolve with one filter coefficient per iteration.
     257           0 :             for (int filterY = 0; filterY < filterLength; filterY++) {
     258             : 
     259             :                 // Duplicate the filter coefficient 8 times.
     260             :                 // [16] cj cj cj cj cj cj cj cj
     261           0 :                 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
     262             : 
     263             :                 // Load four pixels (16 bytes) together.
     264             :                 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     265             :                 const __m128i* src = reinterpret_cast<const __m128i*>(
     266           0 :                     &sourceDataRows[filterY][outX << 2]);
     267           0 :                 __m128i src8 = _mm_loadu_si128(src);
     268             : 
     269             :                 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
     270             :                 // multiply with current coefficient => accumulate the result.
     271             :                 // [16] a1 b1 g1 r1 a0 b0 g0 r0
     272           0 :                 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
     273           0 :                 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
     274           0 :                 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
     275             :                 // [32] a0 b0 g0 r0
     276           0 :                 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     277           0 :                 accum0 = _mm_add_epi32(accum0, t);
     278             :                 // [32] a1 b1 g1 r1
     279           0 :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     280           0 :                 accum1 = _mm_add_epi32(accum1, t);
     281             : 
     282             :                 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
     283             :                 // multiply with current coefficient => accumulate the result.
     284             :                 // [16] a3 b3 g3 r3 a2 b2 g2 r2
     285           0 :                 src16 = _mm_unpackhi_epi8(src8, zero);
     286           0 :                 mul_hi = _mm_mulhi_epi16(src16, coeff16);
     287           0 :                 mul_lo = _mm_mullo_epi16(src16, coeff16);
     288             :                 // [32] a2 b2 g2 r2
     289           0 :                 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     290           0 :                 accum2 = _mm_add_epi32(accum2, t);
     291             :                 // [32] a3 b3 g3 r3
     292           0 :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     293           0 :                 accum3 = _mm_add_epi32(accum3, t);
     294             :             }
     295             : 
     296             :             // Shift right for fixed point implementation.
     297           0 :             accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
     298           0 :             accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
     299           0 :             accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
     300           0 :             accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
     301             : 
     302             :             // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
     303             :             // [16] a1 b1 g1 r1 a0 b0 g0 r0
     304           0 :             accum0 = _mm_packs_epi32(accum0, accum1);
     305             :             // [16] a3 b3 g3 r3 a2 b2 g2 r2
     306           0 :             accum2 = _mm_packs_epi32(accum2, accum3);
     307             : 
     308             :             // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
     309             :             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     310           0 :             accum0 = _mm_packus_epi16(accum0, accum2);
     311             : 
     312             :             if (hasAlpha) {
     313             :                 // Compute the max(ri, gi, bi) for each pixel.
     314             :                 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
     315           0 :                 __m128i a = _mm_srli_epi32(accum0, 8);
     316             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     317           0 :                 __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
     318             :                 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
     319           0 :                 a = _mm_srli_epi32(accum0, 16);
     320             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     321           0 :                 b = _mm_max_epu8(a, b);  // Max of r and g and b.
     322             :                 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
     323           0 :                 b = _mm_slli_epi32(b, 24);
     324             : 
     325             :                 // Make sure the value of alpha channel is always larger than maximum
     326             :                 // value of color channels.
     327           0 :                 accum0 = _mm_max_epu8(b, accum0);
     328             :             } else {
     329             :                 // Set value of alpha channels to 0xFF.
     330           0 :                 __m128i mask = _mm_set1_epi32(0xff000000);
     331           0 :                 accum0 = _mm_or_si128(accum0, mask);
     332             :             }
     333             : 
     334             :             // Store the convolution result (16 bytes) and advance the pixel pointers.
     335             :             _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
     336           0 :             outRow += 16;
     337             :         }
     338             : 
     339             :         // When the width of the output is not divisible by 4, We need to save one
     340             :         // pixel (4 bytes) each time. And also the fourth pixel is always absent.
     341           0 :         int r = pixelWidth & 3;
     342           0 :         if (r) {
     343           0 :             __m128i accum0 = _mm_setzero_si128();
     344           0 :             __m128i accum1 = _mm_setzero_si128();
     345           0 :             __m128i accum2 = _mm_setzero_si128();
     346           0 :             for (int filterY = 0; filterY < filterLength; ++filterY) {
     347           0 :                 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
     348             :                 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     349             :                 const __m128i* src = reinterpret_cast<const __m128i*>(
     350           0 :                     &sourceDataRows[filterY][width << 2]);
     351           0 :                 __m128i src8 = _mm_loadu_si128(src);
     352             :                 // [16] a1 b1 g1 r1 a0 b0 g0 r0
     353           0 :                 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
     354           0 :                 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
     355           0 :                 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
     356             :                 // [32] a0 b0 g0 r0
     357           0 :                 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     358           0 :                 accum0 = _mm_add_epi32(accum0, t);
     359             :                 // [32] a1 b1 g1 r1
     360           0 :                 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     361           0 :                 accum1 = _mm_add_epi32(accum1, t);
     362             :                 // [16] a3 b3 g3 r3 a2 b2 g2 r2
     363           0 :                 src16 = _mm_unpackhi_epi8(src8, zero);
     364           0 :                 mul_hi = _mm_mulhi_epi16(src16, coeff16);
     365           0 :                 mul_lo = _mm_mullo_epi16(src16, coeff16);
     366             :                 // [32] a2 b2 g2 r2
     367           0 :                 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     368           0 :                 accum2 = _mm_add_epi32(accum2, t);
     369             :             }
     370             : 
     371           0 :             accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
     372           0 :             accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
     373           0 :             accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
     374             :             // [16] a1 b1 g1 r1 a0 b0 g0 r0
     375           0 :             accum0 = _mm_packs_epi32(accum0, accum1);
     376             :             // [16] a3 b3 g3 r3 a2 b2 g2 r2
     377           0 :             accum2 = _mm_packs_epi32(accum2, zero);
     378             :             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     379           0 :             accum0 = _mm_packus_epi16(accum0, accum2);
     380             :             if (hasAlpha) {
     381             :                 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
     382           0 :                 __m128i a = _mm_srli_epi32(accum0, 8);
     383             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     384           0 :                 __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
     385             :                 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
     386           0 :                 a = _mm_srli_epi32(accum0, 16);
     387             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     388           0 :                 b = _mm_max_epu8(a, b);  // Max of r and g and b.
     389             :                 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
     390           0 :                 b = _mm_slli_epi32(b, 24);
     391           0 :                 accum0 = _mm_max_epu8(b, accum0);
     392             :             } else {
     393           0 :                 __m128i mask = _mm_set1_epi32(0xff000000);
     394           0 :                 accum0 = _mm_or_si128(accum0, mask);
     395             :             }
     396             : 
     397           0 :             for (int i = 0; i < r; i++) {
     398           0 :                 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
     399           0 :                 accum0 = _mm_srli_si128(accum0, 4);
     400           0 :                 outRow += 4;
     401             :             }
     402             :         }
     403           0 :     }
     404             : 
     405             : #elif defined(SK_ARM_HAS_NEON)
     406             : 
     407             :     static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
     408             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
     409             :         int remainder[4] = {0};
     410             :         for (int i = 0; i < r; i++) {
     411             :             SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
     412             :             remainder[0] += coeff * pixelsLeft[i * 4 + 0];
     413             :             remainder[1] += coeff * pixelsLeft[i * 4 + 1];
     414             :             remainder[2] += coeff * pixelsLeft[i * 4 + 2];
     415             :             remainder[3] += coeff * pixelsLeft[i * 4 + 3];
     416             :         }
     417             :         int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]};
     418             :         accum += t;
     419             :     }
     420             : 
     421             :     // Convolves horizontally along a single row. The row data is given in
     422             :     // |srcData| and continues for the numValues() of the filter.
                           //
                           // This is the NEON implementation. For each of the filter's numValues()
                           // output pixels it applies that pixel's 1-D filter to |srcData| and writes
                           // one 4-byte pixel to |outRow|. The bool parameter (hasAlpha) is unnamed
                           // and ignored: all four bytes of every pixel are always convolved.
     423             :     void convolve_horizontally(const unsigned char* srcData,
     424             :                                const SkConvolutionFilter1D& filter,
     425             :                                unsigned char* outRow,
     426             :                                bool /*hasAlpha*/) {
     427             :         // Loop over each pixel on this row in the output image.
     428             :         int numValues = filter.numValues();
     429             :         for (int outX = 0; outX < numValues; outX++) {
                                   // Byte-index tables for vtbl1_u8. Mask N repeats the byte indices
                                   // {2N, 2N+1}, so a lookup copies 16-bit coefficient N into all four
                                   // 16-bit lanes of the result.
     430             :             uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
     431             :             uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
     432             :             uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
     433             :             uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
     434             :             // Get the filter that determines the current output pixel.
     435             :             int filterOffset, filterLength;
     436             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
     437             :                 filter.FilterForValue(outX, &filterOffset, &filterLength);
     438             : 
     439             :             // Compute the first pixel in this row that the filter affects. It will
     440             :             // touch |filterLength| pixels (4 bytes each) after this.
     441             :             const unsigned char* rowToFilter = &srcData[filterOffset * 4];
     442             : 
     443             :             // Apply the filter to the row to get the destination pixel in |accum|.
                                   // The vector loop below runs filterLength >> 2 times (>> binds tighter
                                   // than <) and consumes four coefficients and four pixels per pass.
     444             :             int32x4_t accum = vdupq_n_s32(0);
     445             :             for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
     446             :                 // Load 4 coefficients
     447             :                 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
     448             :                 coeffs = vld1_s16(filterValues);
                                       // coeffN = coefficient N broadcast to all four lanes.
     449             :                 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
     450             :                 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
     451             :                 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
     452             :                 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
     453             : 
     454             :                 // Load pixels and calc
                                       // 16 bytes = four pixels; each byte is widened to 16 bits.
     455             :                 uint8x16_t pixels = vld1q_u8(rowToFilter);
     456             :                 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
     457             :                 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
     458             : 
                                       // Split into one 4-lane vector per pixel.
     459             :                 int16x4_t p0_src = vget_low_s16(p01_16);
     460             :                 int16x4_t p1_src = vget_high_s16(p01_16);
     461             :                 int16x4_t p2_src = vget_low_s16(p23_16);
     462             :                 int16x4_t p3_src = vget_high_s16(p23_16);
     463             : 
                                       // pN = pixel N's four bytes times coefficient N, as 32-bit products.
     464             :                 int32x4_t p0 = vmull_s16(p0_src, coeff0);
     465             :                 int32x4_t p1 = vmull_s16(p1_src, coeff1);
     466             :                 int32x4_t p2 = vmull_s16(p2_src, coeff2);
     467             :                 int32x4_t p3 = vmull_s16(p3_src, coeff3);
     468             : 
     469             :                 accum += p0;
     470             :                 accum += p1;
     471             :                 accum += p2;
     472             :                 accum += p3;
     473             : 
     474             :                 // Advance the pointers
     475             :                 rowToFilter += 16;
     476             :                 filterValues += 4;
     477             :             }
     478             : 
                                   // Handle the 1-3 coefficients left over after the groups of four.
     479             :             int r = filterLength & 3;
     480             :             if (r) {
                                       // Byte offset of the first unprocessed pixel; this is the same
                                       // address that |rowToFilter| has been advanced to by the loop above.
     481             :                 int remainder_offset = (filterOffset + filterLength - r) * 4;
     482             :                 AccumRemainder(srcData + remainder_offset, filterValues, accum, r);
     483             :             }
     484             : 
     485             :             // Bring this value back in range. All of the filter scaling factors
     486             :             // are in fixed point with kShiftBits bits of fractional part.
     487             :             accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
     488             : 
     489             :             // Pack and store the new pixel.
                                   // Saturating narrow 32 -> 16 bits (signed), then 16 -> 8 bits (unsigned).
                                   // Both halves of |accum8| hold the same pixel; only 32-bit lane 0
                                   // (one 4-byte pixel) is written.
     490             :             int16x4_t accum16 = vqmovn_s32(accum);
     491             :             uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
     492             :             vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
     493             :             outRow += 4;
     494             :         }
     495             :     }
     496             : 
     497             :     // Convolves horizontally along four rows. The row data is given in
     498             :     // |srcData| and continues for the numValues() of the filter.
     499             :     // The algorithm is almost the same as |convolve_horizontally|. Please
     500             :     // refer to that function for detailed comments.
                           //
                           // The same filter is applied to srcData[0..3] and one 4-byte pixel per
                           // row is written to outRow[0..3] for each of the numValues() outputs.
                           // Side effect: the four pointers stored in |outRow| are themselves
                           // advanced by 4 bytes per output pixel, so the caller's array is modified.
                           // |outRowBytes| is not used in this function.
     501             :     void convolve_4_rows_horizontally(const unsigned char* srcData[4],
     502             :                                       const SkConvolutionFilter1D& filter,
     503             :                                       unsigned char* outRow[4],
     504             :                                       size_t outRowBytes) {
     505             :         // Output one pixel each iteration, calculating all channels (RGBA) together.
     506             :         int numValues = filter.numValues();
     507             :         for (int outX = 0; outX < numValues; outX++) {
     508             : 
     509             :             int filterOffset, filterLength;
     510             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
     511             :             filter.FilterForValue(outX, &filterOffset, &filterLength);
     512             : 
     513             :             // four pixels in a column per iteration.
                                   // accumN is the running 32-bit per-channel sum for row N.
     514             :             int32x4_t accum0 = vdupq_n_s32(0);
     515             :             int32x4_t accum1 = vdupq_n_s32(0);
     516             :             int32x4_t accum2 = vdupq_n_s32(0);
     517             :             int32x4_t accum3 = vdupq_n_s32(0);
     518             : 
                                   // Byte-index tables for vtbl1_u8: mask N selects bytes {2N, 2N+1}
                                   // repeatedly, broadcasting 16-bit coefficient N to all four lanes.
     519             :             uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
     520             :             uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
     521             :             uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
     522             :             uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
     523             : 
                                   // Byte offset, shared by all four rows, of the next pixel to read.
     524             :             int start = filterOffset * 4;
     525             : 
     526             :             // We will load and accumulate with four coefficients per iteration.
     527             :             for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
     528             :                 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
     529             : 
     530             :                 coeffs = vld1_s16(filterValues);
     531             :                 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
     532             :                 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
     533             :                 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
     534             :                 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
     535             : 
                                       // Temporaries shared by every expansion of ITERATION below.
     536             :                 uint8x16_t pixels;
     537             :                 int16x8_t p01_16, p23_16;
     538             :                 int32x4_t p0, p1, p2, p3;
     539             : 
                                       // ITERATION(src, accum): loads 16 bytes (four pixels) from |src|,
                                       // widens them to 16 bits, multiplies pixel N by coefficient N and
                                       // adds the four 32-bit products into |accum|.
                                       // The macro is not #undef'd in this function.
     540             : #define ITERATION(src, accum)                                                   \
     541             :                 pixels = vld1q_u8(src);                                         \
     542             :                 p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \
     543             :                 p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
     544             :                 p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \
     545             :                 p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \
     546             :                 p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \
     547             :                 p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \
     548             :                 accum += p0;                                                    \
     549             :                 accum += p1;                                                    \
     550             :                 accum += p2;                                                    \
     551             :                 accum += p3
     552             : 
     553             :                 ITERATION(srcData[0] + start, accum0);
     554             :                 ITERATION(srcData[1] + start, accum1);
     555             :                 ITERATION(srcData[2] + start, accum2);
     556             :                 ITERATION(srcData[3] + start, accum3);
     557             : 
     558             :                 start += 16;
     559             :                 filterValues += 4;
     560             :             }
     561             : 
                                   // Handle the 1-3 coefficients left over after the groups of four,
                                   // using the scalar helper once per row.
     562             :             int r = filterLength & 3;
     563             :             if (r) {
     564             :                 int remainder_offset = (filterOffset + filterLength - r) * 4;
     565             :                 AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r);
     566             :                 AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r);
     567             :                 AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r);
     568             :                 AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r);
     569             :             }
     570             : 
     571             :             int16x4_t accum16;
     572             :             uint8x8_t res0, res1, res2, res3;
     573             : 
                                   // PACK_RESULT(accum, res): shifts out the kShiftBits fractional bits,
                                   // then saturating-narrows 32 -> 16 (signed) -> 8 (unsigned) bits.
                                   // Both halves of |res| hold the same pixel. The macro body already
                                   // ends in ';', so each use below adds a harmless empty statement.
                                   // The macro is not #undef'd in this function.
     574             : #define PACK_RESULT(accum, res)                                             \
     575             :             accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);  \
     576             :             accum16 = vqmovn_s32(accum);                                    \
     577             :             res = vqmovun_s16(vcombine_s16(accum16, accum16));
     578             : 
     579             :             PACK_RESULT(accum0, res0);
     580             :             PACK_RESULT(accum1, res1);
     581             :             PACK_RESULT(accum2, res2);
     582             :             PACK_RESULT(accum3, res3);
     583             : 
                                   // Write 32-bit lane 0 (one 4-byte pixel) to each output row, then
                                   // advance the four output pointers held in the caller's array.
     584             :             vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
     585             :             vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
     586             :             vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
     587             :             vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
     588             :             outRow[0] += 4;
     589             :             outRow[1] += 4;
     590             :             outRow[2] += 4;
     591             :             outRow[3] += 4;
     592             :         }
     593             :     }
     594             : 
     595             : 
     596             :     // Does vertical convolution to produce one output row. The filter values and
     597             :     // length are given in the first two parameters. These are applied to each
     598             :     // of the rows pointed to in the |sourceDataRows| array, with each row
     599             :     // being |pixelWidth| wide.
     600             :     //
     601             :     // The output must have room for |pixelWidth * 4| bytes.
                           //
                           // This is the NEON implementation. |sourceDataRows| is indexed with
                           // 0 .. filterLength - 1, one row per coefficient. When |hasAlpha| is true
                           // the top byte of each output pixel is raised to at least the maximum of
                           // the other three bytes; otherwise it is forced to 0xFF.
     602             :     template<bool hasAlpha>
     603             :     void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
     604             :                             int filterLength,
     605             :                             unsigned char* const* sourceDataRows,
     606             :                             int pixelWidth,
     607             :                             unsigned char* outRow) {
                               // Largest multiple of four pixels that fits in the row.
     608             :         int width = pixelWidth & ~3;
     609             : 
     610             :         // Output four pixels per iteration (16 bytes).
     611             :         for (int outX = 0; outX < width; outX += 4) {
     612             : 
     613             :             // Accumulated result for each pixel. 32 bits per RGBA channel.
     614             :             int32x4_t accum0 = vdupq_n_s32(0);
     615             :             int32x4_t accum1 = vdupq_n_s32(0);
     616             :             int32x4_t accum2 = vdupq_n_s32(0);
     617             :             int32x4_t accum3 = vdupq_n_s32(0);
     618             : 
     619             :             // Convolve with one filter coefficient per iteration.
     620             :             for (int filterY = 0; filterY < filterLength; filterY++) {
     621             : 
     622             :                 // Duplicate the filter coefficient 4 times.
     623             :                 // [16] cj cj cj cj
     624             :                 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);
     625             : 
     626             :                 // Load four pixels (16 bytes) together.
     627             :                 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     628             :                 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
     629             : 
                                       // Widen each byte to 16 bits and split into one vector per pixel.
     630             :                 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
     631             :                 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
     632             :                 int16x4_t src16_0 = vget_low_s16(src16_01);
     633             :                 int16x4_t src16_1 = vget_high_s16(src16_01);
     634             :                 int16x4_t src16_2 = vget_low_s16(src16_23);
     635             :                 int16x4_t src16_3 = vget_high_s16(src16_23);
     636             : 
                                       // Multiply-accumulate into the 32-bit per-pixel sums.
     637             :                 accum0 += vmull_s16(src16_0, coeff16);
     638             :                 accum1 += vmull_s16(src16_1, coeff16);
     639             :                 accum2 += vmull_s16(src16_2, coeff16);
     640             :                 accum3 += vmull_s16(src16_3, coeff16);
     641             :             }
     642             : 
     643             :             // Shift right for fixed point implementation.
     644             :             accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
     645             :             accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
     646             :             accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
     647             :             accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);
     648             : 
     649             :             // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
     650             :             // [16] a1 b1 g1 r1 a0 b0 g0 r0
     651             :             int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
     652             :             // [16] a3 b3 g3 r3 a2 b2 g2 r2
     653             :             int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));
     654             : 
     655             :             // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
     656             :             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     657             :             uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
     658             : 
     659             :             if (hasAlpha) {
     660             :                 // Compute the max(ri, gi, bi) for each pixel.
     661             :                 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
     662             :                 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
     663             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     664             :                 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
     665             :                 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
     666             :                 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
     667             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     668             :                 b = vmaxq_u8(a, b); // Max of r and g and b.
     669             :                 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
     670             :                 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
     671             : 
     672             :                 // Make sure the value of alpha channel is always larger than maximum
     673             :                 // value of color channels.
                                       // (A byte-wise max, so alpha ends up greater than or equal to it;
                                       // the three color bytes are unchanged because |b| is 0 there.)
     674             :                 accum8 = vmaxq_u8(b, accum8);
     675             :             } else {
     676             :                 // Set value of alpha channels to 0xFF.
     677             :                 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
     678             :             }
     679             : 
     680             :             // Store the convolution result (16 bytes) and advance the pixel pointers.
     681             :             vst1q_u8(outRow, accum8);
     682             :             outRow += 16;
     683             :         }
     684             : 
     685             :         // Process the leftovers when the width of the output is not divisible
     686             :         // by 4, that is at most 3 pixels.
     687             :         int r = pixelWidth & 3;
     688             :         if (r) {
     689             : 
                                   // Only three accumulators are needed: at most three pixels remain.
     690             :             int32x4_t accum0 = vdupq_n_s32(0);
     691             :             int32x4_t accum1 = vdupq_n_s32(0);
     692             :             int32x4_t accum2 = vdupq_n_s32(0);
     693             : 
     694             :             for (int filterY = 0; filterY < filterLength; ++filterY) {
     695             :                 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);
     696             : 
     697             :                 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
                                       // NOTE(review): this still loads a full 16 bytes although only |r|
                                       // (1-3) pixels remain, i.e. it reads 4 to 12 bytes beyond pixel
                                       // |pixelWidth - 1| of each source row. Presumably the row buffers
                                       // are padded for this - confirm against the caller.
     698             :                 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
     699             : 
     700             :                 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
     701             :                 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
     702             :                 int16x4_t src16_0 = vget_low_s16(src16_01);
     703             :                 int16x4_t src16_1 = vget_high_s16(src16_01);
     704             :                 int16x4_t src16_2 = vget_low_s16(src16_23);
     705             : 
     706             :                 accum0 += vmull_s16(src16_0, coeff16);
     707             :                 accum1 += vmull_s16(src16_1, coeff16);
     708             :                 accum2 += vmull_s16(src16_2, coeff16);
     709             :             }
     710             : 
     711             :             accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
     712             :             accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
     713             :             accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
     714             : 
                                   // |accum2| is used twice only to fill the upper half of the vector;
                                   // that fourth pixel slot is never stored below.
     715             :             int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
     716             :             int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));
     717             : 
     718             :             uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
     719             : 
     720             :             if (hasAlpha) {
     721             :                 // Compute the max(ri, gi, bi) for each pixel.
     722             :                 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
     723             :                 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
     724             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     725             :                 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
     726             :                 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
     727             :                 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
     728             :                 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
     729             :                 b = vmaxq_u8(a, b); // Max of r and g and b.
     730             :                 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
     731             :                 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
     732             : 
     733             :                 // Make sure the value of alpha channel is always larger than maximum
     734             :                 // value of color channels.
     735             :                 accum8 = vmaxq_u8(b, accum8);
     736             :             } else {
     737             :                 // Set value of alpha channels to 0xFF.
     738             :                 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
     739             :             }
     740             : 
                                   // Store only the |r| valid pixels (4 bytes each): one 32-bit lane,
                                   // the low 8 bytes, or the low 8 bytes plus 32-bit lane 2.
     741             :             switch(r) {
     742             :             case 1:
     743             :                 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0);
     744             :                 break;
     745             :             case 2:
     746             :                 vst1_u32(reinterpret_cast<uint32_t*>(outRow),
     747             :                          vreinterpret_u32_u8(vget_low_u8(accum8)));
     748             :                 break;
     749             :             case 3:
     750             :                 vst1_u32(reinterpret_cast<uint32_t*>(outRow),
     751             :                          vreinterpret_u32_u8(vget_low_u8(accum8)));
     752             :                 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2);
     753             :                 break;
     754             :             }
     755             :         }
     756             :     }
     757             : 
     758             : #else
     759             : 
     760             :     // Converts the argument to an 8-bit unsigned value by clamping to the range
     761             :     // 0-255.
                           // Returns |a| unchanged when it is already in 0..255, 0 when it is
                           // negative and 255 when it is larger than 255.
     762             :     inline unsigned char ClampTo8(int a) {
                               // Converting to unsigned turns every negative value into a number far
                               // above 255, so this single comparison accepts exactly 0..255.
     763             :         if (static_cast<unsigned>(a) < 256) {
     764             :             return a;  // Avoid the extra check in the common case.
     765             :         }
     766             :         if (a < 0) {
     767             :             return 0;
     768             :         }
     769             :         return 255;
     770             :     }
     771             : 
     772             :     // Convolves horizontally along a single row. The row data is given in
     773             :     // |srcData| and continues for the numValues() of the filter.
                           //
                           // Portable (non-SIMD) implementation. Writes numValues() pixels of 4 bytes
                           // each to |outRow|. When |hasAlpha| is false the fourth byte of each pixel
                           // is neither computed nor written, so |outRow| keeps whatever it held there.
     774             :     template<bool hasAlpha>
     775             :     void ConvolveHorizontally(const unsigned char* srcData,
     776             :                               const SkConvolutionFilter1D& filter,
     777             :                               unsigned char* outRow) {
     778             :         // Loop over each pixel on this row in the output image.
     779             :         int numValues = filter.numValues();
     780             :         for (int outX = 0; outX < numValues; outX++) {
     781             :             // Get the filter that determines the current output pixel.
     782             :             int filterOffset, filterLength;
     783             :             const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
     784             :                 filter.FilterForValue(outX, &filterOffset, &filterLength);
     785             : 
     786             :             // Compute the first pixel in this row that the filter affects. It will
     787             :             // touch |filterLength| pixels (4 bytes each) after this.
     788             :             const unsigned char* rowToFilter = &srcData[filterOffset * 4];
     789             : 
     790             :             // Apply the filter to the row to get the destination pixel in |accum|.
                                   // accum[c] is the weighted sum of byte c over the filtered pixels.
     791             :             int accum[4] = {0};
     792             :             for (int filterX = 0; filterX < filterLength; filterX++) {
     793             :                 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX];
     794             :                 accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
     795             :                 accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
     796             :                 accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
     797             :                 if (hasAlpha) {
     798             :                     accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
     799             :                 }
     800             :             }
     801             : 
     802             :             // Bring this value back in range. All of the filter scaling factors
     803             :             // are in fixed point with kShiftBits bits of fractional part.
     804             :             accum[0] >>= SkConvolutionFilter1D::kShiftBits;
     805             :             accum[1] >>= SkConvolutionFilter1D::kShiftBits;
     806             :             accum[2] >>= SkConvolutionFilter1D::kShiftBits;
     807             :             if (hasAlpha) {
     808             :                 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
     809             :             }
     810             : 
     811             :             // Store the new pixel.
                                   // Each sum is clamped to 0..255 before being written.
     812             :             outRow[outX * 4 + 0] = ClampTo8(accum[0]);
     813             :             outRow[outX * 4 + 1] = ClampTo8(accum[1]);
     814             :             outRow[outX * 4 + 2] = ClampTo8(accum[2]);
     815             :             if (hasAlpha) {
     816             :                 outRow[outX * 4 + 3] = ClampTo8(accum[3]);
     817             :             }
     818             :         }
     819             :     }
     820             : 
     821             :     // Does vertical convolution to produce one output row. The filter values and
     822             :     // length are given in the first two parameters. These are applied to each
     823             :     // of the rows pointed to in the |sourceDataRows| array, with each row
     824             :     // being |pixelWidth| wide.
     825             :     //
     826             :     // The output must have room for |pixelWidth * 4| bytes.
                           //
                           // Portable (non-SIMD) implementation. |sourceDataRows| is indexed with
                           // 0 .. filterLength - 1, one row per coefficient. The fourth byte of every
                           // output pixel is always written: the clamped alpha (raised to the largest
                           // color byte if needed) when |hasAlpha| is true, 0xff otherwise.
     827             :     template<bool hasAlpha>
     828             :     void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
     829             :                             int filterLength,
     830             :                             unsigned char* const* sourceDataRows,
     831             :                             int pixelWidth,
     832             :                             unsigned char* outRow) {
     833             :         // We go through each column in the output and do a vertical convolution,
     834             :         // generating one output pixel each time.
     835             :         for (int outX = 0; outX < pixelWidth; outX++) {
     836             :             // Compute the number of bytes over in each row that the current column
     837             :             // we're convolving starts at. The pixel will cover the next 4 bytes.
     838             :             int byteOffset = outX * 4;
     839             : 
     840             :             // Apply the filter to one column of pixels.
     841             :             int accum[4] = {0};
     842             :             for (int filterY = 0; filterY < filterLength; filterY++) {
     843             :                 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY];
     844             :                 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
     845             :                 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
     846             :                 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
     847             :                 if (hasAlpha) {
     848             :                     accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
     849             :                 }
     850             :             }
     851             : 
     852             :             // Bring this value back in range. All of the filter scaling factors
     853             :             // are in fixed point with kShiftBits bits of precision.
     854             :             accum[0] >>= SkConvolutionFilter1D::kShiftBits;
     855             :             accum[1] >>= SkConvolutionFilter1D::kShiftBits;
     856             :             accum[2] >>= SkConvolutionFilter1D::kShiftBits;
     857             :             if (hasAlpha) {
     858             :                 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
     859             :             }
     860             : 
     861             :             // Store the new pixel.
     862             :             outRow[byteOffset + 0] = ClampTo8(accum[0]);
     863             :             outRow[byteOffset + 1] = ClampTo8(accum[1]);
     864             :             outRow[byteOffset + 2] = ClampTo8(accum[2]);
     865             :             if (hasAlpha) {
     866             :                 unsigned char alpha = ClampTo8(accum[3]);
     867             : 
     868             :                 // Make sure the alpha channel doesn't come out smaller than any of the
     869             :                 // color channels. We use premultiplied alpha channels, so this should
     870             :                 // never happen, but rounding errors will cause this from time to time.
     871             :                 // These "impossible" colors will cause overflows (and hence random pixel
     872             :                 // values) when the resulting bitmap is drawn to the screen.
     873             :                 //
     874             :                 // We only need to do this when generating the final output row (here).
                                       // The maximum is taken over the three color bytes just stored
                                       // (already clamped to 0..255).
     875             :                 int maxColorChannel = SkTMax(outRow[byteOffset + 0],
     876             :                                                SkTMax(outRow[byteOffset + 1],
     877             :                                                       outRow[byteOffset + 2]));
     878             :                 if (alpha < maxColorChannel) {
     879             :                     outRow[byteOffset + 3] = maxColorChannel;
     880             :                 } else {
     881             :                     outRow[byteOffset + 3] = alpha;
     882             :                 }
     883             :             } else {
     884             :                 // No alpha channel, the image is opaque.
     885             :                 outRow[byteOffset + 3] = 0xff;
     886             :             }
     887             :         }
     888             :     }
     889             : 
     890             :     // Something here breaks under GCC autovectorization (-ftree-vectorize); see
     891             :     // https://bug.skia.org/2575.  It was first thought to be 32-bit only, but later tests showed
     892             :     // that some 64-bit GCC builds are affected too.  Dropping to -O2 disables -ftree-vectorize,
     893             :     // and GCC 4.6 additionally needs noinline.  The macro below applies both attributes when
     894             :     // SK_HAS_ATTRIBUTE(optimize) holds and SK_RELEASE is defined; otherwise it expands to nothing.
     895             : #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE)
     896             :         #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline))
     897             : #else
     898             :         #define SK_MAYBE_DISABLE_VECTORIZATION
     899             : #endif
     900             : 
     901             :     SK_MAYBE_DISABLE_VECTORIZATION  // Entry point: maps the runtime hasAlpha flag onto ConvolveHorizontally<true/false>.
     902             :     void convolve_horizontally(const unsigned char* srcData,  // passed through unchanged (presumably the source row - confirm)
     903             :                                const SkConvolutionFilter1D& filter,  // passed through unchanged
     904             :                                unsigned char* outRow,  // output row pointer, passed through unchanged
     905             :                                bool hasAlpha) {  // selects which template instantiation runs
     906             :         if (hasAlpha) {  // a template argument must be a compile-time constant, hence the explicit branch
     907             :             ConvolveHorizontally<true>(srcData, filter, outRow);
     908             :         } else {
     909             :             ConvolveHorizontally<false>(srcData, filter, outRow);
     910             :         }
     911             :     }
     912             : #undef SK_MAYBE_DISABLE_VECTORIZATION  // the workaround macro is only meant for the function above
     913             : 
     914             :     void (*convolve_4_rows_horizontally)(const unsigned char* srcData[4],  // a function pointer variable, not a function
     915             :                                          const SkConvolutionFilter1D& filter,
     916             :                                          unsigned char* outRow[4],  // declared with four entries, like srcData
     917             :                                          size_t outRowBytes)
     918             :         = nullptr;  // left null in this preprocessor branch; NOTE(review): callers presumably must null-check - confirm
     919             : 
     920             : 
     921             : #endif
     922             : 
     923           0 :     void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,  // Dispatches on hasAlpha to ConvolveVertically<true/false>.
     924             :                              int filterLength,  // passed through unchanged (presumably the number of filterValues entries - confirm)
     925             :                              unsigned char* const* sourceDataRows,  // array of row pointers, passed through unchanged
     926             :                              int pixelWidth,  // passed through unchanged
     927             :                              unsigned char* outRow,  // output row pointer, passed through unchanged
     928             :                              bool hasAlpha) {  // selects which template instantiation runs
     929           0 :         if (hasAlpha) {  // a template argument must be a compile-time constant, hence the explicit branch
     930             :             ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
     931           0 :                                      pixelWidth, outRow);
     932             :         } else {
     933             :             ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
     934           0 :                                       pixelWidth, outRow);
     935             :         }
     936           0 :     }  // defined after the #endif above, so it is compiled whichever preprocessor branch was taken
     937             : 
     938             : }  // namespace SK_OPTS_NS
     939             : 
     940             : #endif//SkBitmapFilter_opts_DEFINED

Generated by: LCOV version 1.13