Line data Source code
1 : /*
2 : * Copyright 2016 Google Inc.
3 : *
4 : * Use of this source code is governed by a BSD-style license that can be
5 : * found in the LICENSE file.
6 : */
7 :
8 : #ifndef SkBitmapFilter_opts_DEFINED
9 : #define SkBitmapFilter_opts_DEFINED
10 :
11 : #include "SkConvolver.h"
12 :
13 : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
14 : #include <immintrin.h>
15 : #elif defined(SK_ARM_HAS_NEON)
16 : #include <arm_neon.h>
17 : #endif
18 :
19 : namespace SK_OPTS_NS {
20 :
21 : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
22 :
23 : static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
24 : const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum, int r) {
25 0 : int remainder[4] = {0};
26 0 : for (int i = 0; i < r; i++) {
27 0 : SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
28 0 : remainder[0] += coeff * pixelsLeft[i * 4 + 0];
29 0 : remainder[1] += coeff * pixelsLeft[i * 4 + 1];
30 0 : remainder[2] += coeff * pixelsLeft[i * 4 + 2];
31 0 : remainder[3] += coeff * pixelsLeft[i * 4 + 3];
32 : }
33 0 : __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
34 0 : accum = _mm_add_epi32(accum, t);
35 : }
36 :
37 : // Convolves horizontally along a single row. The row data is given in
38 : // |srcData| and continues for the numValues() of the filter.
39 0 : void convolve_horizontally(const unsigned char* srcData,
40 : const SkConvolutionFilter1D& filter,
41 : unsigned char* outRow,
42 : bool /*hasAlpha*/) {
43 : // Output one pixel each iteration, calculating all channels (RGBA) together.
44 0 : int numValues = filter.numValues();
45 0 : for (int outX = 0; outX < numValues; outX++) {
46 : // Get the filter that determines the current output pixel.
47 : int filterOffset, filterLength;
48 : const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
49 0 : filter.FilterForValue(outX, &filterOffset, &filterLength);
50 :
51 : // Compute the first pixel in this row that the filter affects. It will
52 : // touch |filterLength| pixels (4 bytes each) after this.
53 0 : const unsigned char* rowToFilter = &srcData[filterOffset * 4];
54 :
55 0 : __m128i zero = _mm_setzero_si128();
56 0 : __m128i accum = _mm_setzero_si128();
57 :
58 : // We will load and accumulate with four coefficients per iteration.
59 0 : for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
60 : // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
61 : __m128i coeff, coeff16;
62 : // [16] xx xx xx xx c3 c2 c1 c0
63 0 : coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
64 : // [16] xx xx xx xx c1 c1 c0 c0
65 0 : coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
66 : // [16] c1 c1 c1 c1 c0 c0 c0 c0
67 0 : coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
68 :
69 : // Load four pixels => unpack the first two pixels to 16 bits =>
70 : // multiply with coefficients => accumulate the convolution result.
71 : // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
72 0 : __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter));
73 : // [16] a1 b1 g1 r1 a0 b0 g0 r0
74 0 : __m128i src16 = _mm_unpacklo_epi8(src8, zero);
75 0 : __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
76 0 : __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
77 : // [32] a0*c0 b0*c0 g0*c0 r0*c0
78 0 : __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
79 0 : accum = _mm_add_epi32(accum, t);
80 : // [32] a1*c1 b1*c1 g1*c1 r1*c1
81 0 : t = _mm_unpackhi_epi16(mul_lo, mul_hi);
82 0 : accum = _mm_add_epi32(accum, t);
83 :
84 : // Duplicate 3rd and 4th coefficients for all channels =>
85 : // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
86 : // => accumulate the convolution results.
87 : // [16] xx xx xx xx c3 c3 c2 c2
88 0 : coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
89 : // [16] c3 c3 c3 c3 c2 c2 c2 c2
90 0 : coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
91 : // [16] a3 g3 b3 r3 a2 g2 b2 r2
92 0 : src16 = _mm_unpackhi_epi8(src8, zero);
93 0 : mul_hi = _mm_mulhi_epi16(src16, coeff16);
94 0 : mul_lo = _mm_mullo_epi16(src16, coeff16);
95 : // [32] a2*c2 b2*c2 g2*c2 r2*c2
96 0 : t = _mm_unpacklo_epi16(mul_lo, mul_hi);
97 0 : accum = _mm_add_epi32(accum, t);
98 : // [32] a3*c3 b3*c3 g3*c3 r3*c3
99 0 : t = _mm_unpackhi_epi16(mul_lo, mul_hi);
100 0 : accum = _mm_add_epi32(accum, t);
101 :
102 : // Advance the pixel and coefficients pointers.
103 0 : rowToFilter += 16;
104 0 : filterValues += 4;
105 : }
106 :
107 : // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
108 : // coefficients one at a time.
109 0 : int r = filterLength & 3;
110 0 : if (r) {
111 0 : int remainderOffset = (filterOffset + filterLength - r) * 4;
112 0 : AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
113 : }
114 :
115 : // Shift right for fixed point implementation.
116 0 : accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
117 :
118 : // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
119 0 : accum = _mm_packs_epi32(accum, zero);
120 : // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
121 0 : accum = _mm_packus_epi16(accum, zero);
122 :
123 : // Store the pixel value of 32 bits.
124 0 : *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
125 0 : outRow += 4;
126 : }
127 0 : }
128 :
129 : // Convolves horizontally along four rows. The row data is given in
130 : // |srcData| and continues for the numValues() of the filter.
131 : // The algorithm is almost same as |convolve_horizontally|. Please
132 : // refer to that function for detailed comments.
133 0 : void convolve_4_rows_horizontally(const unsigned char* srcData[4],
134 : const SkConvolutionFilter1D& filter,
135 : unsigned char* outRow[4],
136 : size_t outRowBytes) {
137 0 : SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];)
138 :
139 : // Output one pixel each iteration, calculating all channels (RGBA) together.
140 0 : int numValues = filter.numValues();
141 0 : for (int outX = 0; outX < numValues; outX++) {
142 : int filterOffset, filterLength;
143 : const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
144 0 : filter.FilterForValue(outX, &filterOffset, &filterLength);
145 :
146 0 : __m128i zero = _mm_setzero_si128();
147 :
148 : // four pixels in a column per iteration.
149 0 : __m128i accum0 = _mm_setzero_si128();
150 0 : __m128i accum1 = _mm_setzero_si128();
151 0 : __m128i accum2 = _mm_setzero_si128();
152 0 : __m128i accum3 = _mm_setzero_si128();
153 :
154 0 : int start = filterOffset * 4;
155 : // We will load and accumulate with four coefficients per iteration.
156 0 : for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
157 : __m128i coeff, coeff16lo, coeff16hi;
158 : // [16] xx xx xx xx c3 c2 c1 c0
159 0 : coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
160 : // [16] xx xx xx xx c1 c1 c0 c0
161 0 : coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
162 : // [16] c1 c1 c1 c1 c0 c0 c0 c0
163 0 : coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
164 : // [16] xx xx xx xx c3 c3 c2 c2
165 0 : coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
166 : // [16] c3 c3 c3 c3 c2 c2 c2 c2
167 0 : coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
168 :
169 : __m128i src8, src16, mul_hi, mul_lo, t;
170 :
171 : #define ITERATION(src, accum) \
172 : src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
173 : src16 = _mm_unpacklo_epi8(src8, zero); \
174 : mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
175 : mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
176 : t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
177 : accum = _mm_add_epi32(accum, t); \
178 : t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
179 : accum = _mm_add_epi32(accum, t); \
180 : src16 = _mm_unpackhi_epi8(src8, zero); \
181 : mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
182 : mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
183 : t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
184 : accum = _mm_add_epi32(accum, t); \
185 : t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
186 : accum = _mm_add_epi32(accum, t)
187 :
188 0 : ITERATION(srcData[0] + start, accum0);
189 0 : ITERATION(srcData[1] + start, accum1);
190 0 : ITERATION(srcData[2] + start, accum2);
191 0 : ITERATION(srcData[3] + start, accum3);
192 :
193 0 : start += 16;
194 0 : filterValues += 4;
195 : }
196 :
197 0 : int r = filterLength & 3;
198 0 : if (r) {
199 0 : int remainderOffset = (filterOffset + filterLength - r) * 4;
200 0 : AccumRemainder(srcData[0] + remainderOffset, filterValues, accum0, r);
201 0 : AccumRemainder(srcData[1] + remainderOffset, filterValues, accum1, r);
202 0 : AccumRemainder(srcData[2] + remainderOffset, filterValues, accum2, r);
203 0 : AccumRemainder(srcData[3] + remainderOffset, filterValues, accum3, r);
204 : }
205 :
206 0 : accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
207 0 : accum0 = _mm_packs_epi32(accum0, zero);
208 0 : accum0 = _mm_packus_epi16(accum0, zero);
209 0 : accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
210 0 : accum1 = _mm_packs_epi32(accum1, zero);
211 0 : accum1 = _mm_packus_epi16(accum1, zero);
212 0 : accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
213 0 : accum2 = _mm_packs_epi32(accum2, zero);
214 0 : accum2 = _mm_packus_epi16(accum2, zero);
215 0 : accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
216 0 : accum3 = _mm_packs_epi32(accum3, zero);
217 0 : accum3 = _mm_packus_epi16(accum3, zero);
218 :
219 : // We seem to be running off the edge here (chromium:491660).
220 0 : SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes);
221 :
222 0 : *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0);
223 0 : *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1);
224 0 : *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2);
225 0 : *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3);
226 :
227 0 : outRow[0] += 4;
228 0 : outRow[1] += 4;
229 0 : outRow[2] += 4;
230 0 : outRow[3] += 4;
231 : }
232 0 : }
233 :
234 : // Does vertical convolution to produce one output row. The filter values and
235 : // length are given in the first two parameters. These are applied to each
236 : // of the rows pointed to in the |sourceDataRows| array, with each row
237 : // being |pixelWidth| wide.
238 : //
239 : // The output must have room for |pixelWidth * 4| bytes.
240 : template<bool hasAlpha>
241 0 : void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
242 : int filterLength,
243 : unsigned char* const* sourceDataRows,
244 : int pixelWidth,
245 : unsigned char* outRow) {
246 : // Output four pixels per iteration (16 bytes).
247 0 : int width = pixelWidth & ~3;
248 0 : __m128i zero = _mm_setzero_si128();
249 0 : for (int outX = 0; outX < width; outX += 4) {
250 : // Accumulated result for each pixel. 32 bits per RGBA channel.
251 0 : __m128i accum0 = _mm_setzero_si128();
252 0 : __m128i accum1 = _mm_setzero_si128();
253 0 : __m128i accum2 = _mm_setzero_si128();
254 0 : __m128i accum3 = _mm_setzero_si128();
255 :
256 : // Convolve with one filter coefficient per iteration.
257 0 : for (int filterY = 0; filterY < filterLength; filterY++) {
258 :
259 : // Duplicate the filter coefficient 8 times.
260 : // [16] cj cj cj cj cj cj cj cj
261 0 : __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
262 :
263 : // Load four pixels (16 bytes) together.
264 : // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
265 : const __m128i* src = reinterpret_cast<const __m128i*>(
266 0 : &sourceDataRows[filterY][outX << 2]);
267 0 : __m128i src8 = _mm_loadu_si128(src);
268 :
269 : // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
270 : // multiply with current coefficient => accumulate the result.
271 : // [16] a1 b1 g1 r1 a0 b0 g0 r0
272 0 : __m128i src16 = _mm_unpacklo_epi8(src8, zero);
273 0 : __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
274 0 : __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
275 : // [32] a0 b0 g0 r0
276 0 : __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
277 0 : accum0 = _mm_add_epi32(accum0, t);
278 : // [32] a1 b1 g1 r1
279 0 : t = _mm_unpackhi_epi16(mul_lo, mul_hi);
280 0 : accum1 = _mm_add_epi32(accum1, t);
281 :
282 : // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
283 : // multiply with current coefficient => accumulate the result.
284 : // [16] a3 b3 g3 r3 a2 b2 g2 r2
285 0 : src16 = _mm_unpackhi_epi8(src8, zero);
286 0 : mul_hi = _mm_mulhi_epi16(src16, coeff16);
287 0 : mul_lo = _mm_mullo_epi16(src16, coeff16);
288 : // [32] a2 b2 g2 r2
289 0 : t = _mm_unpacklo_epi16(mul_lo, mul_hi);
290 0 : accum2 = _mm_add_epi32(accum2, t);
291 : // [32] a3 b3 g3 r3
292 0 : t = _mm_unpackhi_epi16(mul_lo, mul_hi);
293 0 : accum3 = _mm_add_epi32(accum3, t);
294 : }
295 :
296 : // Shift right for fixed point implementation.
297 0 : accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
298 0 : accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
299 0 : accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
300 0 : accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
301 :
302 : // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
303 : // [16] a1 b1 g1 r1 a0 b0 g0 r0
304 0 : accum0 = _mm_packs_epi32(accum0, accum1);
305 : // [16] a3 b3 g3 r3 a2 b2 g2 r2
306 0 : accum2 = _mm_packs_epi32(accum2, accum3);
307 :
308 : // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
309 : // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
310 0 : accum0 = _mm_packus_epi16(accum0, accum2);
311 :
312 : if (hasAlpha) {
313 : // Compute the max(ri, gi, bi) for each pixel.
314 : // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
315 0 : __m128i a = _mm_srli_epi32(accum0, 8);
316 : // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
317 0 : __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
318 : // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
319 0 : a = _mm_srli_epi32(accum0, 16);
320 : // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
321 0 : b = _mm_max_epu8(a, b); // Max of r and g and b.
322 : // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
323 0 : b = _mm_slli_epi32(b, 24);
324 :
325 : // Make sure the value of alpha channel is always larger than maximum
326 : // value of color channels.
327 0 : accum0 = _mm_max_epu8(b, accum0);
328 : } else {
329 : // Set value of alpha channels to 0xFF.
330 0 : __m128i mask = _mm_set1_epi32(0xff000000);
331 0 : accum0 = _mm_or_si128(accum0, mask);
332 : }
333 :
334 : // Store the convolution result (16 bytes) and advance the pixel pointers.
335 : _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
336 0 : outRow += 16;
337 : }
338 :
339 : // When the width of the output is not divisible by 4, We need to save one
340 : // pixel (4 bytes) each time. And also the fourth pixel is always absent.
341 0 : int r = pixelWidth & 3;
342 0 : if (r) {
343 0 : __m128i accum0 = _mm_setzero_si128();
344 0 : __m128i accum1 = _mm_setzero_si128();
345 0 : __m128i accum2 = _mm_setzero_si128();
346 0 : for (int filterY = 0; filterY < filterLength; ++filterY) {
347 0 : __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
348 : // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
349 : const __m128i* src = reinterpret_cast<const __m128i*>(
350 0 : &sourceDataRows[filterY][width << 2]);
351 0 : __m128i src8 = _mm_loadu_si128(src);
352 : // [16] a1 b1 g1 r1 a0 b0 g0 r0
353 0 : __m128i src16 = _mm_unpacklo_epi8(src8, zero);
354 0 : __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
355 0 : __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
356 : // [32] a0 b0 g0 r0
357 0 : __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
358 0 : accum0 = _mm_add_epi32(accum0, t);
359 : // [32] a1 b1 g1 r1
360 0 : t = _mm_unpackhi_epi16(mul_lo, mul_hi);
361 0 : accum1 = _mm_add_epi32(accum1, t);
362 : // [16] a3 b3 g3 r3 a2 b2 g2 r2
363 0 : src16 = _mm_unpackhi_epi8(src8, zero);
364 0 : mul_hi = _mm_mulhi_epi16(src16, coeff16);
365 0 : mul_lo = _mm_mullo_epi16(src16, coeff16);
366 : // [32] a2 b2 g2 r2
367 0 : t = _mm_unpacklo_epi16(mul_lo, mul_hi);
368 0 : accum2 = _mm_add_epi32(accum2, t);
369 : }
370 :
371 0 : accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
372 0 : accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
373 0 : accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
374 : // [16] a1 b1 g1 r1 a0 b0 g0 r0
375 0 : accum0 = _mm_packs_epi32(accum0, accum1);
376 : // [16] a3 b3 g3 r3 a2 b2 g2 r2
377 0 : accum2 = _mm_packs_epi32(accum2, zero);
378 : // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
379 0 : accum0 = _mm_packus_epi16(accum0, accum2);
380 : if (hasAlpha) {
381 : // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
382 0 : __m128i a = _mm_srli_epi32(accum0, 8);
383 : // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
384 0 : __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
385 : // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
386 0 : a = _mm_srli_epi32(accum0, 16);
387 : // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
388 0 : b = _mm_max_epu8(a, b); // Max of r and g and b.
389 : // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
390 0 : b = _mm_slli_epi32(b, 24);
391 0 : accum0 = _mm_max_epu8(b, accum0);
392 : } else {
393 0 : __m128i mask = _mm_set1_epi32(0xff000000);
394 0 : accum0 = _mm_or_si128(accum0, mask);
395 : }
396 :
397 0 : for (int i = 0; i < r; i++) {
398 0 : *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
399 0 : accum0 = _mm_srli_si128(accum0, 4);
400 0 : outRow += 4;
401 : }
402 : }
403 0 : }
404 :
405 : #elif defined(SK_ARM_HAS_NEON)
406 :
// Scalar tail of the horizontal convolvers: folds the last |r| filter taps
// (those that do not fill a whole group of four) into |accum|.
//   pixelsLeft   - first byte of the first pixel still to be consumed; pixels
//                  are 4 bytes each, so 4 * r bytes are read.
//   filterValues - the |r| coefficients that pair with those pixels.
//   accum        - four 32-bit per-channel sums, updated in place.
//   r            - number of taps to apply (the callers in this file pass 1 - 3).
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int channelSum[4] = {0, 0, 0, 0};
    for (int tap = 0; tap < r; ++tap) {
        const SkConvolutionFilter1D::ConvolutionFixed weight = filterValues[tap];
        const unsigned char* pixel = pixelsLeft + tap * 4;
        for (int channel = 0; channel < 4; ++channel) {
            channelSum[channel] += weight * pixel[channel];
        }
    }
    // Lane i of the vector receives byte i of the pixel, i.e. memory order.
    const int32x4_t tail = {channelSum[0], channelSum[1], channelSum[2], channelSum[3]};
    accum = vaddq_s32(accum, tail);
}
420 :
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
//
// One 4-byte output pixel is written to |outRow| per filter value. The four
// channels of a pixel are accumulated together as 32-bit sums, shifted right by
// SkConvolutionFilter1D::kShiftBits, and then saturated down to 8 bits each.
// The trailing bool (hasAlpha) is ignored by this implementation.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // Byte-table indices that replicate 16-bit coefficient k into all four
    // lanes of a vector (bytes 2k and 2k+1, repeated four times).
    const uint8x8_t pickCoeff0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t pickCoeff1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t pickCoeff2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t pickCoeff3 = vcreate_u8(0x0706070607060706);

    const int outputCount = filter.numValues();
    for (int outX = 0; outX < outputCount; ++outX, outRow += 4) {
        // Coefficients for this output pixel and the source pixel they start at.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // First source byte the filter touches; |filterLength| pixels of
        // 4 bytes each follow it.
        const unsigned char* rowToFilter = srcData + filterOffset * 4;

        int32x4_t sum = vdupq_n_s32(0);

        // Main loop: four coefficients against four source pixels per pass.
        const int fullGroups = filterLength >> 2;
        for (int group = 0; group < fullGroups; ++group) {
            // Load c0..c3 and broadcast each one across its own vector.
            const uint8x8_t coeffBytes = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t c0 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, pickCoeff0));
            const int16x4_t c1 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, pickCoeff1));
            const int16x4_t c2 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, pickCoeff2));
            const int16x4_t c3 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, pickCoeff3));

            // Load four pixels and widen every channel from 8 to 16 bits.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t px01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t px23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // Widening multiply of each pixel by its coefficient, then accumulate.
            sum = vaddq_s32(sum, vmull_s16(vget_low_s16(px01), c0));
            sum = vaddq_s32(sum, vmull_s16(vget_high_s16(px01), c1));
            sum = vaddq_s32(sum, vmull_s16(vget_low_s16(px23), c2));
            sum = vaddq_s32(sum, vmull_s16(vget_high_s16(px23), c3));

            // Step past the four pixels and four coefficients just consumed.
            rowToFilter += 16;
            filterValues += 4;
        }

        // A filter length that is not a multiple of four leaves 1 - 3 taps,
        // which are applied one at a time.
        const int leftover = filterLength & 3;
        if (leftover != 0) {
            const int leftoverOffset = (filterOffset + filterLength - leftover) * 4;
            AccumRemainder(srcData + leftoverOffset, filterValues, sum, leftover);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        sum = vshrq_n_s32(sum, SkConvolutionFilter1D::kShiftBits);

        // Saturate 32 -> 16 (signed) and 16 -> 8 (unsigned), then store the
        // first 32-bit lane, which holds the finished pixel.
        const int16x4_t sum16 = vqmovn_s32(sum);
        const uint8x8_t sum8 = vqmovun_s16(vcombine_s16(sum16, sum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(sum8), 0);
    }
}
496 :
// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The algorithm is almost same as |convolve_horizontally|. Please
// refer to that function for detailed comments.
//
//   srcData     - the four source rows; the same filter is applied to each.
//   filter      - supplies, per output pixel, the coefficients and the source
//                 offset they start at.
//   outRow      - the four destination row pointers. The pointers stored in
//                 this array are advanced by 4 bytes per output pixel, so the
//                 caller's array is modified.
//   outRowBytes - not used by this implementation.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // Output one pixel each iteration, calculating all channels (RGBA) together.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {

        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // four pixels in a column per iteration.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Byte-table indices that replicate 16-bit coefficient k into all four
        // lanes of a vector (bytes 2k and 2k+1, repeated four times).
        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

        // Byte offset into every source row of the next pixel to consume.
        int start = filterOffset * 4;

        // We will load and accumulate with four coefficients per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

            coeffs = vld1_s16(filterValues);
            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            uint8x16_t pixels;
            int16x8_t p01_16, p23_16;
            int32x4_t p0, p1, p2, p3;

            // Multiplies the four pixels at |src| by coeff0..coeff3 and adds
            // the products into |accum|. Uses the scratch variables declared
            // just above.
#define ITERATION(src, accum)                                                       \
            pixels = vld1q_u8(src);                                                 \
            p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));          \
            p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));         \
            p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                           \
            p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                          \
            p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                           \
            p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                          \
            accum += p0;                                                            \
            accum += p1;                                                            \
            accum += p2;                                                            \
            accum += p3

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);

            // Keep the helper macro from leaking out of this header into every
            // file that includes it.
#undef ITERATION

            start += 16;
            filterValues += 4;
        }

        // Apply the last 1 - 3 coefficients when |filterLength| is not a
        // multiple of four.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

        // Drops the fixed-point fraction bits of |accum| and saturates it
        // 32 -> 16 bits (signed) and 16 -> 8 bits (unsigned) into |res|.
#define PACK_RESULT(accum, res)                                                     \
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);              \
        accum16 = vqmovn_s32(accum);                                                \
        res = vqmovun_s16(vcombine_s16(accum16, accum16));

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);

        // As with ITERATION, do not let this helper escape the header.
#undef PACK_RESULT

        // The first 32-bit lane of each result holds the finished pixel.
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
594 :
595 :
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
//
// When |hasAlpha| is true each output alpha byte is raised to at least the
// largest of that pixel's three colour bytes; otherwise alpha is forced to 0xFF.
//
// NOTE: the leftover path (pixelWidth not a multiple of 4) still issues a full
// 16-byte load from every source row, so up to 12 bytes past |pixelWidth * 4|
// are read from each row. Those bytes must be readable.
template<bool hasAlpha>
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow) {
    // Number of pixels that can be handled four at a time.
    const int alignedWidth = pixelWidth & ~3;

    // Main loop: four output pixels (16 bytes) per pass.
    for (int outX = 0; outX < alignedWidth; outX += 4) {
        // One 32-bit-per-channel accumulator for each of the four pixels.
        int32x4_t sum0 = vdupq_n_s32(0);
        int32x4_t sum1 = vdupq_n_s32(0);
        int32x4_t sum2 = vdupq_n_s32(0);
        int32x4_t sum3 = vdupq_n_s32(0);

        // One filter coefficient, i.e. one source row, per pass.
        for (int row = 0; row < filterLength; ++row) {
            // [16] cj cj cj cj
            const int16x4_t coeff16 = vdup_n_s16(filterValues[row]);

            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            const uint8x16_t src8 = vld1q_u8(sourceDataRows[row] + (outX << 2));

            // Widen every channel from 8 to 16 bits.
            const int16x8_t px01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            const int16x8_t px23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            // Widening multiply by the coefficient, then accumulate per pixel.
            sum0 = vaddq_s32(sum0, vmull_s16(vget_low_s16(px01), coeff16));
            sum1 = vaddq_s32(sum1, vmull_s16(vget_high_s16(px01), coeff16));
            sum2 = vaddq_s32(sum2, vmull_s16(vget_low_s16(px23), coeff16));
            sum3 = vaddq_s32(sum3, vmull_s16(vget_high_s16(px23), coeff16));
        }

        // Drop the fixed-point fraction bits.
        sum0 = vshrq_n_s32(sum0, SkConvolutionFilter1D::kShiftBits);
        sum1 = vshrq_n_s32(sum1, SkConvolutionFilter1D::kShiftBits);
        sum2 = vshrq_n_s32(sum2, SkConvolutionFilter1D::kShiftBits);
        sum3 = vshrq_n_s32(sum3, SkConvolutionFilter1D::kShiftBits);

        // 32 -> 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        const int16x8_t packed01 = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        const int16x8_t packed23 = vcombine_s16(vqmovn_s32(sum2), vqmovn_s32(sum3));

        // 16 -> 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        uint8x16_t pixels = vcombine_u8(vqmovun_s16(packed01), vqmovun_s16(packed23));

        if (hasAlpha) {
            // Bring g, then b, down into the r byte of each pixel and take the
            // running byte-wise maximum there.
            const uint32x4_t pixels32 = vreinterpretq_u32_u8(pixels);
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            uint8x16_t maxColor =
                vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 8)), pixels);
            maxColor = vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 16)), maxColor);
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            maxColor = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxColor), 24));

            // Alpha must never be smaller than the largest colour channel.
            pixels = vmaxq_u8(maxColor, pixels);
        } else {
            // Force every alpha byte to 0xFF.
            pixels = vreinterpretq_u8_u32(
                vorrq_u32(vreinterpretq_u32_u8(pixels), vdupq_n_u32(0xFF000000)));
        }

        // Write the four finished pixels and move on.
        vst1q_u8(outRow, pixels);
        outRow += 16;
    }

    // Leftover 1 - 3 pixels when the width is not a multiple of four. The
    // fourth pixel of the group is never needed, so it is not accumulated.
    const int leftover = pixelWidth & 3;
    if (leftover != 0) {
        int32x4_t sum0 = vdupq_n_s32(0);
        int32x4_t sum1 = vdupq_n_s32(0);
        int32x4_t sum2 = vdupq_n_s32(0);

        for (int row = 0; row < filterLength; ++row) {
            const int16x4_t coeff16 = vdup_n_s16(filterValues[row]);

            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            const uint8x16_t src8 = vld1q_u8(sourceDataRows[row] + (alignedWidth << 2));

            const int16x8_t px01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            const int16x8_t px23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            sum0 = vaddq_s32(sum0, vmull_s16(vget_low_s16(px01), coeff16));
            sum1 = vaddq_s32(sum1, vmull_s16(vget_high_s16(px01), coeff16));
            sum2 = vaddq_s32(sum2, vmull_s16(vget_low_s16(px23), coeff16));
        }

        sum0 = vshrq_n_s32(sum0, SkConvolutionFilter1D::kShiftBits);
        sum1 = vshrq_n_s32(sum1, SkConvolutionFilter1D::kShiftBits);
        sum2 = vshrq_n_s32(sum2, SkConvolutionFilter1D::kShiftBits);

        // Pixel 2 is duplicated into the unused fourth slot.
        const int16x8_t packed01 = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
        const int16x8_t packed22 = vcombine_s16(vqmovn_s32(sum2), vqmovn_s32(sum2));

        uint8x16_t pixels = vcombine_u8(vqmovun_s16(packed01), vqmovun_s16(packed22));

        if (hasAlpha) {
            // Same alpha >= max(colour) adjustment as in the main loop.
            const uint32x4_t pixels32 = vreinterpretq_u32_u8(pixels);
            uint8x16_t maxColor =
                vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 8)), pixels);
            maxColor = vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 16)), maxColor);
            maxColor = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxColor), 24));
            pixels = vmaxq_u8(maxColor, pixels);
        } else {
            // Force every alpha byte to 0xFF.
            pixels = vreinterpretq_u8_u32(
                vorrq_u32(vreinterpretq_u32_u8(pixels), vdupq_n_u32(0xFF000000)));
        }

        // Store exactly |leftover| pixels: one lane, two lanes, or two lanes
        // plus the third.
        const uint32x4_t out32 = vreinterpretq_u32_u8(pixels);
        uint32_t* dst = reinterpret_cast<uint32_t*>(outRow);
        if (leftover == 1) {
            vst1q_lane_u32(dst, out32, 0);
        } else {
            vst1_u32(dst, vget_low_u32(out32));
            if (leftover == 3) {
                vst1q_lane_u32(dst + 2, out32, 2);
            }
        }
    }
}
757 :
758 : #else
759 :
// Saturates |a| to the unsigned 8-bit range: values below 0 become 0, values
// above 255 become 255, everything else is returned unchanged.
inline unsigned char ClampTo8(int a) {
    // A single unsigned comparison covers 0 <= a <= 255, because negative
    // inputs wrap around to large unsigned values. This keeps the common
    // in-range case down to one check.
    const bool fitsInByte = static_cast<unsigned>(a) <= 255u;
    if (!fitsInByte) {
        return (a < 0) ? 0 : 255;
    }
    return a;
}
771 :
772 : // Convolves horizontally along a single row. The row data is given in
773 : // |srcData| and continues for the numValues() of the filter.
774 : template<bool hasAlpha>
775 : void ConvolveHorizontally(const unsigned char* srcData,
776 : const SkConvolutionFilter1D& filter,
777 : unsigned char* outRow) {
778 : // Loop over each pixel on this row in the output image.
779 : int numValues = filter.numValues();
780 : for (int outX = 0; outX < numValues; outX++) {
781 : // Get the filter that determines the current output pixel.
782 : int filterOffset, filterLength;
783 : const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
784 : filter.FilterForValue(outX, &filterOffset, &filterLength);
785 :
786 : // Compute the first pixel in this row that the filter affects. It will
787 : // touch |filterLength| pixels (4 bytes each) after this.
788 : const unsigned char* rowToFilter = &srcData[filterOffset * 4];
789 :
790 : // Apply the filter to the row to get the destination pixel in |accum|.
791 : int accum[4] = {0};
792 : for (int filterX = 0; filterX < filterLength; filterX++) {
793 : SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX];
794 : accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
795 : accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
796 : accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
797 : if (hasAlpha) {
798 : accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
799 : }
800 : }
801 :
802 : // Bring this value back in range. All of the filter scaling factors
803 : // are in fixed point with kShiftBits bits of fractional part.
804 : accum[0] >>= SkConvolutionFilter1D::kShiftBits;
805 : accum[1] >>= SkConvolutionFilter1D::kShiftBits;
806 : accum[2] >>= SkConvolutionFilter1D::kShiftBits;
807 : if (hasAlpha) {
808 : accum[3] >>= SkConvolutionFilter1D::kShiftBits;
809 : }
810 :
811 : // Store the new pixel.
812 : outRow[outX * 4 + 0] = ClampTo8(accum[0]);
813 : outRow[outX * 4 + 1] = ClampTo8(accum[1]);
814 : outRow[outX * 4 + 2] = ClampTo8(accum[2]);
815 : if (hasAlpha) {
816 : outRow[outX * 4 + 3] = ClampTo8(accum[3]);
817 : }
818 : }
819 : }
820 :
821 : // Does vertical convolution to produce one output row. The filter values and
822 : // length are given in the first two parameters. These are applied to each
823 : // of the rows pointed to in the |sourceDataRows| array, with each row
824 : // being |pixelWidth| wide.
825 : //
826 : // The output must have room for |pixelWidth * 4| bytes.
827 : template<bool hasAlpha>
828 : void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
829 : int filterLength,
830 : unsigned char* const* sourceDataRows,
831 : int pixelWidth,
832 : unsigned char* outRow) {
833 : // We go through each column in the output and do a vertical convolution,
834 : // generating one output pixel each time.
835 : for (int outX = 0; outX < pixelWidth; outX++) {
836 : // Compute the number of bytes over in each row that the current column
837 : // we're convolving starts at. The pixel will cover the next 4 bytes.
838 : int byteOffset = outX * 4;
839 :
840 : // Apply the filter to one column of pixels.
841 : int accum[4] = {0};
842 : for (int filterY = 0; filterY < filterLength; filterY++) {
843 : SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY];
844 : accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
845 : accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
846 : accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
847 : if (hasAlpha) {
848 : accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
849 : }
850 : }
851 :
852 : // Bring this value back in range. All of the filter scaling factors
853 : // are in fixed point with kShiftBits bits of precision.
854 : accum[0] >>= SkConvolutionFilter1D::kShiftBits;
855 : accum[1] >>= SkConvolutionFilter1D::kShiftBits;
856 : accum[2] >>= SkConvolutionFilter1D::kShiftBits;
857 : if (hasAlpha) {
858 : accum[3] >>= SkConvolutionFilter1D::kShiftBits;
859 : }
860 :
861 : // Store the new pixel.
862 : outRow[byteOffset + 0] = ClampTo8(accum[0]);
863 : outRow[byteOffset + 1] = ClampTo8(accum[1]);
864 : outRow[byteOffset + 2] = ClampTo8(accum[2]);
865 : if (hasAlpha) {
866 : unsigned char alpha = ClampTo8(accum[3]);
867 :
868 : // Make sure the alpha channel doesn't come out smaller than any of the
869 : // color channels. We use premultipled alpha channels, so this should
870 : // never happen, but rounding errors will cause this from time to time.
871 : // These "impossible" colors will cause overflows (and hence random pixel
872 : // values) when the resulting bitmap is drawn to the screen.
873 : //
874 : // We only need to do this when generating the final output row (here).
875 : int maxColorChannel = SkTMax(outRow[byteOffset + 0],
876 : SkTMax(outRow[byteOffset + 1],
877 : outRow[byteOffset + 2]));
878 : if (alpha < maxColorChannel) {
879 : outRow[byteOffset + 3] = maxColorChannel;
880 : } else {
881 : outRow[byteOffset + 3] = alpha;
882 : }
883 : } else {
884 : // No alpha channel, the image is opaque.
885 : outRow[byteOffset + 3] = 0xff;
886 : }
887 : }
888 : }
889 :
890 : // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally
891 : // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles
892 : // suffer here too.
893 : //
894 : // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575
895 : #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE)
896 : #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline))
897 : #else
898 : #define SK_MAYBE_DISABLE_VECTORIZATION
899 : #endif
900 :
901 : SK_MAYBE_DISABLE_VECTORIZATION
902 : void convolve_horizontally(const unsigned char* srcData,
903 : const SkConvolutionFilter1D& filter,
904 : unsigned char* outRow,
905 : bool hasAlpha) {
906 : if (hasAlpha) {
907 : ConvolveHorizontally<true>(srcData, filter, outRow);
908 : } else {
909 : ConvolveHorizontally<false>(srcData, filter, outRow);
910 : }
911 : }
912 : #undef SK_MAYBE_DISABLE_VECTORIZATION
913 :
914 : void (*convolve_4_rows_horizontally)(const unsigned char* srcData[4],
915 : const SkConvolutionFilter1D& filter,
916 : unsigned char* outRow[4],
917 : size_t outRowBytes)
918 : = nullptr;
919 :
920 :
921 : #endif
922 :
923 0 : void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
924 : int filterLength,
925 : unsigned char* const* sourceDataRows,
926 : int pixelWidth,
927 : unsigned char* outRow,
928 : bool hasAlpha) {
929 0 : if (hasAlpha) {
930 : ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
931 0 : pixelWidth, outRow);
932 : } else {
933 : ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
934 0 : pixelWidth, outRow);
935 : }
936 0 : }
937 :
938 : } // namespace SK_OPTS_NS
939 :
940 : #endif//SkBitmapFilter_opts_DEFINED
|