Line data Source code
1 : /* This Source Code Form is subject to the terms of the Mozilla Public
2 : * License, v. 2.0. If a copy of the MPL was not distributed with this
3 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 :
5 : #include "Blur.h"
6 :
7 : #include "SSEHelpers.h"
8 :
9 : #include <string.h>
10 :
11 : namespace mozilla {
12 : namespace gfx {
13 :
// Divides each of the four packed 32-bit lanes in aValues by the divisor
// encoded in aDivisor, where every lane of aDivisor holds the 32.32
// fixed-point reciprocal (2^32 / divisor) of the true divisor. The result
// is rounded to nearest by adding 0.5 (1 << 31) before truncation.
MOZ_ALWAYS_INLINE
__m128i Divide(__m128i aValues, __m128i aDivisor)
{
  // Selects the upper 32 bits of each 64-bit lane (where the lane-2/lane-4
  // quotients land after the multiply).
  const __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
  // 0.5 in 32.32 fixed point, replicated into both 64-bit lanes.
  static const union {
    int64_t i64[2];
    __m128i m;
  } roundingAddition = { { int64_t(1) << 31, int64_t(1) << 31 } };

  // _mm_mul_epu32 only multiplies the even (0 and 2) 32-bit lanes, producing
  // two 64-bit products. Shifting aValues right by 32 first exposes the odd
  // lanes (1 and 3) so a second multiply covers them.
  __m128i multiplied31 = _mm_mul_epu32(aValues, aDivisor);
  __m128i multiplied42 = _mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor);

  // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
  // result is rounded.
  __m128i p_3_1 = _mm_srli_epi64(_mm_add_epi64(multiplied31, roundingAddition.m), 32);
  __m128i p4_2_ = _mm_and_si128(_mm_add_epi64(multiplied42, roundingAddition.m), mask);
  // Interleave the two partial results back into lane order 4,3,2,1.
  __m128i p4321 = _mm_or_si128(p_3_1, p4_2_);
  return p4321;
}
33 :
34 : MOZ_ALWAYS_INLINE
35 1140 : __m128i BlurFourPixels(const __m128i& aTopLeft, const __m128i& aTopRight,
36 : const __m128i& aBottomRight, const __m128i& aBottomLeft,
37 : const __m128i& aDivisor)
38 : {
39 4560 : __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(aBottomRight, aTopRight), aBottomLeft), aTopLeft);
40 1140 : return Divide(values, aDivisor);
41 : }
42 :
43 : MOZ_ALWAYS_INLINE
44 6 : void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
45 : int32_t aSourceWidth, int32_t aLeftInflation,
46 : int32_t aRightInflation)
47 : {
48 6 : int32_t currentRowSum = 0;
49 :
50 46 : for (int x = 0; x < aLeftInflation; x++) {
51 40 : currentRowSum += aSource[0];
52 40 : aDest[x] = currentRowSum;
53 : }
54 234 : for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
55 228 : currentRowSum += aSource[(x - aLeftInflation)];
56 228 : aDest[x] = currentRowSum;
57 : }
58 28 : for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
59 22 : currentRowSum += aSource[aSourceWidth - 1];
60 22 : aDest[x] = currentRowSum;
61 : }
62 6 : }
63 :
64 : // This function calculates an integral of four pixels stored in the 4
65 : // 32-bit integers on aPixels. i.e. for { 30, 50, 80, 100 } this returns
66 : // { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
67 : // much testing.
68 : MOZ_ALWAYS_INLINE
69 1406 : __m128i AccumulatePixelSums(__m128i aPixels)
70 : {
71 1406 : __m128i sumPixels = aPixels;
72 1406 : __m128i currentPixels = _mm_slli_si128(aPixels, 4);
73 1406 : sumPixels = _mm_add_epi32(sumPixels, currentPixels);
74 2812 : currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);
75 :
76 1406 : return _mm_add_epi32(sumPixels, currentPixels);
77 : }
78 :
79 : MOZ_ALWAYS_INLINE void
80 3 : GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
81 : int32_t aTopInflation, int32_t aBottomInflation,
82 : uint32_t *aIntegralImage, size_t aIntegralImageStride,
83 : uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
84 : {
85 3 : MOZ_ASSERT(!(aLeftInflation & 3));
86 :
87 3 : uint32_t stride32bit = aIntegralImageStride / 4;
88 :
89 3 : IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
90 6 : aSize.height + aTopInflation + aBottomInflation);
91 :
92 3 : LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
93 :
94 17 : for (int y = 1; y < aTopInflation + 1; y++) {
95 14 : uint32_t *intRow = aIntegralImage + (y * stride32bit);
96 14 : uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
97 14 : uint32_t *intFirstRow = aIntegralImage;
98 :
99 192 : for (int x = 0; x < integralImageSize.width; x += 4) {
100 356 : __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
101 356 : __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
102 178 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
103 : }
104 : }
105 :
106 114 : for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
107 111 : __m128i currentRowSum = _mm_setzero_si128();
108 111 : uint32_t *intRow = aIntegralImage + (y * stride32bit);
109 111 : uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
110 111 : uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
111 :
112 111 : uint32_t pixel = sourceRow[0];
113 296 : for (int x = 0; x < aLeftInflation; x += 4) {
114 370 : __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));
115 :
116 370 : sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
117 :
118 185 : currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
119 :
120 555 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
121 : }
122 1221 : for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
123 1110 : uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));
124 :
125 : // It's important to shuffle here. When we exit this loop currentRowSum
126 : // has to be set to sumPixels, so that the following loop can get the
127 : // correct pixel for the currentRowSum. The highest order pixel in
128 : // currentRowSum could've originated from accumulation in the stride.
129 1110 : currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
130 :
131 5550 : __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
132 2220 : sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
133 :
134 1110 : currentRowSum = sumPixels;
135 :
136 3330 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
137 : }
138 :
139 111 : pixel = sourceRow[aSize.width - 1];
140 111 : int x = (aSize.width + aLeftInflation);
141 111 : if ((aSize.width & 3)) {
142 : // Deal with unaligned portion. Get the correct pixel from currentRowSum,
143 : // see explanation above.
144 111 : uint32_t intCurrentRowSum = ((uint32_t*)¤tRowSum)[(aSize.width % 4) - 1];
145 555 : for (; x < integralImageSize.width; x++) {
146 : // We could be unaligned here!
147 333 : if (!(x & 3)) {
148 : // aligned!
149 222 : currentRowSum = _mm_set1_epi32(intCurrentRowSum);
150 111 : break;
151 : }
152 222 : intCurrentRowSum += pixel;
153 222 : intRow[x] = intPrevRow[x] + intCurrentRowSum;
154 : }
155 : } else {
156 0 : currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
157 : }
158 333 : for (; x < integralImageSize.width; x += 4) {
159 222 : __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));
160 :
161 222 : sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
162 :
163 111 : currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
164 :
165 333 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
166 : }
167 : }
168 :
169 3 : if (aBottomInflation) {
170 : // Store the last valid row of our source image in the last row of
171 : // our integral image. This will be overwritten with the correct values
172 : // in the upcoming loop.
173 3 : LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
174 6 : aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
175 :
176 :
177 14 : for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
178 11 : __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
179 11 : __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
180 11 : __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);
181 :
182 150 : for (int x = 0; x < integralImageSize.width; x += 4) {
183 417 : _mm_store_si128(intRow + (x / 4),
184 139 : _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
185 139 : _mm_load_si128(intPrevRow + (x / 4))));
186 : }
187 : }
188 : }
189 3 : }
190 :
/**
 * Attempt to do an in-place box blur using an integral image.
 *
 * aData is the surface being blurred in place. aIntegralImage is
 * caller-provided scratch space with aIntegralImageStride bytes per row,
 * sized for the surface inflated by the lobes (left lobe rounded up to a
 * multiple of 4 for SIMD alignment). The lobes give the blur window's
 * extent on each side of a pixel.
 */
void
AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData,
                           int32_t aLeftLobe,
                           int32_t aRightLobe,
                           int32_t aTopLobe,
                           int32_t aBottomLobe,
                           uint32_t *aIntegralImage,
                           size_t aIntegralImageStride)
{
  IntSize size = GetSize();

  MOZ_ASSERT(size.height > 0);

  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  // looking at an integral image the value of a pixel at 'x,y' is calculated
  // using the value of the integral image values above/below that.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  // A 1x1 box is the identity blur; nothing to do.
  if (boxSize == 1) {
    return;
  }

  // 32.32 fixed-point reciprocal of the box area; Divide() multiplies by
  // this instead of performing a per-pixel integer division.
  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

  uint32_t stride32bit = aIntegralImageStride / 4;
  // Round the left inflation up so integral-image rows stay 16-byte aligned.
  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                             aIntegralImage, aIntegralImageStride, aData,
                             mStride, size);

  __m128i divisor = _mm_set1_epi32(reciprocal);

  // This points to the start of the rectangle within the IntegralImage that overlaps
  // the surface being blurred.
  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;

  // NOTE(review): mSkipRect marks a region whose blurred output is skipped —
  // presumably unused by the caller; the x-jumps below fast-forward past it.
  IntRect skipRect = mSkipRect;
  int32_t stride = mStride;
  uint8_t *data = aData;
  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    // Corner-pointer base rows of the box window for this output row.
    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);

    int32_t x = 0;
    // Process 16 pixels at a time for as long as possible.
    for (; x <= size.width - 16; x += 16) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        // Jump so the next iteration's x += 16 lands just past the skip rect.
        x = skipRect.XMost() - 16;
        // Trigger early jump on coming loop iterations, this will be reset
        // next line anyway.
        inSkipRectY = false;
        continue;
      }

      __m128i topLeft;
      __m128i topRight;
      __m128i bottomRight;
      __m128i bottomLeft;

      // Four groups of four pixels; each group is blurred independently.
      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
      __m128i result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 4));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 4));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 4));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 4));
      __m128i result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 8));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 8));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 8));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 8));
      __m128i result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 12));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 12));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 12));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 12));
      __m128i result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      // Narrow the 16 32-bit results down to 16 bytes and store in place.
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result1, result2), _mm_packs_epi32(result3, result4));

      _mm_storeu_si128((__m128i*)(data + stride * y + x), final);
    }

    // Process the remaining pixels 4 bytes at a time.
    // NOTE(review): the tail stores a full uint32_t even when fewer than 4
    // pixels remain — assumes the row stride leaves room past size.width.
    for (; x < size.width; x += 4) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 4;
        // Trigger early jump on coming loop iterations, this will be reset
        // next line anyway.
        inSkipRectY = false;
        continue;
      }
      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));

      __m128i result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result, _mm_setzero_si128()), _mm_setzero_si128());

      *(uint32_t*)(data + stride * y + x) = _mm_cvtsi128_si32(final);
    }
  }

}
313 :
314 : } // namespace gfx
315 : } // namespace mozilla
|