/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder lets us write the sum of two
 * numbers as:
 * R = S + 2C, or, in the case of a and b, (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * pairwise carries, i.e. (a & b) | (a & c) | (b & c), since the sum of 3
 * bits can only ever produce a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1 or, provided
 * overflow and underflow are avoided, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4 input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or, reversing the derivation:
 * avg = (((sum >> 1) + carry + (d >> 1)) >> 1)
 * avg = ((((a + b + c) >> 1) + (d >> 1)) >> 1)
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average into a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
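
/* A worked instance of the identity above on one byte (values chosen purely
 * for illustration): with a = 10 and b = 13, f(a, b) = 12 (rounded) while
 * g(a, b) = 11 (truncated). Taking the 8-bit complements ~a = 245 and
 * ~b = 242 gives f(~a, ~b) = 244, and ~244 = 11 == g(a, b). The hypothetical
 * helper below spells this out for a single byte lane; it is only a sketch of
 * the trick and is not used by the SIMD code in this file.
 */
MOZ_ALWAYS_INLINE uint32_t TruncatedAvgOfByte_Example(uint32_t a, uint32_t b)
{
  // Model of what _mm_avg_epu8 does to one byte lane: a rounded average,
  // applied here to the complemented inputs.
  uint32_t rounded = ((0xff & ~a) + (0xff & ~b) + 1) >> 1;
  // Complementing the result turns the rounded average into the truncated
  // one: ~f(~a, ~b) == (a + b) >> 1 for 0 <= a, b <= 255.
  return 0xff & ~rounded;
}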

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here because MSVC does not allow passing more than
 * three __m128i arguments on the stack, nor does it allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010; it does -not- inline
 * with just the inline directive.
 */
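// avg_sse2_8x2 consumes eight 32-bit pixels from each of two rows (a/b hold
// the upper row, c/d the lower row) and returns the four pixels that are the
// truncated averages of the 2x2 blocks, using the sum/carry scheme described
// at the top of this file.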
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile-time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)));

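  // shuf1 gathers the even-indexed 32-bit lanes of its two inputs and shuf2
  // the odd-indexed ones, so after the four shuffles below *a and *c hold the
  // left pixel of each horizontal pair while *b and *d hold the right pixel,
  // lane for lane, for the upper and lower row respectively.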
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

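// avg_sse2_4x2_4x1 returns the truncated per-byte average of two registers of
// four 32-bit pixels each; with a holding a row of pixels and b the row below
// it, this collapses a 4x2 block into the corresponding 4x1 output row.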
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

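// avg_sse2_8x1_4x1 takes eight horizontally consecutive 32-bit pixels (split
// across a and b) and returns the four truncated averages of each adjacent
// pair: the shuffles separate the odd- and even-indexed pixels before the
// per-byte average.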
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

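// Scalar counterpart of the 2x2 averaging above: the truncated average of
// four 32-bit pixels, computed per byte within a 32-bit word using the same
// sum/carry decomposition.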
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-based average instruction we have to mask off the low bit
  // of each byte before shifting, so that it cannot spill into the byte below
  // it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
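
/* A worked instance of the masked average used by Avg2 and Avg2x2, with
 * arbitrary example values: averaging a = 0xff010203 and b = 0x01030507
 * per byte gives
 *   sum                     = a ^ b = 0xfe020704
 *   carry                   = a & b = 0x01010003
 *   (sum & 0xfefefefe) >> 1 = 0xfe020604 >> 1 = 0x7f010302
 * (the mask clears the low bit of each byte so the shift cannot move it into
 * the neighbouring byte), and
 *   result = 0x7f010302 + 0x01010003 = 0x80020305,
 * which is the truncated per-byte average: (0xff + 0x01) >> 1 = 0x80,
 * (0x01 + 0x03) >> 1 = 0x02, (0x02 + 0x05) >> 1 = 0x03,
 * (0x03 + 0x07) >> 1 = 0x05.
 */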

namespace mozilla {
namespace gfx {

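// Halves the image in both dimensions: each destination pixel is the
// truncated average of a 2x2 block of source pixels. The bulk of each row is
// processed eight source pixels at a time with SSE2, picking aligned or
// unaligned loads per row; any remaining pixels fall back to the scalar
// Avg2x2.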
void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)upperRow + 1);
        __m128i c = _mm_load_si128((__m128i*)lowerRow);
        __m128i d = _mm_load_si128((__m128i*)lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we would only have to do this in the last row, since
    // overrunning by up to 8 pixels in an earlier row appears to be harmless:
    // it doesn't touch invalid memory, even when reading from and writing to
    // the same surface. In practice we only do this when doing an additional
    // downscale pass, and in that situation we have unused stride to write
    // into harmlessly. I do not believe the additional code complexity would
    // be worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

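// Halves the image vertically only: each destination pixel is the truncated
// average of a source pixel and the pixel directly below it; the width is
// left unchanged.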
void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // The lower row is not 16-byte aligned, so it has to be loaded
      // unaligned.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous function apply.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

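// Halves the image horizontally only: each destination pixel is the truncated
// average of two horizontally adjacent source pixels; the height is left
// unchanged.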
void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous functions apply.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

} // namespace gfx
} // namespace mozilla