/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i
LoadRemainder_SSE2(const uint8_t* aSrc, size_t aLength)
{
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(px,
        _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void
StoreRemainder_SSE2(uint8_t* aDst, size_t aLength, const __m128i& aSrc)
{
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
        _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}

// Premultiply vector of 4 pixels using splayed math.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
PremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }
  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));

  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}
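
// Illustrative only (not part of the original file): a scalar sketch of the
// approximate divide-by-255 that PremultiplyVector_SSE2 applies to every
// component at once. The helper name is hypothetical.
#if 0
static uint8_t PremultiplyComponent_Scalar(uint8_t aColor, uint8_t aAlpha)
{
  // t = c*a + 255; t += t >> 8; t >>= 8 approximates c*a/255 without dividing,
  // mirroring the mullo/add/shift steps in the vector code above.
  uint32_t t = uint32_t(aColor) * aAlpha + 255;
  t += t >> 8;
  return uint8_t(t >> 8);
}
#endif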

template<bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                      uint8_t* aDst, int32_t aDstGap,
                      IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of premultiply variants here.
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
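
// A hedged usage sketch (not in the original file): the gap arguments are the
// leftover bytes at the end of each row, so a caller working with byte strides
// would pass stride minus 4 * width for each surface. Names are illustrative.
#if 0
static void PremultiplySurfaceExample(const uint8_t* aSrcData, int32_t aSrcStride,
                                      uint8_t* aDstData, int32_t aDstStride,
                                      IntSize aSize)
{
  Premultiply_SSE2<false, false>(aSrcData, aSrcStride - 4 * aSize.width,
                                 aDstData, aDstStride - 4 * aSize.width,
                                 aSize);
}
#endif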

// This generates a table of fixed-point reciprocals representing 1/alpha,
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than for larger alphas. We take advantage of this by
// shifting the reciprocal down by 8 bits when the alpha value is less than
// 0x20, and by 3 bits otherwise. This is then undone by multiplying the
// color component to be unpremultiplied by 0x100 or 8, respectively.
// The 16 bit reciprocal is duplicated into both words of a uint32_t here to
// reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
static const uint32_t sUnpremultiplyTable_SSE2[256] =
{
  0, UNPREMULQ_SSE2(1), UNPREMULQ_SSE2_2(2), UNPREMULQ_SSE2_4(4),
  UNPREMULQ_SSE2_8(8), UNPREMULQ_SSE2_16(16), UNPREMULQ_SSE2_32(32),
  UNPREMULQ_SSE2_32(64), UNPREMULQ_SSE2_32(96), UNPREMULQ_SSE2_32(128),
  UNPREMULQ_SSE2_32(160), UNPREMULQ_SSE2_32(192), UNPREMULQ_SSE2_32(224)
};
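
// Illustrative only (not part of the original file): a scalar sketch of how a
// table entry is intended to be used, assuming a premultiplied component c
// (so c <= a) and a nonzero alpha a; entry 0 is simply 0. The vector code
// below performs the same multiply-and-take-high-word on four pixels at once.
#if 0
static uint8_t UnpremultiplyComponent_Scalar(uint8_t c, uint8_t a)
{
  // The low 16 bits of the entry hold the fixed-point reciprocal of a.
  uint32_t q = sUnpremultiplyTable_SSE2[a] & 0xFFFF;
  // Undo the precision shift chosen when the table was built: 8 bits for
  // a < 0x20, 3 bits otherwise.
  uint32_t scale = a < 0x20 ? 0x100 : 8;
  // (c * scale * q) >> 16 approximates c * 255 / a.
  return uint8_t((c * scale * q) >> 16);
}
#endif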

// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template<bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i
UnpremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // The reciprocals are duplicated in both halves of each uint32_t entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);

  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if a < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);

  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}

template<bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                        uint8_t* aDst, int32_t aDstGap,
                        IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of unpremultiply variants here.
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Swizzle a vector of 4 pixels, swapping R and B and optionally forcing
// the alpha to opaque.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}

#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, true>(const __m128i& aSrc)
{
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
{
  return aSrc;
}
#endif

template<bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                  uint8_t* aDst, int32_t aDstGap,
                  IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of swizzle variants here.
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

} // namespace gfx
} // namespace mozilla