Line data Source code
1 : /*
2 : * Copyright 2012 The Android Open Source Project
3 : *
4 : * Use of this source code is governed by a BSD-style license that can be
5 : * found in the LICENSE file.
6 : */
7 :
8 : #include <emmintrin.h>
9 : #include "SkBitmapProcState_opts_SSE2.h"
10 : #include "SkBlitRow_opts_SSE2.h"
11 : #include "SkColorPriv.h"
12 : #include "SkColor_opts_SSE2.h"
13 : #include "SkDither.h"
14 : #include "SkMSAN.h"
15 : #include "SkUtils.h"
16 :
17 : /* SSE2 version of S32_Blend_BlitRow32()
18 : * portable version is in core/SkBlitRow_D32.cpp
19 : * Each of the count dst pixels becomes SkPMLerp(src, dst, SkAlpha255To256(alpha)). */
20 0 : void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21 : const SkPMColor* SK_RESTRICT src,
22 : int count, U8CPU alpha) {
23 0 : SkASSERT(alpha <= 255);
24 0 : if (count <= 0) {
25 0 : return;
26 : }
27 :
28 0 : uint32_t src_scale = SkAlpha255To256(alpha);
29 : // Vector path: only taken for rows of at least four pixels.
30 0 : if (count >= 4) {
31 0 : SkASSERT(((size_t)dst & 0x03) == 0);
32 0 : while (((size_t)dst & 0x0F) != 0) {
33 0 : *dst = SkPMLerp(*src, *dst, src_scale);
34 0 : src++;
35 0 : dst++;
36 0 : count--;
37 : }
38 : // dst is now 16-byte aligned; src may not be, hence the unaligned load of src below.
39 0 : const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 0 : __m128i *d = reinterpret_cast<__m128i*>(dst);
41 :
42 0 : while (count >= 4) {
43 : // Load 4 pixels each of src (unaligned load) and dest (aligned load).
44 0 : __m128i src_pixel = _mm_loadu_si128(s);
45 0 : __m128i dst_pixel = _mm_load_si128(d);
46 :
47 0 : __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
48 : _mm_store_si128(d, result);
49 0 : s++;
50 0 : d++;
51 0 : count -= 4;
52 : }
53 0 : src = reinterpret_cast<const SkPMColor*>(s);
54 0 : dst = reinterpret_cast<SkPMColor*>(d);
55 : }
56 : // Scalar tail: the pixels left over by the vector loop, or the whole row when count < 4.
57 0 : while (count > 0) {
58 0 : *dst = SkPMLerp(*src, *dst, src_scale);
59 0 : src++;
60 0 : dst++;
61 0 : count--;
62 : }
63 : }
64 : // Blends count src pixels into dst with SkBlendARGB32(src, dst, alpha); four pixels per iteration once dst is 16-byte aligned.
65 1582 : void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66 : const SkPMColor* SK_RESTRICT src,
67 : int count, U8CPU alpha) {
68 1582 : SkASSERT(alpha <= 255);
69 1582 : if (count <= 0) {
70 0 : return;
71 : }
72 : // Vector path: only taken for rows of at least four pixels.
73 1582 : if (count >= 4) {
74 2064 : while (((size_t)dst & 0x0F) != 0) {
75 736 : *dst = SkBlendARGB32(*src, *dst, alpha);
76 736 : src++;
77 736 : dst++;
78 736 : count--;
79 : }
80 : // dst is now 16-byte aligned; src may not be, hence the unaligned load of src below.
81 592 : const __m128i *s = reinterpret_cast<const __m128i*>(src);
82 592 : __m128i *d = reinterpret_cast<__m128i*>(dst);
83 14376 : while (count >= 4) {
84 : // Load 4 pixels each of src (unaligned load) and dest (aligned load).
85 6892 : __m128i src_pixel = _mm_loadu_si128(s);
86 6892 : __m128i dst_pixel = _mm_load_si128(d);
87 :
88 6892 : __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
89 : _mm_store_si128(d, result);
90 6892 : s++;
91 6892 : d++;
92 6892 : count -= 4;
93 : }
94 592 : src = reinterpret_cast<const SkPMColor*>(s);
95 592 : dst = reinterpret_cast<SkPMColor*>(d);
96 : }
97 : // Scalar tail: the pixels left over by the vector loop, or the whole row when count < 4.
98 5040 : while (count > 0) {
99 1729 : *dst = SkBlendARGB32(*src, *dst, alpha);
100 1729 : src++;
101 1729 : dst++;
102 1729 : count--;
103 : }
104 : }
105 : // Blends the single SkPMColor src into count RGB565 pixels at dst. The x and y parameters are not used.
106 0 : void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
107 0 : SkASSERT(count > 0);
108 : // Scalar-path inputs: src channels positioned at G << 24, R << 13, B << 2, and a scale built from 0xFF - alpha, reduced by >> 3.
109 0 : uint32_t src_expand = (SkGetPackedG32(src) << 24) |
110 0 : (SkGetPackedR32(src) << 13) |
111 0 : (SkGetPackedB32(src) << 2);
112 0 : unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
113 :
114 : // Check if we have enough pixels to run SIMD (eight for one vector plus those consumed reaching 16-byte alignment)
115 0 : if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
116 : __m128i* dst_wide;
117 0 : const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
118 0 : const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
119 0 : const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
120 0 : const __m128i scale_wide = _mm_set1_epi16(scale);
121 0 : const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
122 0 : const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
123 :
124 : // Align dst to an even 16 byte address (0-7 pixels)
125 0 : while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
126 0 : *dst = SkBlend32_RGB16(src_expand, *dst, scale);
127 0 : dst += 1;
128 0 : count--;
129 : }
130 : // The entry check above guarantees at least eight pixels remain, so the do/while may run its body once unconditionally.
131 0 : dst_wide = reinterpret_cast<__m128i*>(dst);
132 0 : do {
133 : // Load eight RGB565 pixels
134 0 : __m128i pixels = _mm_load_si128(dst_wide);
135 :
136 : // Mask out sub-pixels
137 0 : __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
138 0 : __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
139 0 : pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
140 0 : __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
141 :
142 : // Scale with alpha
143 0 : pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
144 0 : pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
145 0 : pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
146 :
147 : // Add src_X_wide and shift R and B down again (green is not shifted here; it is masked in place below)
148 0 : pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
149 0 : pixel_R = _mm_srli_epi16(pixel_R, 5);
150 0 : pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
151 0 : pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
152 0 : pixel_B = _mm_srli_epi16(pixel_B, 5);
153 :
154 : // Combine into RGB565 and store
155 0 : pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
156 0 : pixel_G = _mm_and_si128(pixel_G, mask_green);
157 0 : pixels = _mm_or_si128(pixel_R, pixel_G);
158 0 : pixels = _mm_or_si128(pixels, pixel_B);
159 : _mm_store_si128(dst_wide, pixels);
160 0 : count -= 8;
161 0 : dst_wide++;
162 0 : } while (count >= 8);
163 :
164 0 : dst = reinterpret_cast<uint16_t*>(dst_wide);
165 : }
166 :
167 : // Small loop to handle remaining pixels.
168 0 : while (count > 0) {
169 0 : *dst = SkBlend32_RGB16(src_expand, *dst, scale);
170 0 : dst += 1;
171 0 : count--;
172 : }
173 0 : }
174 :
175 : // The following (left) shifts cause the top 5 bits of the mask components to
176 : // line up with the corresponding components in an SkPMColor.
177 : // Note that the mask's RGB16 order may differ from the SkPMColor order.
178 : #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
179 : #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
180 : #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
181 : // Per channel: a zero shift is a no-op, a positive shift is a 32-bit left shift, and a negative shift becomes a 32-bit right shift by its magnitude.
182 : #if SK_R16x5_R32x5_SHIFT == 0
183 : #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
184 : #elif SK_R16x5_R32x5_SHIFT > 0
185 : #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
186 : #else
187 : #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
188 : #endif
189 :
190 : #if SK_G16x5_G32x5_SHIFT == 0
191 : #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
192 : #elif SK_G16x5_G32x5_SHIFT > 0
193 : #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
194 : #else
195 : #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
196 : #endif
197 :
198 : #if SK_B16x5_B32x5_SHIFT == 0
199 : #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
200 : #elif SK_B16x5_B32x5_SHIFT > 0
201 : #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
202 : #else
203 : #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
204 : #endif
205 : // Blends four dst pixels toward src through four 16-bit LCD mask values scaled by srcA; returns the four packed result pixels.
206 3234 : static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
207 : __m128i &mask, __m128i &srcA) {
208 : // In the following comments, the components of src, dst and mask are
209 : // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
210 : // by an R, G, B, or A suffix. Components of one of the four pixels that
211 : // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
212 : // example is the blue channel of the second destination pixel. Memory
213 : // layout is shown for an ARGB byte order in a color value.
214 :
215 : // src and srcA store 8-bit values interleaved with zeros.
216 : // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
217 : // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
218 : // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
219 : // mask stores 16-bit values (compressed three channels) interleaved with zeros.
220 : // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
221 : // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
222 : // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
223 :
224 : // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
225 : // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
226 6468 : __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
227 3234 : _mm_set1_epi32(0x1F << SK_R32_SHIFT));
228 :
229 : // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
230 6468 : __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
231 3234 : _mm_set1_epi32(0x1F << SK_G32_SHIFT));
232 :
233 : // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
234 3234 : __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
235 3234 : _mm_set1_epi32(0x1F << SK_B32_SHIFT));
236 :
237 : // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
238 : // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
239 : // 8-bit position
240 : // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
241 : // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
242 6468 : mask = _mm_or_si128(_mm_or_si128(r, g), b);
243 : // Note: mask is a reference parameter, so the assignment above overwrites the caller's value.
244 : // Interleave R,G,B into the lower byte of word.
245 : // i.e. split the sixteen 8-bit values from mask into two sets of eight
246 : // 16-bit values, padded by zero.
247 : __m128i maskLo, maskHi;
248 : // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
249 6468 : maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
250 : // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
251 6468 : maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
252 :
253 : // Upscale from 0..31 to 0..32
254 : // (allows to replace division by a shift further down)
255 : // Right-shift each component by 4 and add the result back to that component,
256 : // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
257 6468 : maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
258 6468 : maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
259 :
260 : // Multiply each component of maskLo and maskHi by srcA
261 6468 : maskLo = _mm_mullo_epi16(maskLo, srcA);
262 6468 : maskHi = _mm_mullo_epi16(maskHi, srcA);
263 :
264 : // Right shift mask components by 8 (divide by 256)
265 3234 : maskLo = _mm_srli_epi16(maskLo, 8);
266 3234 : maskHi = _mm_srli_epi16(maskHi, 8);
267 :
268 : // Interleave R,G,B into the lower byte of the word
269 : // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
270 6468 : __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
271 : // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
272 6468 : __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
273 :
274 : // mask = (src - dst) * mask
275 9702 : maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
276 9702 : maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
277 :
278 : // mask = (src - dst) * mask >> 5 (arithmetic shift, since src - dst may be negative)
279 3234 : maskLo = _mm_srai_epi16(maskLo, 5);
280 3234 : maskHi = _mm_srai_epi16(maskHi, 5);
281 :
282 : // Add two pixels into result.
283 : // result = dst + ((src - dst) * mask >> 5)
284 3234 : __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
285 3234 : __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
286 :
287 : // Pack into 4 32bit dst pixels.
288 : // resultLo and resultHi contain eight 16-bit components (two pixels) each.
289 : // Merge into one SSE register with sixteen 8-bit values (four pixels),
290 : // clamping to 255 if necessary.
291 3234 : return _mm_packus_epi16(resultLo, resultHi);
292 : }
293 : // Blends four dst pixels toward src through four 16-bit LCD mask values (no source-alpha scaling); the returned pixels have alpha forced to 0xFF.
294 8009 : static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
295 : __m128i &mask) {
296 : // In the following comments, the components of src, dst and mask are
297 : // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
298 : // by an R, G, B, or A suffix. Components of one of the four pixels that
299 : // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
300 : // example is the blue channel of the second destination pixel. Memory
301 : // layout is shown for an ARGB byte order in a color value.
302 :
303 : // src stores 8-bit values interleaved with zeros.
304 : // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
305 : // mask stores 16-bit values (shown as high and low bytes) interleaved with
306 : // zeros
307 : // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
308 : // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
309 :
310 : // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
311 : // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
312 16018 : __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
313 8009 : _mm_set1_epi32(0x1F << SK_R32_SHIFT));
314 :
315 : // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
316 16018 : __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
317 8009 : _mm_set1_epi32(0x1F << SK_G32_SHIFT));
318 :
319 : // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
320 8009 : __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
321 8009 : _mm_set1_epi32(0x1F << SK_B32_SHIFT));
322 :
323 : // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
324 : // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
325 : // 8-bit position
326 : // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
327 : // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
328 16018 : mask = _mm_or_si128(_mm_or_si128(r, g), b);
329 : // Note: mask is a reference parameter, so the assignment above overwrites the caller's value.
330 : // Interleave R,G,B into the lower byte of word.
331 : // i.e. split the sixteen 8-bit values from mask into two sets of eight
332 : // 16-bit values, padded by zero.
333 : __m128i maskLo, maskHi;
334 : // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
335 16018 : maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
336 : // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
337 16018 : maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
338 :
339 : // Upscale from 0..31 to 0..32
340 : // (allows to replace division by a shift further down)
341 : // Right-shift each component by 4 and add the result back to that component,
342 : // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
343 16018 : maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
344 16018 : maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
345 :
346 : // Interleave R,G,B into the lower byte of the word
347 : // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
348 16018 : __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
349 : // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
350 16018 : __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
351 :
352 : // mask = (src - dst) * mask
353 24027 : maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
354 24027 : maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
355 :
356 : // mask = (src - dst) * mask >> 5 (arithmetic shift, since src - dst may be negative)
357 8009 : maskLo = _mm_srai_epi16(maskLo, 5);
358 8009 : maskHi = _mm_srai_epi16(maskHi, 5);
359 :
360 : // Add two pixels into result.
361 : // result = dst + ((src - dst) * mask >> 5)
362 8009 : __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
363 8009 : __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
364 :
365 : // Pack into 4 32bit dst pixels and force opaque.
366 : // resultLo and resultHi contain eight 16-bit components (two pixels) each.
367 : // Merge into one SSE register with sixteen 8-bit values (four pixels),
368 : // clamping to 255 if necessary. Set alpha components to 0xFF.
369 16018 : return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
370 8009 : _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
371 : }
372 : // Blends the color src into width dst pixels through a row of 16-bit LCD mask values; the trailing unnamed SkPMColor parameter is unused.
373 1620 : void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
374 : SkColor src, int width, SkPMColor) {
375 1620 : if (width <= 0) {
376 0 : return;
377 : }
378 :
379 1620 : int srcA = SkColorGetA(src);
380 1620 : int srcR = SkColorGetR(src);
381 1620 : int srcG = SkColorGetG(src);
382 1620 : int srcB = SkColorGetB(src);
383 :
384 1620 : srcA = SkAlpha255To256(srcA);
385 : // Vector path: only taken for rows of at least four pixels.
386 1620 : if (width >= 4) {
387 1620 : SkASSERT(((size_t)dst & 0x03) == 0);
388 6350 : while (((size_t)dst & 0x0F) != 0) {
389 2365 : *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
390 2365 : mask++;
391 2365 : dst++;
392 2365 : width--;
393 : }
394 : // dst is now 16-byte aligned, so aligned loads and stores are used for it below.
395 1620 : __m128i *d = reinterpret_cast<__m128i*>(dst);
396 : // Set alpha to 0xFF and replicate source four times in SSE register.
397 3240 : __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
398 : // Interleave with zeros to get two sets of four 16-bit values.
399 3240 : src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
400 : // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
401 : // Set srcA_sse to contain eight copies of srcA, padded with zero.
402 3240 : __m128i srcA_sse = _mm_set1_epi16(srcA);
403 9940 : while (width >= 4) {
404 : // Load four destination pixels into dst_sse.
405 4160 : __m128i dst_sse = _mm_load_si128(d);
406 : // Load four 16-bit masks into lower half of mask_sse.
407 4160 : __m128i mask_sse = _mm_loadl_epi64(
408 4160 : reinterpret_cast<const __m128i*>(mask));
409 :
410 : // Check whether masks are equal to 0 and get the highest bit
411 : // of each byte of result, if masks are all zero, we will get
412 : // pack_cmp to 0xFFFF
413 8320 : int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
414 4160 : _mm_setzero_si128()));
415 :
416 : // if mask pixels are not all zero, we will blend the dst pixels
417 4160 : if (pack_cmp != 0xFFFF) {
418 : // Unpack 4 16bit mask pixels to
419 : // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
420 : // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
421 6468 : mask_sse = _mm_unpacklo_epi16(mask_sse,
422 : _mm_setzero_si128());
423 :
424 : // Process 4 32bit dst pixels
425 : __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
426 3234 : mask_sse, srcA_sse);
427 : _mm_store_si128(d, result);
428 : }
429 :
430 4160 : d++;
431 4160 : mask += 4;
432 4160 : width -= 4;
433 : }
434 :
435 1620 : dst = reinterpret_cast<SkPMColor*>(d);
436 : }
437 : // Scalar tail: the pixels left over by the vector loop, or the whole row when width < 4.
438 6514 : while (width > 0) {
439 2447 : *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
440 2447 : mask++;
441 2447 : dst++;
442 2447 : width--;
443 : }
444 : }
445 : // Like the non-opaque LCD16 row blit but without source alpha; opaqueDst is only forwarded to the scalar SkBlendLCD16Opaque calls.
446 3913 : void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
447 : SkColor src, int width, SkPMColor opaqueDst) {
448 3913 : if (width <= 0) {
449 0 : return;
450 : }
451 :
452 3913 : int srcR = SkColorGetR(src);
453 3913 : int srcG = SkColorGetG(src);
454 3913 : int srcB = SkColorGetB(src);
455 : // Vector path: only taken for rows of at least four pixels.
456 3913 : if (width >= 4) {
457 3913 : SkASSERT(((size_t)dst & 0x03) == 0);
458 15813 : while (((size_t)dst & 0x0F) != 0) {
459 5950 : *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
460 5950 : mask++;
461 5950 : dst++;
462 5950 : width--;
463 : }
464 : // dst is now 16-byte aligned, so aligned loads and stores are used for it below.
465 3913 : __m128i *d = reinterpret_cast<__m128i*>(dst);
466 : // Set alpha to 0xFF and replicate source four times in SSE register.
467 7826 : __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
468 : // Interleave with zeros to get two sets of four 16-bit values.
469 : // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
470 7826 : src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
471 24721 : while (width >= 4) {
472 : // Load four destination pixels into dst_sse.
473 10404 : __m128i dst_sse = _mm_load_si128(d);
474 : // Load four 16-bit masks into lower half of mask_sse.
475 10404 : __m128i mask_sse = _mm_loadl_epi64(
476 10404 : reinterpret_cast<const __m128i*>(mask));
477 :
478 : // Check whether masks are equal to 0 and get the highest bit
479 : // of each byte of result, if masks are all zero, we will get
480 : // pack_cmp to 0xFFFF
481 20808 : int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
482 10404 : _mm_setzero_si128()));
483 :
484 : // if mask pixels are not all zero, we will blend the dst pixels
485 10404 : if (pack_cmp != 0xFFFF) {
486 : // Unpack 4 16bit mask pixels to
487 : // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
488 : // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
489 16018 : mask_sse = _mm_unpacklo_epi16(mask_sse,
490 : _mm_setzero_si128());
491 :
492 : // Process 4 32bit dst pixels
493 : __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
494 8009 : mask_sse);
495 : _mm_store_si128(d, result);
496 : }
497 :
498 10404 : d++;
499 10404 : mask += 4;
500 10404 : width -= 4;
501 : }
502 :
503 3913 : dst = reinterpret_cast<SkPMColor*>(d);
504 : }
505 : // Scalar tail: the pixels left over by the vector loop, or the whole row when width < 4.
506 15635 : while (width > 0) {
507 5861 : *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
508 5861 : mask++;
509 5861 : dst++;
510 5861 : width--;
511 : }
512 : }
513 :
514 : /* SSE2 version of S32_D565_Opaque()
515 : * portable version is in core/SkBlitRow_D16.cpp
516 : * Converts count 32-bit src pixels to 16-bit dst pixels; alpha is asserted to be 255. */
517 0 : void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
518 : const SkPMColor* SK_RESTRICT src, int count,
519 : U8CPU alpha, int /*x*/, int /*y*/) {
520 0 : SkASSERT(255 == alpha);
521 :
522 0 : if (count <= 0) {
523 0 : return;
524 : }
525 : // Vector path: align dst with scalar conversions, then convert eight pixels per iteration.
526 0 : if (count >= 8) {
527 0 : while (((size_t)dst & 0x0F) != 0) {
528 0 : SkPMColor c = *src++;
529 0 : SkPMColorAssert(c);
530 :
531 0 : *dst++ = SkPixel32ToPixel16_ToU16(c);
532 0 : count--;
533 : }
534 :
535 0 : const __m128i* s = reinterpret_cast<const __m128i*>(src);
536 0 : __m128i* d = reinterpret_cast<__m128i*>(dst);
537 :
538 0 : while (count >= 8) {
539 : // Load 8 pixels of src (two unaligned 4-pixel loads).
540 0 : __m128i src_pixel1 = _mm_loadu_si128(s++);
541 0 : __m128i src_pixel2 = _mm_loadu_si128(s++);
542 :
543 0 : __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
544 0 : _mm_store_si128(d++, d_pixel);
545 0 : count -= 8;
546 : }
547 0 : src = reinterpret_cast<const SkPMColor*>(s);
548 0 : dst = reinterpret_cast<uint16_t*>(d);
549 : }
550 : // Scalar tail: fewer than eight pixels can remain at this point.
551 0 : if (count > 0) {
552 0 : do {
553 0 : SkPMColor c = *src++;
554 0 : SkPMColorAssert(c);
555 0 : *dst++ = SkPixel32ToPixel16_ToU16(c);
556 : } while (--count != 0);
557 : }
558 : }
559 :
560 : /* SSE2 version of S32A_D565_Opaque()
561 : * portable version is in core/SkBlitRow_D16.cpp
562 : * Composites count 32-bit src pixels over 16-bit dst pixels; alpha is asserted to be 255. */
563 0 : void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
564 : const SkPMColor* SK_RESTRICT src,
565 : int count, U8CPU alpha, int /*x*/, int /*y*/) {
566 0 : SkASSERT(255 == alpha);
567 :
568 0 : if (count <= 0) {
569 0 : return;
570 : }
571 :
572 0 : if (count >= 8) {
573 : // Advance with scalar code until dst is 16-byte aligned.
574 0 : while (((size_t)dst & 0x0F) != 0) {
575 0 : SkPMColor c = *src++;
576 0 : if (c) {
577 0 : *dst = SkSrcOver32To16(c, *dst);
578 : }
579 0 : dst += 1;
580 0 : count--;
581 : }
582 :
583 0 : const __m128i* s = reinterpret_cast<const __m128i*>(src);
584 0 : __m128i* d = reinterpret_cast<__m128i*>(dst);
585 0 : __m128i var255 = _mm_set1_epi16(255);
586 0 : __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
587 0 : __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
588 0 : __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
589 :
590 0 : while (count >= 8) {
591 : // Load 8 pixels of src.
592 0 : __m128i src_pixel1 = _mm_loadu_si128(s++);
593 0 : __m128i src_pixel2 = _mm_loadu_si128(s++);
594 :
595 : // Check whether src pixels are equal to 0 and get the highest bit
596 : // of each byte of result, if src pixels are all zero, src_cmp1 and
597 : // src_cmp2 will be 0xFFFF. In that case dst is left untouched and only d and count advance.
598 0 : int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
599 0 : _mm_setzero_si128()));
600 0 : int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
601 0 : _mm_setzero_si128()));
602 0 : if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
603 0 : d++;
604 0 : count -= 8;
605 0 : continue;
606 : }
607 :
608 : // Load 8 pixels of dst.
609 0 : __m128i dst_pixel = _mm_load_si128(d);
610 :
611 : // Extract A from src: shift the channel to the top byte, back down to the low byte, then pack both halves into eight 16-bit lanes.
612 0 : __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
613 0 : sa1 = _mm_srli_epi32(sa1, 24);
614 0 : __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
615 0 : sa2 = _mm_srli_epi32(sa2, 24);
616 0 : __m128i sa = _mm_packs_epi32(sa1, sa2);
617 :
618 : // Extract R from src.
619 0 : __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
620 0 : sr1 = _mm_srli_epi32(sr1, 24);
621 0 : __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
622 0 : sr2 = _mm_srli_epi32(sr2, 24);
623 0 : __m128i sr = _mm_packs_epi32(sr1, sr2);
624 :
625 : // Extract G from src.
626 0 : __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
627 0 : sg1 = _mm_srli_epi32(sg1, 24);
628 0 : __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
629 0 : sg2 = _mm_srli_epi32(sg2, 24);
630 0 : __m128i sg = _mm_packs_epi32(sg1, sg2);
631 :
632 : // Extract B from src.
633 0 : __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
634 0 : sb1 = _mm_srli_epi32(sb1, 24);
635 0 : __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
636 0 : sb2 = _mm_srli_epi32(sb2, 24);
637 0 : __m128i sb = _mm_packs_epi32(sb1, sb2);
638 :
639 : // Extract R G B from dst.
640 0 : __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
641 0 : dr = _mm_and_si128(dr, r16_mask);
642 0 : __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
643 0 : dg = _mm_and_si128(dg, g16_mask);
644 0 : __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
645 0 : db = _mm_and_si128(db, b16_mask);
646 :
647 0 : __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
648 :
649 : // Calculate R G B of result.
650 : // Original algorithm is in SkSrcOver32To16().
651 0 : dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
652 0 : dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
653 0 : dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
654 0 : dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
655 0 : db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
656 0 : db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
657 :
658 : // Pack R G B into 16-bit color.
659 0 : __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
660 :
661 : // Store 8 16-bit colors in dst.
662 0 : _mm_store_si128(d++, d_pixel);
663 0 : count -= 8;
664 : }
665 :
666 0 : src = reinterpret_cast<const SkPMColor*>(s);
667 0 : dst = reinterpret_cast<uint16_t*>(d);
668 : }
669 : // Scalar tail: fewer than eight pixels can remain at this point; zero src pixels leave dst unchanged.
670 0 : if (count > 0) {
671 0 : do {
672 0 : SkPMColor c = *src++;
673 0 : SkPMColorAssert(c);
674 0 : if (c) {
675 0 : *dst = SkSrcOver32To16(c, *dst);
676 : }
677 0 : dst += 1;
678 : } while (--count != 0);
679 : }
680 : }
681 : // Converts count 32-bit src pixels to dithered 16-bit dst pixels; x and y select the dither value, and alpha is asserted to be 255.
682 0 : void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
683 : const SkPMColor* SK_RESTRICT src,
684 : int count, U8CPU alpha, int x, int y) {
685 0 : SkASSERT(255 == alpha);
686 :
687 0 : if (count <= 0) {
688 0 : return;
689 : }
690 : // Vector path: align dst with scalar dithered conversions, then convert eight pixels per iteration.
691 0 : if (count >= 8) {
692 0 : while (((size_t)dst & 0x0F) != 0) {
693 0 : DITHER_565_SCAN(y);
694 0 : SkPMColor c = *src++;
695 0 : SkPMColorAssert(c);
696 :
697 0 : unsigned dither = DITHER_VALUE(x);
698 0 : *dst++ = SkDitherRGB32To565(c, dither);
699 0 : DITHER_INC_X(x);
700 0 : count--;
701 : }
702 : // Build one vector of eight dither values: the four entries starting at column x of row (y & 3), repeated twice.
703 : unsigned short dither_value[8];
704 : __m128i dither;
705 : #ifdef ENABLE_DITHER_MATRIX_4X4
706 : const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
707 : dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
708 : dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
709 : dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
710 : dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
711 : #else
712 0 : const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
713 0 : dither_value[0] = dither_value[4] = (dither_scan
714 0 : >> (((x) & 3) << 2)) & 0xF;
715 0 : dither_value[1] = dither_value[5] = (dither_scan
716 0 : >> (((x + 1) & 3) << 2)) & 0xF;
717 0 : dither_value[2] = dither_value[6] = (dither_scan
718 0 : >> (((x + 2) & 3) << 2)) & 0xF;
719 0 : dither_value[3] = dither_value[7] = (dither_scan
720 0 : >> (((x + 3) & 3) << 2)) & 0xF;
721 : #endif
722 0 : dither = _mm_loadu_si128((__m128i*) dither_value);
723 :
724 0 : const __m128i* s = reinterpret_cast<const __m128i*>(src);
725 0 : __m128i* d = reinterpret_cast<__m128i*>(dst);
726 :
727 0 : while (count >= 8) {
728 : // Load 8 pixels of src.
729 0 : __m128i src_pixel1 = _mm_loadu_si128(s++);
730 0 : __m128i src_pixel2 = _mm_loadu_si128(s++);
731 :
732 : // Extract R from src.
733 0 : __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
734 0 : sr1 = _mm_srli_epi32(sr1, 24);
735 0 : __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
736 0 : sr2 = _mm_srli_epi32(sr2, 24);
737 0 : __m128i sr = _mm_packs_epi32(sr1, sr2);
738 :
739 : // SkDITHER_R32To565(sr, dither)
740 0 : __m128i sr_offset = _mm_srli_epi16(sr, 5);
741 0 : sr = _mm_add_epi16(sr, dither);
742 0 : sr = _mm_sub_epi16(sr, sr_offset);
743 0 : sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
744 :
745 : // Extract G from src.
746 0 : __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
747 0 : sg1 = _mm_srli_epi32(sg1, 24);
748 0 : __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
749 0 : sg2 = _mm_srli_epi32(sg2, 24);
750 0 : __m128i sg = _mm_packs_epi32(sg1, sg2);
751 :
752 : // SkDITHER_G32To565(sg, dither): green uses half the dither value and an offset of sg >> 6
753 0 : __m128i sg_offset = _mm_srli_epi16(sg, 6);
754 0 : sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
755 0 : sg = _mm_sub_epi16(sg, sg_offset);
756 0 : sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
757 :
758 : // Extract B from src.
759 0 : __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
760 0 : sb1 = _mm_srli_epi32(sb1, 24);
761 0 : __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
762 0 : sb2 = _mm_srli_epi32(sb2, 24);
763 0 : __m128i sb = _mm_packs_epi32(sb1, sb2);
764 :
765 : // SkDITHER_B32To565(sb, dither)
766 0 : __m128i sb_offset = _mm_srli_epi16(sb, 5);
767 0 : sb = _mm_add_epi16(sb, dither);
768 0 : sb = _mm_sub_epi16(sb, sb_offset);
769 0 : sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
770 :
771 : // Pack and store 16-bit dst pixel.
772 0 : __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
773 0 : _mm_store_si128(d++, d_pixel);
774 : // x advances by a multiple of 4, so (x & 3) is unchanged and the precomputed dither vector stays in phase.
775 0 : count -= 8;
776 0 : x += 8;
777 : }
778 :
779 0 : src = reinterpret_cast<const SkPMColor*>(s);
780 0 : dst = reinterpret_cast<uint16_t*>(d);
781 : }
782 : // Scalar tail: fewer than eight pixels can remain at this point.
783 0 : if (count > 0) {
784 0 : DITHER_565_SCAN(y);
785 0 : do {
786 0 : SkPMColor c = *src++;
787 0 : SkPMColorAssert(c);
788 :
789 0 : unsigned dither = DITHER_VALUE(x);
790 0 : *dst++ = SkDitherRGB32To565(c, dither);
791 0 : DITHER_INC_X(x);
792 : } while (--count != 0);
793 : }
794 : }
795 :
796 : /* SSE2 version of S32A_D565_Opaque_Dither()
797 : * portable version is in core/SkBlitRow_D16.cpp
798 : */
799 0 : void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
800 : const SkPMColor* SK_RESTRICT src,
801 : int count, U8CPU alpha, int x, int y) {
802 0 : SkASSERT(255 == alpha);
803 :
804 0 : if (count <= 0) {
805 0 : return;
806 : }
807 :
808 0 : if (count >= 8) {
809 0 : while (((size_t)dst & 0x0F) != 0) {
810 0 : DITHER_565_SCAN(y);
811 0 : SkPMColor c = *src++;
812 0 : SkPMColorAssert(c);
813 0 : if (c) {
814 0 : unsigned a = SkGetPackedA32(c);
815 :
816 0 : int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
817 :
818 0 : unsigned sr = SkGetPackedR32(c);
819 0 : unsigned sg = SkGetPackedG32(c);
820 0 : unsigned sb = SkGetPackedB32(c);
821 0 : sr = SkDITHER_R32_FOR_565(sr, d);
822 0 : sg = SkDITHER_G32_FOR_565(sg, d);
823 0 : sb = SkDITHER_B32_FOR_565(sb, d);
824 :
825 0 : uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
826 0 : uint32_t dst_expanded = SkExpand_rgb_16(*dst);
827 0 : dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
828 : // now src and dst expanded are in g:11 r:10 x:1 b:10
829 0 : *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
830 : }
831 0 : dst += 1;
832 0 : DITHER_INC_X(x);
833 0 : count--;
834 : }
835 :
836 : unsigned short dither_value[8];
837 : __m128i dither, dither_cur;
838 : #ifdef ENABLE_DITHER_MATRIX_4X4
839 : const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
840 : dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
841 : dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
842 : dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
843 : dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
844 : #else
845 0 : const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
846 0 : dither_value[0] = dither_value[4] = (dither_scan
847 0 : >> (((x) & 3) << 2)) & 0xF;
848 0 : dither_value[1] = dither_value[5] = (dither_scan
849 0 : >> (((x + 1) & 3) << 2)) & 0xF;
850 0 : dither_value[2] = dither_value[6] = (dither_scan
851 0 : >> (((x + 2) & 3) << 2)) & 0xF;
852 0 : dither_value[3] = dither_value[7] = (dither_scan
853 0 : >> (((x + 3) & 3) << 2)) & 0xF;
854 : #endif
855 0 : dither = _mm_loadu_si128((__m128i*) dither_value);
856 :
857 0 : const __m128i* s = reinterpret_cast<const __m128i*>(src);
858 0 : __m128i* d = reinterpret_cast<__m128i*>(dst);
859 0 : __m128i var256 = _mm_set1_epi16(256);
860 0 : __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
861 0 : __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
862 0 : __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
863 :
864 0 : while (count >= 8) {
865 : // Load 8 pixels of src and dst.
866 0 : __m128i src_pixel1 = _mm_loadu_si128(s++);
867 0 : __m128i src_pixel2 = _mm_loadu_si128(s++);
868 0 : __m128i dst_pixel = _mm_load_si128(d);
869 :
870 : // Extract A from src.
871 0 : __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
872 0 : sa1 = _mm_srli_epi32(sa1, 24);
873 0 : __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
874 0 : sa2 = _mm_srli_epi32(sa2, 24);
875 0 : __m128i sa = _mm_packs_epi32(sa1, sa2);
876 :
877 : // Calculate current dither value.
878 0 : dither_cur = _mm_mullo_epi16(dither,
879 0 : _mm_add_epi16(sa, _mm_set1_epi16(1)));
880 0 : dither_cur = _mm_srli_epi16(dither_cur, 8);
881 :
882 : // Extract R from src.
883 0 : __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
884 0 : sr1 = _mm_srli_epi32(sr1, 24);
885 0 : __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
886 0 : sr2 = _mm_srli_epi32(sr2, 24);
887 0 : __m128i sr = _mm_packs_epi32(sr1, sr2);
888 :
889 : // SkDITHER_R32_FOR_565(sr, d)
890 0 : __m128i sr_offset = _mm_srli_epi16(sr, 5);
891 0 : sr = _mm_add_epi16(sr, dither_cur);
892 0 : sr = _mm_sub_epi16(sr, sr_offset);
893 :
894 : // Expand sr.
895 0 : sr = _mm_slli_epi16(sr, 2);
896 :
897 : // Extract G from src.
898 0 : __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
899 0 : sg1 = _mm_srli_epi32(sg1, 24);
900 0 : __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
901 0 : sg2 = _mm_srli_epi32(sg2, 24);
902 0 : __m128i sg = _mm_packs_epi32(sg1, sg2);
903 :
904 : // sg = SkDITHER_G32_FOR_565(sg, d).
905 0 : __m128i sg_offset = _mm_srli_epi16(sg, 6);
906 0 : sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
907 0 : sg = _mm_sub_epi16(sg, sg_offset);
908 :
909 : // Expand sg.
910 0 : sg = _mm_slli_epi16(sg, 3);
911 :
912 : // Extract B from src.
913 0 : __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
914 0 : sb1 = _mm_srli_epi32(sb1, 24);
915 0 : __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
916 0 : sb2 = _mm_srli_epi32(sb2, 24);
917 0 : __m128i sb = _mm_packs_epi32(sb1, sb2);
918 :
919 : // sb = SkDITHER_B32_FOR_565(sb, d).
920 0 : __m128i sb_offset = _mm_srli_epi16(sb, 5);
921 0 : sb = _mm_add_epi16(sb, dither_cur);
922 0 : sb = _mm_sub_epi16(sb, sb_offset);
923 :
924 : // Expand sb.
925 0 : sb = _mm_slli_epi16(sb, 2);
926 :
927 : // Extract R G B from dst.
928 0 : __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
929 0 : dr = _mm_and_si128(dr, r16_mask);
930 0 : __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
931 0 : dg = _mm_and_si128(dg, g16_mask);
932 0 : __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
933 0 : db = _mm_and_si128(db, b16_mask);
934 :
935 : // SkAlpha255To256(255 - a) >> 3
936 0 : __m128i isa = _mm_sub_epi16(var256, sa);
937 0 : isa = _mm_srli_epi16(isa, 3);
938 :
939 0 : dr = _mm_mullo_epi16(dr, isa);
940 0 : dr = _mm_add_epi16(dr, sr);
941 0 : dr = _mm_srli_epi16(dr, 5);
942 :
943 0 : dg = _mm_mullo_epi16(dg, isa);
944 0 : dg = _mm_add_epi16(dg, sg);
945 0 : dg = _mm_srli_epi16(dg, 5);
946 :
947 0 : db = _mm_mullo_epi16(db, isa);
948 0 : db = _mm_add_epi16(db, sb);
949 0 : db = _mm_srli_epi16(db, 5);
950 :
951 : // Package and store dst pixel.
952 0 : __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
953 0 : _mm_store_si128(d++, d_pixel);
954 :
955 0 : count -= 8;
956 0 : x += 8;
957 : }
958 :
959 0 : src = reinterpret_cast<const SkPMColor*>(s);
960 0 : dst = reinterpret_cast<uint16_t*>(d);
961 : }
962 :
963 0 : if (count > 0) {
964 0 : DITHER_565_SCAN(y);
965 0 : do {
966 0 : SkPMColor c = *src++;
967 0 : SkPMColorAssert(c);
968 0 : if (c) {
969 0 : unsigned a = SkGetPackedA32(c);
970 :
971 0 : int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
972 :
973 0 : unsigned sr = SkGetPackedR32(c);
974 0 : unsigned sg = SkGetPackedG32(c);
975 0 : unsigned sb = SkGetPackedB32(c);
976 0 : sr = SkDITHER_R32_FOR_565(sr, d);
977 0 : sg = SkDITHER_G32_FOR_565(sg, d);
978 0 : sb = SkDITHER_B32_FOR_565(sb, d);
979 :
980 0 : uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
981 0 : uint32_t dst_expanded = SkExpand_rgb_16(*dst);
982 0 : dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
983 : // now src and dst expanded are in g:11 r:10 x:1 b:10
984 0 : *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
985 : }
986 0 : dst += 1;
987 0 : DITHER_INC_X(x);
988 : } while (--count != 0);
989 : }
990 : }
|