Line data Source code
1 : /*
2 : * Copyright 2016 Google Inc.
3 : *
4 : * Use of this source code is governed by a BSD-style license that can be
5 : * found in the LICENSE file.
6 : */
7 :
8 : #ifndef SkSwizzler_opts_DEFINED
9 : #define SkSwizzler_opts_DEFINED
10 :
11 : #include "SkColorPriv.h"
12 :
13 : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
14 : #include <immintrin.h>
15 : #elif defined(SK_ARM_HAS_NEON)
16 : #include <arm_neon.h>
17 : #endif
18 :
19 : namespace SK_OPTS_NS {
20 :
// Premultiply RGBA pixels in place-order (memory layout R,G,B,A on
// little-endian): each color channel becomes round(c * a / 255) via the
// integer form (c*a + 127) / 255. Alpha is preserved; channel order is kept.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint32_t a = px >> 24;
        const uint32_t r = (((px >>  0) & 0xFF) * a + 127) / 255;
        const uint32_t g = (((px >>  8) & 0xFF) * a + 127) / 255;
        const uint32_t b = (((px >> 16) & 0xFF) * a + 127) / 255;
        dst[i] = (a << 24) | (b << 16) | (g << 8) | (r << 0);
    }
}
37 :
// Premultiply RGBA pixels and simultaneously swap R and B, producing
// BGRA memory order. Rounded scale: (c*a + 127) / 255.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint32_t a = px >> 24;
        const uint32_t r = (((px >>  0) & 0xFF) * a + 127) / 255;
        const uint32_t g = (((px >>  8) & 0xFF) * a + 127) / 255;
        const uint32_t b = (((px >> 16) & 0xFF) * a + 127) / 255;
        // Note r lands in the "b" slot and vice versa: that is the swap.
        dst[i] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
    }
}
54 :
// Swap the R and B channels of each pixel (RGBA <-> BGRA memory order).
// Alpha and green stay in place; no premultiplication is done.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        dst[i] = (px & 0xFF00FF00u)          // keep A and G
               | ((px & 0x00FF0000u) >> 16)  // B moves down
               | ((px & 0x000000FFu) << 16); // R moves up
    }
}
68 :
// Expand packed 24-bit RGB to 32-bit RGBA with an opaque (0xFF) alpha,
// keeping the channel order.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* rgb = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, rgb += 3) {
        dst[i] = 0xFF000000u
               | (uint32_t)rgb[2] << 16   // b
               | (uint32_t)rgb[1] <<  8   // g
               | (uint32_t)rgb[0] <<  0;  // r
    }
}
82 :
// Expand packed 24-bit RGB to 32-bit BGRA (R and B swapped) with an
// opaque (0xFF) alpha.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* rgb = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, rgb += 3) {
        dst[i] = 0xFF000000u
               | (uint32_t)rgb[0] << 16   // r, swapped into the high color slot
               | (uint32_t)rgb[1] <<  8   // g
               | (uint32_t)rgb[2] <<  0;  // b
    }
}
96 :
// Expand 8-bit grayscale to 32-bit RGBA: replicate the gray value into
// all three color channels and set alpha to opaque.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t g = src[i];
        dst[i] = 0xFF000000u | (g << 16) | (g << 8) | g;
    }
}
106 :
// Expand gray+alpha pairs to 32-bit RGBA: gray replicated into R, G, B;
// alpha carried through unmodified (unpremultiplied output).
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* ga = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, ga += 2) {
        const uint32_t g = ga[0];
        const uint32_t a = ga[1];
        // g * 0x010101 == g<<16 | g<<8 | g.
        dst[i] = (a << 24) | (g * 0x010101u);
    }
}
119 :
// Expand gray+alpha pairs to 32-bit premultiplied RGBA: gray is scaled
// by alpha with rounding, (g*a + 127) / 255, then replicated into R, G, B.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* ga = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, ga += 2) {
        const uint32_t a = ga[1];
        const uint32_t g = (ga[0] * a + 127) / 255;
        // g * 0x010101 == g<<16 | g<<8 | g.
        dst[i] = (a << 24) | (g * 0x010101u);
    }
}
133 :
// Convert inverted CMYK (as stored by JPEG/Adobe: channels pre-inverted)
// to opaque RGBA. Each color is the rounded product of its CMY component
// with K: (x*k + 127) / 255.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint32_t k = px >> 24;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        const uint32_t b = (((px >> 16) & 0xFF) * k + 127) / 255;  // y*k
        const uint32_t g = (((px >>  8) & 0xFF) * k + 127) / 255;  // m*k
        const uint32_t r = (((px >>  0) & 0xFF) * k + 127) / 255;  // c*k
        dst[i] = 0xFF000000u | (b << 16) | (g << 8) | (r << 0);
    }
}
151 :
// Same conversion as inverted_CMYK_to_RGB1_portable, but the resulting
// R and B land in swapped slots (BGRA memory order), alpha opaque.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint32_t k = px >> 24;
        const uint32_t b = (((px >> 16) & 0xFF) * k + 127) / 255;  // y*k
        const uint32_t g = (((px >>  8) & 0xFF) * k + 127) / 255;  // m*k
        const uint32_t r = (((px >>  0) & 0xFF) * k + 127) / 255;  // c*k
        dst[i] = 0xFF000000u | (r << 16) | (g << 8) | (b << 0);
    }
}
168 :
169 : #if defined(SK_ARM_HAS_NEON)
170 :
// Rounded divide by 255, (x + 127) / 255.
// Input lanes hold products of two bytes, so x <= 255*255; output narrows
// back to 8 bits per lane.
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
194 :
// Scale a byte by another, (x * y + 127) / 255, lane-wise over 8 bytes.
// The u8 x u8 widening multiply cannot overflow the u16 lanes.
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}
199 :
// Premultiply RGBA pixels 8 at a time with NEON, optionally swapping the
// R and B channels (kSwapRB). vld4/vst4 deinterleave/reinterleave the four
// channels into separate 8-lane registers. Tail of [0,8) pixels falls back
// to the portable code.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
237 :
// Premultiply, keeping channel order.
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

// Premultiply and swap R/B.
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
245 :
// Swap R and B channels (no premultiply), 16 then 8 pixels at a time with
// NEON; the remaining [0,8) pixels go through the portable code.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
278 :
// Expand packed 24-bit RGB to 32-bit with opaque alpha, optionally swapping
// R and B (kSwapRB). Processes 16 then 8 pixels per iteration via NEON
// 3-plane loads; the [0,8) tail uses the portable code.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
332 :
// RGB -> RGBA with opaque alpha, channel order preserved.
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

// RGB -> BGRA with opaque alpha, R/B swapped.
static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
340 :
// Expand 8-bit grayscale to opaque 32-bit RGBA by replicating the gray
// plane into all three color channels. 16- then 8-pixel NEON strides;
// the [0,8) tail uses the portable code.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}
381 :
// Expand gray+alpha pairs to 32-bit RGBA, optionally premultiplying the
// gray by alpha first (kPremul). 16- then 8-pixel NEON strides; [0,8)
// tail handled by the portable code.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            // scale() works on 8-lane halves, so split, scale, and rejoin.
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}
436 :
// Gray+alpha -> unpremultiplied RGBA.
static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    expand_grayA<false>(dst, src, count);
}

// Gray+alpha -> premultiplied RGBA.
static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    expand_grayA<true>(dst, src, count);
}
444 :
// Output channel orders for the inverted-CMYK converters.
enum Format { kRGB1, kBGR1 };

// Convert inverted CMYK to opaque RGBA/BGRA, 8 pixels at a time with NEON.
// Each color is scale(cmy, k); alpha is forced to 0xFF. Tail of [0,8)
// pixels falls back to the portable code.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

// Inverted CMYK -> opaque RGBA.
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

// Inverted CMYK -> opaque BGRA.
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
492 :
493 : #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
494 :
// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
// Computes the rounded (x*y + 127) / 255 per lane.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}
504 :
// Premultiply RGBA pixels with SSSE3, optionally swapping R and B
// (kSwapRB). Works on 8 pixels per loop, then one 4-pixel step (padding
// the upper half with zeros), then the portable code for the [0,4) tail.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Premultiplies the 8 pixels held across *lo and *hi in place.
    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);           // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);           // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels remain: process them in lo, with hi as junk zeros.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

// Premultiply, keeping channel order.
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

// Premultiply and swap R/B.
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
581 :
// Swap R and B channels (no premultiply) with a single SSSE3 byte shuffle
// per 4 pixels; the [0,4) tail uses the portable code.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    // Per-pixel byte order 0,1,2,3 -> 2,1,0,3.
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
598 :
// Expand packed 24-bit RGB to 32-bit with opaque alpha via an SSSE3
// shuffle, optionally swapping R and B (kSwapRB). Each 16-byte load holds
// 5 1/3 pixels; only the first 4 are used, so count >= 6 guarantees the
// load stays inside the source buffer. Tail of [0,4) pixels (plus the
// 4-or-5 leftover once count drops below 6) uses the portable code.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

// RGB -> RGBA with opaque alpha, channel order preserved.
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

// RGB -> BGRA with opaque alpha, R/B swapped.
static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
641 :
// Expand 8-bit grayscale to opaque 32-bit RGBA, 16 pixels at a time, by
// interleaving gray with itself and with 0xFF alphas. Tail of [0,16)
// pixels uses the portable code.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        // Pairwise interleave: gg pairs and ga pairs for low/high halves.
        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        // Interleave 16-bit gg/ga units into full g,g,g,a pixels.
        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}
671 :
// Expand gray+alpha pairs to unpremultiplied 32-bit RGBA, 8 pixels at a
// time. Each 16-bit lane of the load holds one (g,a) pair; gg duplicates
// the gray, then 16-bit interleaves produce g,g,g,a pixels. Tail of
// [0,8) pixels uses the portable code.
static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        // Duplicate the low (gray) byte of each lane: (g,a) -> (g,g).
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}
693 :
// Expand gray+alpha pairs to premultiplied 32-bit RGBA, 8 pixels at a
// time: split each 16-bit (g,a) lane into g and a, scale g by a with
// rounding, then rebuild g,g,g,a pixels. Tail of [0,8) pixels uses the
// portable code.
static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));


        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}
722 :
// Output channel orders for the inverted-CMYK converters.
enum Format { kRGB1, kBGR1 };

// Convert inverted CMYK to opaque RGBA/BGRA with SSSE3, using the same
// planar-swizzle strategy as premul_should_swapRB: 8 pixels per loop, one
// 4-pixel step, then the portable code for the [0,4) tail.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Converts the 8 pixels held across *lo and *hi in place.
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                      // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));         // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                        // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                        // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels remain: process them in lo, with hi as junk zeros.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

// Inverted CMYK -> opaque RGBA.
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

// Inverted CMYK -> opaque BGRA.
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
799 :
800 : #else
801 :
802 0 : static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
803 0 : RGBA_to_rgbA_portable(dst, src, count);
804 0 : }
805 :
806 0 : static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
807 0 : RGBA_to_bgrA_portable(dst, src, count);
808 0 : }
809 :
810 0 : static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
811 0 : RGBA_to_BGRA_portable(dst, src, count);
812 0 : }
813 :
814 0 : static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
815 0 : RGB_to_RGB1_portable(dst, src, count);
816 0 : }
817 :
818 0 : static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
819 0 : RGB_to_BGR1_portable(dst, src, count);
820 0 : }
821 :
822 0 : static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
823 0 : gray_to_RGB1_portable(dst, src, count);
824 0 : }
825 :
826 0 : static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
827 0 : grayA_to_RGBA_portable(dst, src, count);
828 0 : }
829 :
830 0 : static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
831 0 : grayA_to_rgbA_portable(dst, src, count);
832 0 : }
833 :
834 0 : static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
835 0 : inverted_CMYK_to_RGB1_portable(dst, src, count);
836 0 : }
837 :
838 0 : static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
839 0 : inverted_CMYK_to_BGR1_portable(dst, src, count);
840 0 : }
841 :
842 : #endif
843 :
844 : }
845 :
846 : #endif // SkSwizzler_opts_DEFINED
|