LCOV - code coverage report
Current view: top level - gfx/skia/skia/src/opts - SkSwizzler_opts.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 341 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 51 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright 2016 Google Inc.
       3             :  *
       4             :  * Use of this source code is governed by a BSD-style license that can be
       5             :  * found in the LICENSE file.
       6             :  */
       7             : 
       8             : #ifndef SkSwizzler_opts_DEFINED
       9             : #define SkSwizzler_opts_DEFINED
      10             : 
      11             : #include "SkColorPriv.h"
      12             : 
      13             : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
      14             :     #include <immintrin.h>
      15             : #elif defined(SK_ARM_HAS_NEON)
      16             :     #include <arm_neon.h>
      17             : #endif
      18             : 
      19             : namespace SK_OPTS_NS {
      20             : 
      21           0 : static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
      22           0 :     auto src = (const uint32_t*)vsrc;
      23           0 :     for (int i = 0; i < count; i++) {
      24           0 :         uint8_t a = src[i] >> 24,
      25           0 :                 b = src[i] >> 16,
      26           0 :                 g = src[i] >>  8,
      27           0 :                 r = src[i] >>  0;
      28           0 :         b = (b*a+127)/255;
      29           0 :         g = (g*a+127)/255;
      30           0 :         r = (r*a+127)/255;
      31           0 :         dst[i] = (uint32_t)a << 24
      32           0 :                | (uint32_t)b << 16
      33           0 :                | (uint32_t)g <<  8
      34           0 :                | (uint32_t)r <<  0;
      35             :     }
      36           0 : }
      37             : 
      38           0 : static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
      39           0 :     auto src = (const uint32_t*)vsrc;
      40           0 :     for (int i = 0; i < count; i++) {
      41           0 :         uint8_t a = src[i] >> 24,
      42           0 :                 b = src[i] >> 16,
      43           0 :                 g = src[i] >>  8,
      44           0 :                 r = src[i] >>  0;
      45           0 :         b = (b*a+127)/255;
      46           0 :         g = (g*a+127)/255;
      47           0 :         r = (r*a+127)/255;
      48           0 :         dst[i] = (uint32_t)a << 24
      49           0 :                | (uint32_t)r << 16
      50           0 :                | (uint32_t)g <<  8
      51           0 :                | (uint32_t)b <<  0;
      52             :     }
      53           0 : }
      54             : 
      55           0 : static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
      56           0 :     auto src = (const uint32_t*)vsrc;
      57           0 :     for (int i = 0; i < count; i++) {
      58           0 :         uint8_t a = src[i] >> 24,
      59           0 :                 b = src[i] >> 16,
      60           0 :                 g = src[i] >>  8,
      61           0 :                 r = src[i] >>  0;
      62           0 :         dst[i] = (uint32_t)a << 24
      63           0 :                | (uint32_t)r << 16
      64           0 :                | (uint32_t)g <<  8
      65           0 :                | (uint32_t)b <<  0;
      66             :     }
      67           0 : }
      68             : 
      69           0 : static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
      70           0 :     const uint8_t* src = (const uint8_t*)vsrc;
      71           0 :     for (int i = 0; i < count; i++) {
      72           0 :         uint8_t r = src[0],
      73           0 :                 g = src[1],
      74           0 :                 b = src[2];
      75           0 :         src += 3;
      76           0 :         dst[i] = (uint32_t)0xFF << 24
      77           0 :                | (uint32_t)b    << 16
      78           0 :                | (uint32_t)g    <<  8
      79           0 :                | (uint32_t)r    <<  0;
      80             :     }
      81           0 : }
      82             : 
      83           0 : static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
      84           0 :     const uint8_t* src = (const uint8_t*)vsrc;
      85           0 :     for (int i = 0; i < count; i++) {
      86           0 :         uint8_t r = src[0],
      87           0 :                 g = src[1],
      88           0 :                 b = src[2];
      89           0 :         src += 3;
      90           0 :         dst[i] = (uint32_t)0xFF << 24
      91           0 :                | (uint32_t)r    << 16
      92           0 :                | (uint32_t)g    <<  8
      93           0 :                | (uint32_t)b    <<  0;
      94             :     }
      95           0 : }
      96             : 
      97           0 : static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
      98           0 :     const uint8_t* src = (const uint8_t*)vsrc;
      99           0 :     for (int i = 0; i < count; i++) {
     100           0 :         dst[i] = (uint32_t)0xFF   << 24
     101           0 :                | (uint32_t)src[i] << 16
     102           0 :                | (uint32_t)src[i] <<  8
     103           0 :                | (uint32_t)src[i] <<  0;
     104             :     }
     105           0 : }
     106             : 
     107           0 : static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
     108           0 :     const uint8_t* src = (const uint8_t*)vsrc;
     109           0 :     for (int i = 0; i < count; i++) {
     110           0 :         uint8_t g = src[0],
     111           0 :                 a = src[1];
     112           0 :         src += 2;
     113           0 :         dst[i] = (uint32_t)a << 24
     114           0 :                | (uint32_t)g << 16
     115           0 :                | (uint32_t)g <<  8
     116           0 :                | (uint32_t)g <<  0;
     117             :     }
     118           0 : }
     119             : 
     120           0 : static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
     121           0 :     const uint8_t* src = (const uint8_t*)vsrc;
     122           0 :     for (int i = 0; i < count; i++) {
     123           0 :         uint8_t g = src[0],
     124           0 :                 a = src[1];
     125           0 :         src += 2;
     126           0 :         g = (g*a+127)/255;
     127           0 :         dst[i] = (uint32_t)a << 24
     128           0 :                | (uint32_t)g << 16
     129           0 :                | (uint32_t)g <<  8
     130           0 :                | (uint32_t)g <<  0;
     131             :     }
     132           0 : }
     133             : 
     134           0 : static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
     135           0 :     const uint32_t* src = (const uint32_t*)vsrc;
     136           0 :     for (int i = 0; i < count; i++) {
     137           0 :         uint8_t k = src[i] >> 24,
     138           0 :                 y = src[i] >> 16,
     139           0 :                 m = src[i] >>  8,
     140           0 :                 c = src[i] >>  0;
     141             :         // See comments in SkSwizzler.cpp for details on the conversion formula.
     142           0 :         uint8_t b = (y*k+127)/255,
     143           0 :                 g = (m*k+127)/255,
     144           0 :                 r = (c*k+127)/255;
     145           0 :         dst[i] = (uint32_t)0xFF << 24
     146           0 :                | (uint32_t)   b << 16
     147           0 :                | (uint32_t)   g <<  8
     148           0 :                | (uint32_t)   r <<  0;
     149             :     }
     150           0 : }
     151             : 
     152           0 : static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
     153           0 :     const uint32_t* src = (const uint32_t*)vsrc;
     154           0 :     for (int i = 0; i < count; i++) {
     155           0 :         uint8_t k = src[i] >> 24,
     156           0 :                 y = src[i] >> 16,
     157           0 :                 m = src[i] >>  8,
     158           0 :                 c = src[i] >>  0;
     159           0 :         uint8_t b = (y*k+127)/255,
     160           0 :                 g = (m*k+127)/255,
     161           0 :                 r = (c*k+127)/255;
     162           0 :         dst[i] = (uint32_t)0xFF << 24
     163           0 :                | (uint32_t)   r << 16
     164           0 :                | (uint32_t)   g <<  8
     165           0 :                | (uint32_t)   b <<  0;
     166             :     }
     167           0 : }
     168             : 
     169             : #if defined(SK_ARM_HAS_NEON)
     170             : 
     171             : // Rounded divide by 255, (x + 127) / 255
     172             : static uint8x8_t div255_round(uint16x8_t x) {
     173             :     // result = (x + 127) / 255
     174             :     // result = (x + 127) / 256 + error1
     175             :     //
     176             :     // error1 = (x + 127) / (255 * 256)
     177             :     // error1 = (x + 127) / (256 * 256) + error2
     178             :     //
     179             :     // error2 = (x + 127) / (255 * 256 * 256)
     180             :     //
     181             :     // The maximum value of error2 is too small to matter.  Thus:
     182             :     // result = (x + 127) / 256 + (x + 127) / (256 * 256)
     183             :     // result = ((x + 127) / 256 + x + 127) / 256
     184             :     // result = ((x + 127) >> 8 + x + 127) >> 8
     185             :     //
     186             :     // Use >>> to represent "rounded right shift" which, conveniently,
     187             :     // NEON supports in one instruction.
     188             :     // result = ((x >>> 8) + x) >>> 8
     189             :     //
     190             :     // Note that the second right shift is actually performed as an
     191             :     // "add, round, and narrow back to 8-bits" instruction.
     192             :     return vraddhn_u16(x, vrshrq_n_u16(x, 8));
     193             : }
     194             : 
     195             : // Scale a byte by another, (x * y + 127) / 255
     196             : static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
     197             :     return div255_round(vmull_u8(x, y));
     198             : }
     199             : 
     200             : template <bool kSwapRB>
     201             : static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
     202             :     auto src = (const uint32_t*)vsrc;
     203             :     while (count >= 8) {
     204             :         // Load 8 pixels.
     205             :         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
     206             : 
     207             :         uint8x8_t a = rgba.val[3],
     208             :                   b = rgba.val[2],
     209             :                   g = rgba.val[1],
     210             :                   r = rgba.val[0];
     211             : 
     212             :         // Premultiply.
     213             :         b = scale(b, a);
     214             :         g = scale(g, a);
     215             :         r = scale(r, a);
     216             : 
     217             :         // Store 8 premultiplied pixels.
     218             :         if (kSwapRB) {
     219             :             rgba.val[2] = r;
     220             :             rgba.val[1] = g;
     221             :             rgba.val[0] = b;
     222             :         } else {
     223             :             rgba.val[2] = b;
     224             :             rgba.val[1] = g;
     225             :             rgba.val[0] = r;
     226             :         }
     227             :         vst4_u8((uint8_t*) dst, rgba);
     228             :         src += 8;
     229             :         dst += 8;
     230             :         count -= 8;
     231             :     }
     232             : 
     233             :     // Call portable code to finish up the tail of [0,8) pixels.
     234             :     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
     235             :     proc(dst, src, count);
     236             : }
     237             : 
     238             : static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
     239             :     premul_should_swapRB<false>(dst, src, count);
     240             : }
     241             : 
     242             : static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
     243             :     premul_should_swapRB<true>(dst, src, count);
     244             : }
     245             : 
     246             : static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     247             :     auto src = (const uint32_t*)vsrc;
     248             :     while (count >= 16) {
     249             :         // Load 16 pixels.
     250             :         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
     251             : 
     252             :         // Swap r and b.
     253             :         SkTSwap(rgba.val[0], rgba.val[2]);
     254             : 
     255             :         // Store 16 pixels.
     256             :         vst4q_u8((uint8_t*) dst, rgba);
     257             :         src += 16;
     258             :         dst += 16;
     259             :         count -= 16;
     260             :     }
     261             : 
     262             :     if (count >= 8) {
     263             :         // Load 8 pixels.
     264             :         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
     265             : 
     266             :         // Swap r and b.
     267             :         SkTSwap(rgba.val[0], rgba.val[2]);
     268             : 
     269             :         // Store 8 pixels.
     270             :         vst4_u8((uint8_t*) dst, rgba);
     271             :         src += 8;
     272             :         dst += 8;
     273             :         count -= 8;
     274             :     }
     275             : 
     276             :     RGBA_to_BGRA_portable(dst, src, count);
     277             : }
     278             : 
     279             : template <bool kSwapRB>
     280             : static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
     281             :     const uint8_t* src = (const uint8_t*) vsrc;
     282             :     while (count >= 16) {
     283             :         // Load 16 pixels.
     284             :         uint8x16x3_t rgb = vld3q_u8(src);
     285             : 
     286             :         // Insert an opaque alpha channel and swap if needed.
     287             :         uint8x16x4_t rgba;
     288             :         if (kSwapRB) {
     289             :             rgba.val[0] = rgb.val[2];
     290             :             rgba.val[2] = rgb.val[0];
     291             :         } else {
     292             :             rgba.val[0] = rgb.val[0];
     293             :             rgba.val[2] = rgb.val[2];
     294             :         }
     295             :         rgba.val[1] = rgb.val[1];
     296             :         rgba.val[3] = vdupq_n_u8(0xFF);
     297             : 
     298             :         // Store 16 pixels.
     299             :         vst4q_u8((uint8_t*) dst, rgba);
     300             :         src += 16*3;
     301             :         dst += 16;
     302             :         count -= 16;
     303             :     }
     304             : 
     305             :     if (count >= 8) {
     306             :         // Load 8 pixels.
     307             :         uint8x8x3_t rgb = vld3_u8(src);
     308             : 
     309             :         // Insert an opaque alpha channel and swap if needed.
     310             :         uint8x8x4_t rgba;
     311             :         if (kSwapRB) {
     312             :             rgba.val[0] = rgb.val[2];
     313             :             rgba.val[2] = rgb.val[0];
     314             :         } else {
     315             :             rgba.val[0] = rgb.val[0];
     316             :             rgba.val[2] = rgb.val[2];
     317             :         }
     318             :         rgba.val[1] = rgb.val[1];
     319             :         rgba.val[3] = vdup_n_u8(0xFF);
     320             : 
     321             :         // Store 8 pixels.
     322             :         vst4_u8((uint8_t*) dst, rgba);
     323             :         src += 8*3;
     324             :         dst += 8;
     325             :         count -= 8;
     326             :     }
     327             : 
     328             :     // Call portable code to finish up the tail of [0,8) pixels.
     329             :     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
     330             :     proc(dst, src, count);
     331             : }
     332             : 
     333             : static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
     334             :     insert_alpha_should_swaprb<false>(dst, src, count);
     335             : }
     336             : 
     337             : static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
     338             :     insert_alpha_should_swaprb<true>(dst, src, count);
     339             : }
     340             : 
     341             : static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
     342             :     const uint8_t* src = (const uint8_t*) vsrc;
     343             :     while (count >= 16) {
     344             :         // Load 16 pixels.
     345             :         uint8x16_t gray = vld1q_u8(src);
     346             : 
     347             :         // Set each of the color channels.
     348             :         uint8x16x4_t rgba;
     349             :         rgba.val[0] = gray;
     350             :         rgba.val[1] = gray;
     351             :         rgba.val[2] = gray;
     352             :         rgba.val[3] = vdupq_n_u8(0xFF);
     353             : 
     354             :         // Store 16 pixels.
     355             :         vst4q_u8((uint8_t*) dst, rgba);
     356             :         src += 16;
     357             :         dst += 16;
     358             :         count -= 16;
     359             :     }
     360             : 
     361             :     if (count >= 8) {
     362             :         // Load 8 pixels.
     363             :         uint8x8_t gray = vld1_u8(src);
     364             : 
     365             :         // Set each of the color channels.
     366             :         uint8x8x4_t rgba;
     367             :         rgba.val[0] = gray;
     368             :         rgba.val[1] = gray;
     369             :         rgba.val[2] = gray;
     370             :         rgba.val[3] = vdup_n_u8(0xFF);
     371             : 
     372             :         // Store 8 pixels.
     373             :         vst4_u8((uint8_t*) dst, rgba);
     374             :         src += 8;
     375             :         dst += 8;
     376             :         count -= 8;
     377             :     }
     378             : 
     379             :     gray_to_RGB1_portable(dst, src, count);
     380             : }
     381             : 
     382             : template <bool kPremul>
     383             : static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
     384             :     const uint8_t* src = (const uint8_t*) vsrc;
     385             :     while (count >= 16) {
     386             :         // Load 16 pixels.
     387             :         uint8x16x2_t ga = vld2q_u8(src);
     388             : 
     389             :         // Premultiply if requested.
     390             :         if (kPremul) {
     391             :             ga.val[0] = vcombine_u8(
     392             :                     scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
     393             :                     scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
     394             :         }
     395             : 
     396             :         // Set each of the color channels.
     397             :         uint8x16x4_t rgba;
     398             :         rgba.val[0] = ga.val[0];
     399             :         rgba.val[1] = ga.val[0];
     400             :         rgba.val[2] = ga.val[0];
     401             :         rgba.val[3] = ga.val[1];
     402             : 
     403             :         // Store 16 pixels.
     404             :         vst4q_u8((uint8_t*) dst, rgba);
     405             :         src += 16*2;
     406             :         dst += 16;
     407             :         count -= 16;
     408             :     }
     409             : 
     410             :     if (count >= 8) {
     411             :         // Load 8 pixels.
     412             :         uint8x8x2_t ga = vld2_u8(src);
     413             : 
     414             :         // Premultiply if requested.
     415             :         if (kPremul) {
     416             :             ga.val[0] = scale(ga.val[0], ga.val[1]);
     417             :         }
     418             : 
     419             :         // Set each of the color channels.
     420             :         uint8x8x4_t rgba;
     421             :         rgba.val[0] = ga.val[0];
     422             :         rgba.val[1] = ga.val[0];
     423             :         rgba.val[2] = ga.val[0];
     424             :         rgba.val[3] = ga.val[1];
     425             : 
     426             :         // Store 8 pixels.
     427             :         vst4_u8((uint8_t*) dst, rgba);
     428             :         src += 8*2;
     429             :         dst += 8;
     430             :         count -= 8;
     431             :     }
     432             : 
     433             :     auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
     434             :     proc(dst, src, count);
     435             : }
     436             : 
     437             : static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
     438             :     expand_grayA<false>(dst, src, count);
     439             : }
     440             : 
     441             : static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
     442             :     expand_grayA<true>(dst, src, count);
     443             : }
     444             : 
     445             : enum Format { kRGB1, kBGR1 };
     446             : template <Format format>
     447             : static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
     448             :     auto src = (const uint32_t*)vsrc;
     449             :     while (count >= 8) {
     450             :         // Load 8 cmyk pixels.
     451             :         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
     452             : 
     453             :         uint8x8_t k = pixels.val[3],
     454             :                   y = pixels.val[2],
     455             :                   m = pixels.val[1],
     456             :                   c = pixels.val[0];
     457             : 
     458             :         // Scale to r, g, b.
     459             :         uint8x8_t b = scale(y, k);
     460             :         uint8x8_t g = scale(m, k);
     461             :         uint8x8_t r = scale(c, k);
     462             : 
     463             :         // Store 8 rgba pixels.
     464             :         if (kBGR1 == format) {
     465             :             pixels.val[3] = vdup_n_u8(0xFF);
     466             :             pixels.val[2] = r;
     467             :             pixels.val[1] = g;
     468             :             pixels.val[0] = b;
     469             :         } else {
     470             :             pixels.val[3] = vdup_n_u8(0xFF);
     471             :             pixels.val[2] = b;
     472             :             pixels.val[1] = g;
     473             :             pixels.val[0] = r;
     474             :         }
     475             :         vst4_u8((uint8_t*) dst, pixels);
     476             :         src += 8;
     477             :         dst += 8;
     478             :         count -= 8;
     479             :     }
     480             : 
     481             :     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
     482             :     proc(dst, src, count);
     483             : }
     484             : 
     485             : static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
     486             :     inverted_cmyk_to<kRGB1>(dst, src, count);
     487             : }
     488             : 
     489             : static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
     490             :     inverted_cmyk_to<kBGR1>(dst, src, count);
     491             : }
     492             : 
     493             : #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     494             : 
     495             : // Scale a byte by another.
     496             : // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
     497           0 : static __m128i scale(__m128i x, __m128i y) {
     498           0 :     const __m128i _128 = _mm_set1_epi16(128);
     499           0 :     const __m128i _257 = _mm_set1_epi16(257);
     500             : 
     501             :     // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
     502           0 :     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
     503             : }
     504             : 
     505             : template <bool kSwapRB>
     506           0 : static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
     507           0 :     auto src = (const uint32_t*)vsrc;
     508             : 
     509           0 :     auto premul8 = [](__m128i* lo, __m128i* hi) {
     510           0 :         const __m128i zeros = _mm_setzero_si128();
     511             :         __m128i planar;
     512             :         if (kSwapRB) {
     513           0 :             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
     514             :         } else {
     515           0 :             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
     516             :         }
     517             : 
     518             :         // Swizzle the pixels to 8-bit planar.
     519           0 :         *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
     520           0 :         *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
     521           0 :         __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
     522           0 :                 ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
     523             : 
     524             :         // Unpack to 16-bit planar.
     525           0 :         __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
     526           0 :                 g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
     527           0 :                 b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
     528           0 :                 a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
     529             : 
     530             :         // Premultiply!
     531           0 :         r = scale(r, a);
     532           0 :         g = scale(g, a);
     533           0 :         b = scale(b, a);
     534             : 
     535             :         // Repack into interlaced pixels.
     536           0 :         rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
     537           0 :         ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
     538           0 :         *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
     539           0 :         *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
     540           0 :     };
     541             : 
     542           0 :     while (count >= 8) {
     543           0 :         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
     544           0 :                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
     545             : 
     546           0 :         premul8(&lo, &hi);
     547             : 
     548           0 :         _mm_storeu_si128((__m128i*) (dst + 0), lo);
     549           0 :         _mm_storeu_si128((__m128i*) (dst + 4), hi);
     550             : 
     551           0 :         src += 8;
     552           0 :         dst += 8;
     553           0 :         count -= 8;
     554             :     }
     555             : 
     556           0 :     if (count >= 4) {
     557           0 :         __m128i lo = _mm_loadu_si128((const __m128i*) src),
     558           0 :                 hi = _mm_setzero_si128();
     559             : 
     560           0 :         premul8(&lo, &hi);
     561             : 
     562           0 :         _mm_storeu_si128((__m128i*) dst, lo);
     563             : 
     564           0 :         src += 4;
     565           0 :         dst += 4;
     566           0 :         count -= 4;
     567             :     }
     568             : 
     569             :     // Call portable code to finish up the tail of [0,4) pixels.
     570           0 :     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
     571           0 :     proc(dst, src, count);
     572           0 : }
     573             : 
     574           0 : static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
     575           0 :     premul_should_swapRB<false>(dst, src, count);
     576           0 : }
     577             : 
     578           0 : static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
     579           0 :     premul_should_swapRB<true>(dst, src, count);
     580           0 : }
     581             : 
     582           0 : static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     583           0 :     auto src = (const uint32_t*)vsrc;
     584           0 :     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
     585             : 
     586           0 :     while (count >= 4) {
     587           0 :         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
     588           0 :         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
     589             :         _mm_storeu_si128((__m128i*) dst, bgra);
     590             : 
     591           0 :         src += 4;
     592           0 :         dst += 4;
     593           0 :         count -= 4;
     594             :     }
     595             : 
     596           0 :     RGBA_to_BGRA_portable(dst, src, count);
     597           0 : }
     598             : 
     599             : template <bool kSwapRB>  // kSwapRB == true reverses bytes 0..2 of every 3-byte source pixel.
     600           0 : static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
     601           0 :     const uint8_t* src = (const uint8_t*) vsrc;
     602             :     // Widens each 3-byte source pixel to a 32-bit pixel whose top byte is 0xFF.
     603           0 :     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
     604             :     __m128i expand;
     605           0 :     const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant: alphaMask overwrites that byte.
     606             :     if (kSwapRB) {
     607           0 :         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
     608             :     } else {
     609           0 :         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
     610             :     }
     611             :     // The 16-byte load spans more than 5 pixels; requiring 6 (18 bytes) presumably keeps it inside the source.
     612           0 :     while (count >= 6) {
     613             :         // Load a vector.  While this actually contains 5 pixels plus an
     614             :         // extra component, we will discard all but the first four pixels on
     615             :         // this iteration.
     616           0 :         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
     617             : 
     618             :         // Expand the first four pixels to RGBX and then mask to RGB(FF).
     619           0 :         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
     620             : 
     621             :         // Store 4 pixels.
     622             :         _mm_storeu_si128((__m128i*) dst, rgba);
     623             : 
     624           0 :         src += 4*3;
     625           0 :         dst += 4;
     626           0 :         count -= 4;
     627             :     }
     628             : 
     629             :     // Call portable code to finish up the tail of [0,6) pixels.
     630           0 :     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
     631           0 :     proc(dst, src, count);
     632           0 : }
     633             : 
     634           0 : static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {  // <false> instantiation: the three source bytes keep their order.
     635           0 :     insert_alpha_should_swaprb<false>(dst, src, count);
     636           0 : }
     637             : 
     638           0 : static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {  // <true> instantiation: the three source bytes are reversed.
     639           0 :     insert_alpha_should_swaprb<true>(dst, src, count);
     640           0 : }
     641             : 
     642           0 : static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
     643           0 :     const uint8_t* src = (const uint8_t*) vsrc;
     644             :     // Each source byte g becomes the four output bytes g, g, g, 0xFF.
     645           0 :     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
     646           0 :     while (count >= 16) {
     647           0 :         __m128i grays = _mm_loadu_si128((const __m128i*) src);
     648             :         // Byte interleave: (g,g) and (g,0xFF) pairs for the low and the high eight inputs.
     649           0 :         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
     650           0 :         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
     651           0 :         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
     652           0 :         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
     653             :         // 16-bit interleave of those pairs yields g,g,g,0xFF pixels, four per vector.
     654           0 :         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
     655           0 :         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
     656           0 :         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
     657           0 :         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
     658             :         // Write all 16 pixels.
     659             :         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
     660           0 :         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
     661           0 :         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
     662           0 :         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
     663             : 
     664           0 :         src += 16;
     665           0 :         dst += 16;
     666           0 :         count -= 16;
     667             :     }
     668             :     // Fewer than 16 pixels remain here; the scalar routine converts them.
     669           0 :     gray_to_RGB1_portable(dst, src, count);
     670           0 : }
     671             : 
     672           0 : static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {  // Each 2-byte (g, a) source pixel becomes the four bytes g, g, g, a.
     673           0 :     const uint8_t* src = (const uint8_t*) vsrc;
     674           0 :     while (count >= 8) {
     675           0 :         __m128i ga = _mm_loadu_si128((const __m128i*) src);
     676             :         // Copy the low byte of every 16-bit lane into its high byte, giving (g, g).
     677           0 :         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
     678           0 :                                   _mm_slli_epi16(ga, 8));
     679             :         // 16-bit interleave of (g, g) with (g, a) forms four pixels per vector.
     680           0 :         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
     681           0 :         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
     682             :         // Write all 8 pixels.
     683             :         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
     684           0 :         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
     685             : 
     686           0 :         src += 8*2;
     687           0 :         dst += 8;
     688           0 :         count -= 8;
     689             :     }
     690             :     // Fewer than 8 pixels remain here; the scalar routine converts them.
     691           0 :     grayA_to_RGBA_portable(dst, src, count);
     692           0 : }
     693             : 
     694           0 : static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {  // Like grayA_to_RGBA, but the gray value is passed through scale() with the alpha first.
     695           0 :     const uint8_t* src = (const uint8_t*) vsrc;
     696           0 :     while (count >= 8) {
     697           0 :         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
     698             :         // Split every 16-bit lane: low byte into g0, high byte into a0 (each zero-extended).
     699           0 :         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
     700           0 :         __m128i a0 = _mm_srli_epi16(grayA, 8);
     701             : 
     702             :         // Premultiply.  NOTE(review): scale() is defined outside this view; presumably g*a/255 rounded - confirm.
     703           0 :         g0 = scale(g0, a0);
     704             : 
     705           0 :         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
     706           0 :         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
     707             :         // gg holds (g, g) and ga holds (g, a) byte pairs, with g now the scaled value.
     708             : 
     709           0 :         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
     710           0 :         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
     711             :         // Write all 8 pixels.
     712             :         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
     713           0 :         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
     714             : 
     715           0 :         src += 8*2;
     716           0 :         dst += 8;
     717           0 :         count -= 8;
     718             :     }
     719             :     // Fewer than 8 pixels remain here; the scalar routine converts them.
     720           0 :     grayA_to_rgbA_portable(dst, src, count);
     721           0 : }
     722             : 
     723             : enum Format { kRGB1, kBGR1 };  // Output byte orders selectable for inverted_cmyk_to.
     724             : template <Format format>
     725           0 : static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
     726           0 :     auto src = (const uint32_t*)vsrc;
     727             :     // convert8 rewrites two vectors (eight 4-byte pixels) in place: bytes 0..2 scaled by byte 3, byte 3 set to 0xFF.
     728           0 :     auto convert8 = [](__m128i* lo, __m128i* hi) {
     729           0 :         const __m128i zeros = _mm_setzero_si128();
     730             :         __m128i planar;
     731             :         if (kBGR1 == format) {
     732           0 :             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
     733             :         } else {
     734           0 :             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
     735             :         }
     736             :         // (The kBGR1 shuffle exchanges the byte-0 and byte-2 planes relative to the kRGB1 one.)
     737             :         // Swizzle the pixels to 8-bit planar.
     738           0 :         *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
     739           0 :         *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
     740           0 :         __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
     741           0 :                 yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
     742             : 
     743             :         // Unpack to 16-bit planar.
     744           0 :         __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
     745           0 :                 m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
     746           0 :                 y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
     747           0 :                 k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
     748             : 
     749             :         // Scale to r, g, b.
     750           0 :         __m128i r = scale(c, k),
     751           0 :                 g = scale(m, k),
     752           0 :                 b = scale(y, k);
     753             : 
     754             :         // Repack into interlaced pixels.
     755           0 :         __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
     756           0 :                 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
     757           0 :         *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgb1rgb1 rgb1rgb1
     758           0 :         *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
     759           0 :     };
     760             :     // Main loop: eight pixels (two vectors) per iteration.
     761           0 :     while (count >= 8) {
     762           0 :         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
     763           0 :                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
     764             : 
     765           0 :         convert8(&lo, &hi);
     766             : 
     767           0 :         _mm_storeu_si128((__m128i*) (dst + 0), lo);
     768           0 :         _mm_storeu_si128((__m128i*) (dst + 4), hi);
     769             : 
     770           0 :         src += 8;
     771           0 :         dst += 8;
     772           0 :         count -= 8;
     773             :     }
     774             :     // At most one further group of four pixels: convert it alongside an all-zero hi vector.
     775           0 :     if (count >= 4) {
     776           0 :         __m128i lo = _mm_loadu_si128((const __m128i*) src),
     777           0 :                 hi = _mm_setzero_si128();
     778             :         // hi is padding only; its converted value is never stored.
     779           0 :         convert8(&lo, &hi);
     780             : 
     781           0 :         _mm_storeu_si128((__m128i*) dst, lo);
     782             : 
     783           0 :         src += 4;
     784           0 :         dst += 4;
     785           0 :         count -= 4;
     786             :     }
     787             :     // Fewer than four pixels remain here; the matching scalar routine converts them.
     788           0 :     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
     789           0 :     proc(dst, src, count);
     790           0 : }
     791             : 
     792           0 : static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {  // Forwards to inverted_cmyk_to with format = kRGB1.
     793           0 :     inverted_cmyk_to<kRGB1>(dst, src, count);
     794           0 : }
     795             : 
     796           0 : static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {  // Forwards to inverted_cmyk_to with format = kBGR1.
     797           0 :     inverted_cmyk_to<kBGR1>(dst, src, count);
     798           0 : }
     799             : 
     800             : #else
     801             : 
     802           0 : static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {  // #else branch: plain forward to the scalar RGBA_to_rgbA_portable.
     803           0 :     RGBA_to_rgbA_portable(dst, src, count);
     804           0 : }
     805             : 
     806           0 : static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {  // #else branch: plain forward to the scalar RGBA_to_bgrA_portable.
     807           0 :     RGBA_to_bgrA_portable(dst, src, count);
     808           0 : }
     809             : 
     810           0 : static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {  // #else branch: plain forward to the scalar RGBA_to_BGRA_portable.
     811           0 :     RGBA_to_BGRA_portable(dst, src, count);
     812           0 : }
     813             : 
     814           0 : static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar RGB_to_RGB1_portable.
     815           0 :     RGB_to_RGB1_portable(dst, src, count);
     816           0 : }
     817             : 
     818           0 : static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar RGB_to_BGR1_portable.
     819           0 :     RGB_to_BGR1_portable(dst, src, count);
     820           0 : }
     821             : 
     822           0 : static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar gray_to_RGB1_portable.
     823           0 :     gray_to_RGB1_portable(dst, src, count);
     824           0 : }
     825             : 
     826           0 : static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar grayA_to_RGBA_portable.
     827           0 :     grayA_to_RGBA_portable(dst, src, count);
     828           0 : }
     829             : 
     830           0 : static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar grayA_to_rgbA_portable.
     831           0 :     grayA_to_rgbA_portable(dst, src, count);
     832           0 : }
     833             : 
     834           0 : static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar inverted_CMYK_to_RGB1_portable.
     835           0 :     inverted_CMYK_to_RGB1_portable(dst, src, count);
     836           0 : }
     837             : 
     838           0 : static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {  // #else branch: plain forward to the scalar inverted_CMYK_to_BGR1_portable.
     839           0 :     inverted_CMYK_to_BGR1_portable(dst, src, count);
     840           0 : }
     841             : 
     842             : #endif
     843             : 
     844             : }
     845             : 
     846             : #endif // SkSwizzler_opts_DEFINED

Generated by: LCOV version 1.13