LCOV - code coverage report
Current view: top level - gfx/skia/skia/src/opts - SkColor_opts_SSE2.h (source / functions) Hit Total Coverage
Test: output.info Lines: 42 95 44.2 %
Date: 2017-07-14 16:53:18 Functions: 4 9 44.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright 2014 The Android Open Source Project
       3             :  *
       4             :  * Use of this source code is governed by a BSD-style license that can be
       5             :  * found in the LICENSE file.
       6             :  */
       7             : 
       8             : #ifndef SkColor_opts_SSE2_DEFINED
       9             : #define SkColor_opts_SSE2_DEFINED
      10             : 
      11             : #include <emmintrin.h>
      12             : 
      13             : #define ASSERT_EQ(a,b) SkASSERT(0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8((a), (b))))
      14             : 
      15             : // Because there is no _mm_mul_epi32() in SSE2, we emulate it here.
      16             : // Multiplies 4 32-bit integers from a by 4 32-bit integers from b.
      17             : // The 4 multiplication results should be representable within 32-bit
      18             : // integers, otherwise they would overflow.
      19             : static inline  __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {
      20             :     // Calculate results of a0 * b0 and a2 * b2.
      21             :     __m128i r1 = _mm_mul_epu32(a, b);
      22             :     // Calculate results of a1 * b1 and a3 * b3.
      23             :     __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
      24             :     // Shuffle results to [63..0] and interleave the results.
      25             :     __m128i r = _mm_unpacklo_epi32(_mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,0)),
      26             :                                    _mm_shuffle_epi32(r2, _MM_SHUFFLE(0,0,2,0)));
      27             :     return r;
      28             : }
      29             : 
      30             : static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) {
      31             :     return _mm_add_epi32(alpha, _mm_set1_epi32(1));
      32             : }
      33             : 
      34             : // See #define SkAlphaMulAlpha(a, b)  SkMulDiv255Round(a, b) in SkXfermode.cpp.
      35             : static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a,
      36             :                                            const __m128i& b) {
      37             :     __m128i prod = _mm_mullo_epi16(a, b);
      38             :     prod = _mm_add_epi32(prod, _mm_set1_epi32(128));
      39             :     prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8));
      40             :     prod = _mm_srli_epi32(prod, 8);
      41             : 
      42             :     return prod;
      43             : }
      44             : 
      45             : // Portable version SkAlphaMulQ is in SkColorPriv.h.
      46      285424 : static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) {
      47      285424 :     const __m128i mask = _mm_set1_epi32(0xFF00FF);
      48      856272 :     __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
      49             : 
      50             :     // uint32_t rb = ((c & mask) * scale) >> 8
      51      570848 :     __m128i rb = _mm_and_si128(mask, c);
      52      285424 :     rb = _mm_mullo_epi16(rb, s);
      53      285424 :     rb = _mm_srli_epi16(rb, 8);
      54             : 
      55             :     // uint32_t ag = ((c >> 8) & mask) * scale
      56      570848 :     __m128i ag = _mm_srli_epi16(c, 8);
      57      856272 :     ASSERT_EQ(ag, _mm_and_si128(mask, ag));  // ag = _mm_srli_epi16(c, 8) did this for us.
      58      285424 :     ag = _mm_mullo_epi16(ag, s);
      59             : 
      60             :     // (rb & mask) | (ag & ~mask)
      61      856272 :     ASSERT_EQ(rb, _mm_and_si128(mask, rb));  // rb = _mm_srli_epi16(rb, 8) did this for us.
      62      285424 :     ag = _mm_andnot_si128(mask, ag);
      63      285424 :     return _mm_or_si128(rb, ag);
      64             : }
      65             : 
      66             : // Fast path for SkAlphaMulQ_SSE2 with a constant scale factor.
      67             : static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const unsigned scale) {
      68             :     const __m128i mask = _mm_set1_epi32(0xFF00FF);
      69             :     __m128i s = _mm_set1_epi16(scale << 8); // Move scale factor to upper byte of word.
      70             : 
      71             :     // With mulhi, red and blue values are already in the right place and
      72             :     // don't need to be divided by 256.
      73             :     __m128i rb = _mm_and_si128(mask, c);
      74             :     rb = _mm_mulhi_epu16(rb, s);
      75             : 
      76             :     __m128i ag = _mm_andnot_si128(mask, c);
      77             :     ag = _mm_mulhi_epu16(ag, s);     // Alpha and green values are in the higher byte of each word.
      78             :     ag = _mm_andnot_si128(mask, ag);
      79             : 
      80             :     return _mm_or_si128(rb, ag);
      81             : }
      82             : 
      83             : // Portable version SkFastFourByteInterp256 is in SkColorPriv.h.
      84           0 : static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) {
      85             :     // Computes dst + (((src - dst)*src_scale)>>8)
      86           0 :     const __m128i mask = _mm_set1_epi32(0x00FF00FF);
      87             : 
      88             :     // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
      89           0 :     __m128i src_rb = _mm_and_si128(mask, src);
      90           0 :     __m128i src_ag = _mm_srli_epi16(src, 8);
      91           0 :     __m128i dst_rb = _mm_and_si128(mask, dst);
      92           0 :     __m128i dst_ag = _mm_srli_epi16(dst, 8);
      93             : 
      94             :     // Compute scaled differences.
      95           0 :     __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
      96           0 :     __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
      97           0 :     __m128i s = _mm_set1_epi16(src_scale);
      98           0 :     diff_rb = _mm_mullo_epi16(diff_rb, s);
      99           0 :     diff_ag = _mm_mullo_epi16(diff_ag, s);
     100             : 
     101             :     // Pack the differences back together.
     102           0 :     diff_rb = _mm_srli_epi16(diff_rb, 8);
     103           0 :     diff_ag = _mm_andnot_si128(mask, diff_ag);
     104           0 :     __m128i diff = _mm_or_si128(diff_rb, diff_ag);
     105             : 
     106             :     // Add difference to destination.
     107           0 :     return _mm_add_epi8(dst, diff);
     108             : }
     109             : 
     110             : // Portable version SkPMLerp is in SkColorPriv.h
     111           0 : static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) {
     112           0 :     return SkFastFourByteInterp256_SSE2(src, dst, scale);
     113             : }
     114             : 
     115      292316 : static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
     116             : #if SK_A32_SHIFT == 24                // It's very common (universal?) that alpha is the top byte.
     117      584632 :     return _mm_srli_epi32(src, 24);   // You'd hope the compiler would remove the left shift then,
     118             : #else                                 // but I've seen Clang just do a dumb left shift of zero. :(
     119             :     __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
     120             :     return _mm_srli_epi32(a, 24);
     121             : #endif
     122             : }
     123             : 
     124             : static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {
     125             :     __m128i r = _mm_slli_epi32(src, (24 - SK_R32_SHIFT));
     126             :     return _mm_srli_epi32(r, 24);
     127             : }
     128             : 
     129             : static inline __m128i SkGetPackedG32_SSE2(const __m128i& src) {
     130             :     __m128i g = _mm_slli_epi32(src, (24 - SK_G32_SHIFT));
     131             :     return _mm_srli_epi32(g, 24);
     132             : }
     133             : 
     134             : static inline __m128i SkGetPackedB32_SSE2(const __m128i& src) {
     135             :     __m128i b = _mm_slli_epi32(src, (24 - SK_B32_SHIFT));
     136             :     return _mm_srli_epi32(b, 24);
     137             : }
     138             : 
     139           0 : static inline __m128i SkMul16ShiftRound_SSE2(const __m128i& a,
     140             :                                              const __m128i& b, int shift) {
     141           0 :     __m128i prod = _mm_mullo_epi16(a, b);
     142           0 :     prod = _mm_add_epi16(prod, _mm_set1_epi16(1 << (shift - 1)));
     143           0 :     prod = _mm_add_epi16(prod, _mm_srli_epi16(prod, shift));
     144           0 :     prod = _mm_srli_epi16(prod, shift);
     145             : 
     146           0 :     return prod;
     147             : }
     148             : 
     149           0 : static inline __m128i SkPackRGB16_SSE2(const __m128i& r,
     150             :                                        const __m128i& g, const __m128i& b) {
     151           0 :     __m128i dr = _mm_slli_epi16(r, SK_R16_SHIFT);
     152           0 :     __m128i dg = _mm_slli_epi16(g, SK_G16_SHIFT);
     153           0 :     __m128i db = _mm_slli_epi16(b, SK_B16_SHIFT);
     154             : 
     155           0 :     __m128i c = _mm_or_si128(dr, dg);
     156           0 :     return _mm_or_si128(c, db);
     157             : }
     158             : 
     159             : static inline __m128i SkPackARGB32_SSE2(const __m128i& a, const __m128i& r,
     160             :                                         const __m128i& g, const __m128i& b) {
     161             :     __m128i da = _mm_slli_epi32(a, SK_A32_SHIFT);
     162             :     __m128i dr = _mm_slli_epi32(r, SK_R32_SHIFT);
     163             :     __m128i dg = _mm_slli_epi32(g, SK_G32_SHIFT);
     164             :     __m128i db = _mm_slli_epi32(b, SK_B32_SHIFT);
     165             : 
     166             :     __m128i c = _mm_or_si128(da, dr);
     167             :     c = _mm_or_si128(c, dg);
     168             :     return _mm_or_si128(c, db);
     169             : }
     170             : 
     171             : static inline __m128i SkPacked16ToR32_SSE2(const __m128i& src) {
     172             :     __m128i r = _mm_srli_epi32(src, SK_R16_SHIFT);
     173             :     r = _mm_and_si128(r, _mm_set1_epi32(SK_R16_MASK));
     174             :     r = _mm_or_si128(_mm_slli_epi32(r, (8 - SK_R16_BITS)),
     175             :                      _mm_srli_epi32(r, (2 * SK_R16_BITS - 8)));
     176             : 
     177             :     return r;
     178             : }
     179             : 
     180             : static inline __m128i SkPacked16ToG32_SSE2(const __m128i& src) {
     181             :     __m128i g = _mm_srli_epi32(src, SK_G16_SHIFT);
     182             :     g = _mm_and_si128(g, _mm_set1_epi32(SK_G16_MASK));
     183             :     g = _mm_or_si128(_mm_slli_epi32(g, (8 - SK_G16_BITS)),
     184             :                      _mm_srli_epi32(g, (2 * SK_G16_BITS - 8)));
     185             : 
     186             :     return g;
     187             : }
     188             : 
     189             : static inline __m128i SkPacked16ToB32_SSE2(const __m128i& src) {
     190             :     __m128i b = _mm_srli_epi32(src, SK_B16_SHIFT);
     191             :     b = _mm_and_si128(b, _mm_set1_epi32(SK_B16_MASK));
     192             :     b = _mm_or_si128(_mm_slli_epi32(b, (8 - SK_B16_BITS)),
     193             :                      _mm_srli_epi32(b, (2 * SK_B16_BITS - 8)));
     194             : 
     195             :     return b;
     196             : }
     197             : 
     198             : static inline __m128i SkPixel16ToPixel32_SSE2(const __m128i& src) {
     199             :     __m128i r = SkPacked16ToR32_SSE2(src);
     200             :     __m128i g = SkPacked16ToG32_SSE2(src);
     201             :     __m128i b = SkPacked16ToB32_SSE2(src);
     202             : 
     203             :     return SkPackARGB32_SSE2(_mm_set1_epi32(0xFF), r, g, b);
     204             : }
     205             : 
     206           0 : static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
     207             :                                                     const __m128i& src_pixel2) {
     208             :     // Calculate result r.
     209           0 :     __m128i r1 = _mm_srli_epi32(src_pixel1,
     210           0 :                                 SK_R32_SHIFT + (8 - SK_R16_BITS));
     211           0 :     r1 = _mm_and_si128(r1, _mm_set1_epi32(SK_R16_MASK));
     212           0 :     __m128i r2 = _mm_srli_epi32(src_pixel2,
     213           0 :                                 SK_R32_SHIFT + (8 - SK_R16_BITS));
     214           0 :     r2 = _mm_and_si128(r2, _mm_set1_epi32(SK_R16_MASK));
     215           0 :     __m128i r = _mm_packs_epi32(r1, r2);
     216             : 
     217             :     // Calculate result g.
     218           0 :     __m128i g1 = _mm_srli_epi32(src_pixel1,
     219           0 :                                 SK_G32_SHIFT + (8 - SK_G16_BITS));
     220           0 :     g1 = _mm_and_si128(g1, _mm_set1_epi32(SK_G16_MASK));
     221           0 :     __m128i g2 = _mm_srli_epi32(src_pixel2,
     222           0 :                                 SK_G32_SHIFT + (8 - SK_G16_BITS));
     223           0 :     g2 = _mm_and_si128(g2, _mm_set1_epi32(SK_G16_MASK));
     224           0 :     __m128i g = _mm_packs_epi32(g1, g2);
     225             : 
     226             :     // Calculate result b.
     227           0 :     __m128i b1 = _mm_srli_epi32(src_pixel1,
     228           0 :                                 SK_B32_SHIFT + (8 - SK_B16_BITS));
     229           0 :     b1 = _mm_and_si128(b1, _mm_set1_epi32(SK_B16_MASK));
     230           0 :     __m128i b2 = _mm_srli_epi32(src_pixel2,
     231           0 :                                 SK_B32_SHIFT + (8 - SK_B16_BITS));
     232           0 :     b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK));
     233           0 :     __m128i b = _mm_packs_epi32(b1, b2);
     234             : 
     235             :     // Store 8 16-bit colors in dst.
     236           0 :     __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
     237             : 
     238           0 :     return d_pixel;
     239             : }
     240             : 
     241             : // Portable version is SkPMSrcOver in SkColorPriv.h.
     242      285424 : static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
     243      285424 :     return _mm_add_epi32(src,
     244     1141696 :                          SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
     245      570848 :                                                              SkGetPackedA32_SSE2(src))));
     246             : }
     247             : 
     248             : // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
     249        6892 : static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
     250             :                                          const unsigned aa) {
     251        6892 :     unsigned alpha = SkAlpha255To256(aa);
     252       13784 :     __m128i src_scale = _mm_set1_epi16(alpha);
     253             :     // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
     254        6892 :     __m128i dst_scale = SkGetPackedA32_SSE2(src);
     255             :     // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
     256        6892 :     dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
     257       13784 :     dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
     258       13784 :     dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
     259        6892 :     dst_scale = _mm_srli_epi32(dst_scale, 8);
     260             :     // Duplicate scales into 2x16-bit pattern per pixel.
     261        6892 :     dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
     262        6892 :     dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
     263             : 
     264        6892 :     const __m128i mask = _mm_set1_epi32(0x00FF00FF);
     265             : 
     266             :     // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
     267       13784 :     __m128i src_rb = _mm_and_si128(mask, src);
     268       13784 :     __m128i src_ag = _mm_srli_epi16(src, 8);
     269       13784 :     __m128i dst_rb = _mm_and_si128(mask, dst);
     270       13784 :     __m128i dst_ag = _mm_srli_epi16(dst, 8);
     271             : 
     272             :     // Scale them.
     273        6892 :     src_rb = _mm_mullo_epi16(src_rb, src_scale);
     274        6892 :     src_ag = _mm_mullo_epi16(src_ag, src_scale);
     275        6892 :     dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
     276        6892 :     dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
     277             : 
     278             :     // Add the scaled source and destination.
     279        6892 :     dst_rb = _mm_add_epi16(src_rb, dst_rb);
     280        6892 :     dst_ag = _mm_add_epi16(src_ag, dst_ag);
     281             : 
     282             :     // Unsplay the halves back together.
     283        6892 :     dst_rb = _mm_srli_epi16(dst_rb, 8);
     284        6892 :     dst_ag = _mm_andnot_si128(mask, dst_ag);
     285        6892 :     return _mm_or_si128(dst_rb, dst_ag);
     286             : }
     287             : 
     288             : #undef ASSERT_EQ
     289             : #endif // SkColor_opts_SSE2_DEFINED

Generated by: LCOV version 1.13