LCOV - output.info - gfx/skia/skia/src/opts/SkBlitRow_opts

LCOV - code coverage report

Current view:	top level - gfx/skia/skia/src/opts - SkBlitRow_opts_SSE2.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	141	508	27.8 %
Date:	2017-07-14 16:53:18	Functions:	5	11	45.5 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  * Copyright 2012 The Android Open Source Project
       3             :  *
       4             :  * Use of this source code is governed by a BSD-style license that can be
       5             :  * found in the LICENSE file.
       6             :  */
       7             : 
       8             : #include <emmintrin.h>
       9             : #include "SkBitmapProcState_opts_SSE2.h"
      10             : #include "SkBlitRow_opts_SSE2.h"
      11             : #include "SkColorPriv.h"
      12             : #include "SkColor_opts_SSE2.h"
      13             : #include "SkDither.h"
      14             : #include "SkMSAN.h"
      15             : #include "SkUtils.h"
      16             : 
      17             : /* SSE2 version of S32_Blend_BlitRow32()
      18             :  * portable version is in core/SkBlitRow_D32.cpp
      19             :  */
      20           0 : void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
      21             :                               const SkPMColor* SK_RESTRICT src,
      22             :                               int count, U8CPU alpha) {
      23           0 :     SkASSERT(alpha <= 255);
      24           0 :     if (count <= 0) {
      25           0 :         return;
      26             :     }
      27             : 
      28           0 :     uint32_t src_scale = SkAlpha255To256(alpha);
      29             : 
      30           0 :     if (count >= 4) {
      31           0 :         SkASSERT(((size_t)dst & 0x03) == 0);
      32           0 :         while (((size_t)dst & 0x0F) != 0) {
      33           0 :             *dst = SkPMLerp(*src, *dst, src_scale);
      34           0 :             src++;
      35           0 :             dst++;
      36           0 :             count--;
      37             :         }
      38             : 
      39           0 :         const __m128i *s = reinterpret_cast<const __m128i*>(src);
      40           0 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
      41             : 
      42           0 :         while (count >= 4) {
      43             :             // Load 4 pixels each of src and dest.
      44           0 :             __m128i src_pixel = _mm_loadu_si128(s);
      45           0 :             __m128i dst_pixel = _mm_load_si128(d);
      46             : 
      47           0 :             __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
      48             :             _mm_store_si128(d, result);
      49           0 :             s++;
      50           0 :             d++;
      51           0 :             count -= 4;
      52             :         }
      53           0 :         src = reinterpret_cast<const SkPMColor*>(s);
      54           0 :         dst = reinterpret_cast<SkPMColor*>(d);
      55             :     }
      56             : 
      57           0 :     while (count > 0) {
      58           0 :         *dst = SkPMLerp(*src, *dst, src_scale);
      59           0 :         src++;
      60           0 :         dst++;
      61           0 :         count--;
      62             :     }
      63             : }
      64             : 
      65        1582 : void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
      66             :                                const SkPMColor* SK_RESTRICT src,
      67             :                                int count, U8CPU alpha) {
      68        1582 :     SkASSERT(alpha <= 255);
      69        1582 :     if (count <= 0) {
      70           0 :         return;
      71             :     }
      72             : 
      73        1582 :     if (count >= 4) {
      74        2064 :         while (((size_t)dst & 0x0F) != 0) {
      75         736 :             *dst = SkBlendARGB32(*src, *dst, alpha);
      76         736 :             src++;
      77         736 :             dst++;
      78         736 :             count--;
      79             :         }
      80             : 
      81         592 :         const __m128i *s = reinterpret_cast<const __m128i*>(src);
      82         592 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
      83       14376 :         while (count >= 4) {
      84             :             // Load 4 pixels each of src and dest.
      85        6892 :             __m128i src_pixel = _mm_loadu_si128(s);
      86        6892 :             __m128i dst_pixel = _mm_load_si128(d);
      87             : 
      88        6892 :             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
      89             :             _mm_store_si128(d, result);
      90        6892 :             s++;
      91        6892 :             d++;
      92        6892 :             count -= 4;
      93             :         }
      94         592 :         src = reinterpret_cast<const SkPMColor*>(s);
      95         592 :         dst = reinterpret_cast<SkPMColor*>(d);
      96             :     }
      97             : 
      98        5040 :     while (count > 0) {
      99        1729 :         *dst = SkBlendARGB32(*src, *dst, alpha);
     100        1729 :         src++;
     101        1729 :         dst++;
     102        1729 :         count--;
     103             :     }
     104             : }
     105             : 
     106           0 : void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
     107           0 :     SkASSERT(count > 0);
     108             : 
     109           0 :     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
     110           0 :                           (SkGetPackedR32(src) << 13) |
     111           0 :                           (SkGetPackedB32(src) << 2);
     112           0 :     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
     113             : 
     114             :     // Check if we have enough pixels to run SIMD
     115           0 :     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
     116             :         __m128i* dst_wide;
     117           0 :         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
     118           0 :         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
     119           0 :         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
     120           0 :         const __m128i scale_wide = _mm_set1_epi16(scale);
     121           0 :         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
     122           0 :         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
     123             : 
     124             :         // Align dst to an even 16 byte address (0-7 pixels)
     125           0 :         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
     126           0 :             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
     127           0 :             dst += 1;
     128           0 :             count--;
     129             :         }
     130             : 
     131           0 :         dst_wide = reinterpret_cast<__m128i*>(dst);
     132           0 :         do {
     133             :             // Load eight RGB565 pixels
     134           0 :             __m128i pixels = _mm_load_si128(dst_wide);
     135             : 
     136             :             // Mask out sub-pixels
     137           0 :             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
     138           0 :             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
     139           0 :             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
     140           0 :             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
     141             : 
     142             :             // Scale with alpha
     143           0 :             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
     144           0 :             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
     145           0 :             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
     146             : 
     147             :             // Add src_X_wide and shift down again
     148           0 :             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
     149           0 :             pixel_R = _mm_srli_epi16(pixel_R, 5);
     150           0 :             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
     151           0 :             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
     152           0 :             pixel_B = _mm_srli_epi16(pixel_B, 5);
     153             : 
     154             :             // Combine into RGB565 and store
     155           0 :             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
     156           0 :             pixel_G = _mm_and_si128(pixel_G, mask_green);
     157           0 :             pixels = _mm_or_si128(pixel_R, pixel_G);
     158           0 :             pixels = _mm_or_si128(pixels, pixel_B);
     159             :             _mm_store_si128(dst_wide, pixels);
     160           0 :             count -= 8;
     161           0 :             dst_wide++;
     162           0 :         } while (count >= 8);
     163             : 
     164           0 :         dst = reinterpret_cast<uint16_t*>(dst_wide);
     165             :     }
     166             : 
     167             :     // Small loop to handle remaining pixels.
     168           0 :     while (count > 0) {
     169           0 :         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
     170           0 :         dst += 1;
     171           0 :         count--;
     172             :     }
     173           0 : }
     174             : 
     175             : // The following (left) shifts cause the top 5 bits of the mask components to
     176             : // line up with the corresponding components in an SkPMColor.
     177             : // Note that the mask's RGB16 order may differ from the SkPMColor order.
     178             : #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
     179             : #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
     180             : #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
     181             : 
     182             : #if SK_R16x5_R32x5_SHIFT == 0
     183             :     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
     184             : #elif SK_R16x5_R32x5_SHIFT > 0
     185             :     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
     186             : #else
     187             :     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
     188             : #endif
     189             : 
     190             : #if SK_G16x5_G32x5_SHIFT == 0
     191             :     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
     192             : #elif SK_G16x5_G32x5_SHIFT > 0
     193             :     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
     194             : #else
     195             :     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
     196             : #endif
     197             : 
     198             : #if SK_B16x5_B32x5_SHIFT == 0
     199             :     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
     200             : #elif SK_B16x5_B32x5_SHIFT > 0
     201             :     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
     202             : #else
     203             :     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
     204             : #endif
     205             : 
     206        3234 : static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
     207             :                                  __m128i &mask, __m128i &srcA) {
     208             :     // In the following comments, the components of src, dst and mask are
     209             :     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
     210             :     // by an R, G, B, or A suffix. Components of one of the four pixels that
     211             :     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
     212             :     // example is the blue channel of the second destination pixel. Memory
     213             :     // layout is shown for an ARGB byte order in a color value.
     214             : 
     215             :     // src and srcA store 8-bit values interleaved with zeros.
     216             :     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
     217             :     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
     218             :     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
     219             :     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
     220             :     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
     221             :     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
     222             :     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
     223             : 
     224             :     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
     225             :     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
     226        6468 :     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
     227        3234 :                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
     228             : 
     229             :     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
     230        6468 :     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
     231        3234 :                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
     232             : 
     233             :     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
     234        3234 :     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
     235        3234 :                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
     236             : 
     237             :     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
     238             :     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
     239             :     // 8-bit position
     240             :     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
     241             :     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
     242        6468 :     mask = _mm_or_si128(_mm_or_si128(r, g), b);
     243             : 
     244             :     // Interleave R,G,B into the lower byte of word.
     245             :     // i.e. split the sixteen 8-bit values from mask into two sets of eight
     246             :     // 16-bit values, padded by zero.
     247             :     __m128i maskLo, maskHi;
     248             :     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
     249        6468 :     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
     250             :     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
     251        6468 :     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
     252             : 
     253             :     // Upscale from 0..31 to 0..32
     254             :     // (allows to replace division by right-shift further down)
     255             :     // Right-shift each component by 4 and add the result back to that component,
     256             :     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
     257        6468 :     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
     258        6468 :     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
     259             : 
     260             :     // Multiply each component of maskLo and maskHi by srcA
     261        6468 :     maskLo = _mm_mullo_epi16(maskLo, srcA);
     262        6468 :     maskHi = _mm_mullo_epi16(maskHi, srcA);
     263             : 
     264             :     // Right shift mask components by 8 (divide by 256)
     265        3234 :     maskLo = _mm_srli_epi16(maskLo, 8);
     266        3234 :     maskHi = _mm_srli_epi16(maskHi, 8);
     267             : 
     268             :     // Interleave R,G,B into the lower byte of the word
     269             :     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
     270        6468 :     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
     271             :     // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
     272        6468 :     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
     273             : 
     274             :     // mask = (src - dst) * mask
     275        9702 :     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
     276        9702 :     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
     277             : 
     278             :     // mask = (src - dst) * mask >> 5
     279        3234 :     maskLo = _mm_srai_epi16(maskLo, 5);
     280        3234 :     maskHi = _mm_srai_epi16(maskHi, 5);
     281             : 
     282             :     // Add two pixels into result.
     283             :     // result = dst + ((src - dst) * mask >> 5)
     284        3234 :     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
     285        3234 :     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
     286             : 
     287             :     // Pack into 4 32bit dst pixels.
     288             :     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
     289             :     // Merge into one SSE register with sixteen 8-bit values (four pixels),
     290             :     // clamping to 255 if necessary.
     291        3234 :     return _mm_packus_epi16(resultLo, resultHi);
     292             : }
     293             : 
     294        8009 : static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
     295             :                                        __m128i &mask) {
     296             :     // In the following comments, the components of src, dst and mask are
     297             :     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
     298             :     // by an R, G, B, or A suffix. Components of one of the four pixels that
     299             :     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
     300             :     // example is the blue channel of the second destination pixel. Memory
     301             :     // layout is shown for an ARGB byte order in a color value.
     302             : 
     303             :     // src stores 8-bit values interleaved with zeros.
     304             :     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
     305             :     // mask stores 16-bit values (shown as high and low bytes) interleaved with
     306             :     // zeros
     307             :     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
     308             :     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
     309             : 
     310             :     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
     311             :     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
     312       16018 :     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
     313        8009 :                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
     314             : 
     315             :     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
     316       16018 :     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
     317        8009 :                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
     318             : 
     319             :     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
     320        8009 :     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
     321        8009 :                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
     322             : 
     323             :     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
     324             :     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
     325             :     // 8-bit position
     326             :     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
     327             :     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
     328       16018 :     mask = _mm_or_si128(_mm_or_si128(r, g), b);
     329             : 
     330             :     // Interleave R,G,B into the lower byte of word.
     331             :     // i.e. split the sixteen 8-bit values from mask into two sets of eight
     332             :     // 16-bit values, padded by zero.
     333             :     __m128i maskLo, maskHi;
     334             :     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
     335       16018 :     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
     336             :     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
     337       16018 :     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
     338             : 
     339             :     // Upscale from 0..31 to 0..32
     340             :     // (allows to replace division by right-shift further down)
     341             :     // Right-shift each component by 4 and add the result back to that component,
     342             :     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
     343       16018 :     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
     344       16018 :     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
     345             : 
     346             :     // Interleave R,G,B into the lower byte of the word
     347             :     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
     348       16018 :     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
     349             :     // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
     350       16018 :     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
     351             : 
     352             :     // mask = (src - dst) * mask
     353       24027 :     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
     354       24027 :     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
     355             : 
     356             :     // mask = (src - dst) * mask >> 5
     357        8009 :     maskLo = _mm_srai_epi16(maskLo, 5);
     358        8009 :     maskHi = _mm_srai_epi16(maskHi, 5);
     359             : 
     360             :     // Add two pixels into result.
     361             :     // result = dst + ((src - dst) * mask >> 5)
     362        8009 :     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
     363        8009 :     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
     364             : 
     365             :     // Pack into 4 32bit dst pixels and force opaque.
     366             :     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
     367             :     // Merge into one SSE register with sixteen 8-bit values (four pixels),
     368             :     // clamping to 255 if necessary. Set alpha components to 0xFF.
     369       16018 :     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
     370        8009 :                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
     371             : }
     372             : 
     373        1620 : void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
     374             :                          SkColor src, int width, SkPMColor) {
     375        1620 :     if (width <= 0) {
     376           0 :         return;
     377             :     }
     378             : 
     379        1620 :     int srcA = SkColorGetA(src);
     380        1620 :     int srcR = SkColorGetR(src);
     381        1620 :     int srcG = SkColorGetG(src);
     382        1620 :     int srcB = SkColorGetB(src);
     383             : 
     384        1620 :     srcA = SkAlpha255To256(srcA);
     385             : 
     386        1620 :     if (width >= 4) {
     387        1620 :         SkASSERT(((size_t)dst & 0x03) == 0);
     388        6350 :         while (((size_t)dst & 0x0F) != 0) {
     389        2365 :             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
     390        2365 :             mask++;
     391        2365 :             dst++;
     392        2365 :             width--;
     393             :         }
     394             : 
     395        1620 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
     396             :         // Set alpha to 0xFF and replicate source four times in SSE register.
     397        3240 :         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
     398             :         // Interleave with zeros to get two sets of four 16-bit values.
     399        3240 :         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
     400             :         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
     401             :         // Set srcA_sse to contain eight copies of srcA, padded with zero.
     402        3240 :         __m128i srcA_sse = _mm_set1_epi16(srcA);
     403        9940 :         while (width >= 4) {
     404             :             // Load four destination pixels into dst_sse.
     405        4160 :             __m128i dst_sse = _mm_load_si128(d);
     406             :             // Load four 16-bit masks into lower half of mask_sse.
     407        4160 :             __m128i mask_sse = _mm_loadl_epi64(
     408        4160 :                                    reinterpret_cast<const __m128i*>(mask));
     409             : 
     410             :             // Compare each 16-bit mask with zero and collect the highest bit
     411             :             // of every byte of the comparison result. pack_cmp equals 0xFFFF
     412             :             // only when all four masks are zero.
     413        8320 :             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
     414        4160 :                                              _mm_setzero_si128()));
     415             : 
     416             :             // if mask pixels are not all zero, we will blend the dst pixels
     417        4160 :             if (pack_cmp != 0xFFFF) {
     418             :                 // Unpack 4 16bit mask pixels to
     419             :                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
     420             :                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
     421        6468 :                 mask_sse = _mm_unpacklo_epi16(mask_sse,
     422             :                                               _mm_setzero_si128());
     423             : 
     424             :                 // Process 4 32bit dst pixels
     425             :                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
     426        3234 :                                                    mask_sse, srcA_sse);
     427             :                 _mm_store_si128(d, result);
     428             :             }
     429             : 
     430        4160 :             d++;
     431        4160 :             mask += 4;
     432        4160 :             width -= 4;
     433             :         }
     434             : 
     435        1620 :         dst = reinterpret_cast<SkPMColor*>(d);
     436             :     }
     437             : 
     438        6514 :     while (width > 0) {
     439        2447 :         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
     440        2447 :         mask++;
     441        2447 :         dst++;
     442        2447 :         width--;
     443             :     }
     444             : }
     445             : 
     446        3913 : void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
     447             :                                SkColor src, int width, SkPMColor opaqueDst) {
     448        3913 :     if (width <= 0) {
     449           0 :         return;
     450             :     }
     451             : 
     452        3913 :     int srcR = SkColorGetR(src);
     453        3913 :     int srcG = SkColorGetG(src);
     454        3913 :     int srcB = SkColorGetB(src);
     455             : 
     456        3913 :     if (width >= 4) {
     457        3913 :         SkASSERT(((size_t)dst & 0x03) == 0);
     458       15813 :         while (((size_t)dst & 0x0F) != 0) {
     459        5950 :             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
     460        5950 :             mask++;
     461        5950 :             dst++;
     462        5950 :             width--;
     463             :         }
     464             : 
     465        3913 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
     466             :         // Set alpha to 0xFF and replicate source four times in SSE register.
     467        7826 :         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
     468             :         // Interleave with zeros to get two sets of four 16-bit values.
     469             :         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
     470        7826 :         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
     471       24721 :         while (width >= 4) {
     472             :             // Load four destination pixels into dst_sse.
     473       10404 :             __m128i dst_sse = _mm_load_si128(d);
     474             :             // Load four 16-bit masks into lower half of mask_sse.
     475       10404 :             __m128i mask_sse = _mm_loadl_epi64(
     476       10404 :                                    reinterpret_cast<const __m128i*>(mask));
     477             : 
     478             :             // Compare each 16-bit mask with zero and collect the highest bit
     479             :             // of every byte of the comparison result. pack_cmp equals 0xFFFF
     480             :             // only when all four masks are zero.
     481       20808 :             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
     482       10404 :                                              _mm_setzero_si128()));
     483             : 
     484             :             // if mask pixels are not all zero, we will blend the dst pixels
     485       10404 :             if (pack_cmp != 0xFFFF) {
     486             :                 // Unpack 4 16bit mask pixels to
     487             :                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
     488             :                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
     489       16018 :                 mask_sse = _mm_unpacklo_epi16(mask_sse,
     490             :                                               _mm_setzero_si128());
     491             : 
     492             :                 // Process 4 32bit dst pixels
     493             :                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
     494        8009 :                                                          mask_sse);
     495             :                 _mm_store_si128(d, result);
     496             :             }
     497             : 
     498       10404 :             d++;
     499       10404 :             mask += 4;
     500       10404 :             width -= 4;
     501             :         }
     502             : 
     503        3913 :         dst = reinterpret_cast<SkPMColor*>(d);
     504             :     }
     505             : 
     506       15635 :     while (width > 0) {
     507        5861 :         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
     508        5861 :         mask++;
     509        5861 :         dst++;
     510        5861 :         width--;
     511             :     }
     512             : }
     513             : 
     514             : /* SSE2 version of S32_D565_Opaque()
     515             :  * portable version is in core/SkBlitRow_D16.cpp
                     :  *
                     :  * Converts count 32-bit src pixels into 16-bit dst pixels.
                     :  * alpha must be 255 (asserted); the trailing x and y parameters are
                     :  * unnamed and unused. Nothing is written when count <= 0.
                     :  *
                     :  * When count >= 8, pixels are converted one at a time with
                     :  * SkPixel32ToPixel16_ToU16() until dst is 16-byte aligned, then 8 at a
                     :  * time with SkPixel32ToPixel16_ToU16_SSE2(). Whatever is left over is
                     :  * converted one at a time again.
                     :  * NOTE(review): the alignment loop advances dst by 2 bytes per pixel, so
                     :  * it only terminates if dst starts 2-byte aligned - confirm with callers.
     516             :  */
     517           0 : void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
     518             :                           const SkPMColor* SK_RESTRICT src, int count,
     519             :                           U8CPU alpha, int /*x*/, int /*y*/) {
     520           0 :     SkASSERT(255 == alpha);
     521             : 
     522           0 :     if (count <= 0) {
     523           0 :         return;
     524             :     }
     525             : 
     526           0 :     if (count >= 8) {
     527           0 :         while (((size_t)dst & 0x0F) != 0) {  // scalar steps until dst is 16-byte aligned
     528           0 :             SkPMColor c = *src++;
     529           0 :             SkPMColorAssert(c);
     530             : 
     531           0 :             *dst++ = SkPixel32ToPixel16_ToU16(c);
     532           0 :             count--;
     533             :         }
     534             : 
     535           0 :         const __m128i* s = reinterpret_cast<const __m128i*>(src);
     536           0 :         __m128i* d = reinterpret_cast<__m128i*>(dst);
     537             : 
     538           0 :         while (count >= 8) {
     539             :             // Load 8 pixels of src (two unaligned 128-bit loads; only dst was aligned).
     540           0 :             __m128i src_pixel1 = _mm_loadu_si128(s++);
     541           0 :             __m128i src_pixel2 = _mm_loadu_si128(s++);
     542             : 
     543           0 :             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
     544           0 :             _mm_store_si128(d++, d_pixel);  // aligned store of 8 dst pixels
     545           0 :             count -= 8;
     546             :         }
     547           0 :         src = reinterpret_cast<const SkPMColor*>(s);  // hand the cursors back to the scalar tail
     548           0 :         dst = reinterpret_cast<uint16_t*>(d);
     549             :     }
     550             : 
     551           0 :     if (count > 0) {  // fewer than 8 pixels remain (or count was < 8 to begin with)
     552           0 :         do {
     553           0 :             SkPMColor c = *src++;
     554           0 :             SkPMColorAssert(c);
     555           0 :             *dst++ = SkPixel32ToPixel16_ToU16(c);
     556             :         } while (--count != 0);
     557             :     }
     558             : }
     559             : 
     560             : /* SSE2 version of S32A_D565_Opaque()
     561             :  * portable version is in core/SkBlitRow_D16.cpp
                     :  *
                     :  * Blends count 32-bit src pixels over 16-bit dst pixels in place.
                     :  * alpha must be 255 (asserted); the trailing x and y parameters are
                     :  * unnamed and unused. Nothing is written when count <= 0.
                     :  *
                     :  * When count >= 8, pixels are blended one at a time with
                     :  * SkSrcOver32To16() until dst is 16-byte aligned, then 8 at a time with
                     :  * SSE2 arithmetic that follows SkSrcOver32To16(). Groups of 8 src pixels
                     :  * that are all zero are skipped without touching dst. Leftover pixels
                     :  * are blended one at a time.
     562             :  */
     563           0 : void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
     564             :                            const SkPMColor* SK_RESTRICT src,
     565             :                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
     566           0 :     SkASSERT(255 == alpha);
     567             : 
     568           0 :     if (count <= 0) {
     569           0 :         return;
     570             :     }
     571             : 
     572           0 :     if (count >= 8) {
     573             :         // Blend one pixel at a time until dst is 16-byte aligned.
     574           0 :         while (((size_t)dst & 0x0F) != 0) {
     575           0 :             SkPMColor c = *src++;  // NOTE(review): the tail loop also calls SkPMColorAssert(c); this loop does not.
     576           0 :             if (c) {  // a src pixel of 0 leaves dst unchanged
     577           0 :               *dst = SkSrcOver32To16(c, *dst);
     578             :             }
     579           0 :             dst += 1;
     580           0 :             count--;
     581             :         }
     582             : 
     583           0 :         const __m128i* s = reinterpret_cast<const __m128i*>(src);
     584           0 :         __m128i* d = reinterpret_cast<__m128i*>(dst);
     585           0 :         __m128i var255 = _mm_set1_epi16(255);
     586           0 :         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
     587           0 :         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
     588           0 :         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
     589             : 
     590           0 :         while (count >= 8) {
     591             :             // Load 8 pixels of src (two unaligned 128-bit loads).
     592           0 :             __m128i src_pixel1 = _mm_loadu_si128(s++);
     593           0 :             __m128i src_pixel2 = _mm_loadu_si128(s++);
     594             : 
     595             :             // Check whether src pixels are equal to 0 and get the highest bit
     596             :             // of each byte of the result. If the src pixels are all zero,
     597             :             // src_cmp1 and src_cmp2 will both be 0xFFFF.
     598           0 :             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
     599           0 :                                              _mm_setzero_si128()));
     600           0 :             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
     601           0 :                                              _mm_setzero_si128()));
     602           0 :             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {  // all 8 src pixels are zero: skip, dst untouched
     603           0 :                 d++;
     604           0 :                 count -= 8;
     605           0 :                 continue;
     606             :             }
     607             : 
     608             :             // Load 8 pixels of dst (aligned load).
     609           0 :             __m128i dst_pixel = _mm_load_si128(d);
     610             : 
     611             :             // Extract A from src: shift left then right by 24 isolates the byte, then pack to 8 x 16-bit.
     612           0 :             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
     613           0 :             sa1 = _mm_srli_epi32(sa1, 24);
     614           0 :             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
     615           0 :             sa2 = _mm_srli_epi32(sa2, 24);
     616           0 :             __m128i sa = _mm_packs_epi32(sa1, sa2);
     617             : 
     618             :             // Extract R from src.
     619           0 :             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
     620           0 :             sr1 = _mm_srli_epi32(sr1, 24);
     621           0 :             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
     622           0 :             sr2 = _mm_srli_epi32(sr2, 24);
     623           0 :             __m128i sr = _mm_packs_epi32(sr1, sr2);
     624             : 
     625             :             // Extract G from src.
     626           0 :             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
     627           0 :             sg1 = _mm_srli_epi32(sg1, 24);
     628           0 :             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
     629           0 :             sg2 = _mm_srli_epi32(sg2, 24);
     630           0 :             __m128i sg = _mm_packs_epi32(sg1, sg2);
     631             : 
     632             :             // Extract B from src.
     633           0 :             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
     634           0 :             sb1 = _mm_srli_epi32(sb1, 24);
     635           0 :             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
     636           0 :             sb2 = _mm_srli_epi32(sb2, 24);
     637           0 :             __m128i sb = _mm_packs_epi32(sb1, sb2);
     638             : 
     639             :             // Extract R G B from dst (shift each field down, then mask it).
     640           0 :             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
     641           0 :             dr = _mm_and_si128(dr, r16_mask);
     642           0 :             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
     643           0 :             dg = _mm_and_si128(dg, g16_mask);
     644           0 :             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
     645           0 :             db = _mm_and_si128(db, b16_mask);
     646             : 
     647           0 :             __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa, the inverse src alpha per lane
     648             : 
     649             :             // Calculate R G B of result.
     650             :             // Original algorithm is in SkSrcOver32To16().
     651           0 :             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
     652           0 :             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
     653           0 :             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
     654           0 :             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
     655           0 :             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
     656           0 :             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
     657             : 
     658             :             // Pack R G B into 16-bit color.
     659           0 :             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
     660             : 
     661             :             // Store 8 16-bit colors in dst.
     662           0 :             _mm_store_si128(d++, d_pixel);
     663           0 :             count -= 8;
     664             :         }
     665             : 
     666           0 :         src = reinterpret_cast<const SkPMColor*>(s);  // hand the cursors back to the scalar tail
     667           0 :         dst = reinterpret_cast<uint16_t*>(d);
     668             :     }
     669             : 
     670           0 :     if (count > 0) {  // fewer than 8 pixels remain (or count was < 8 to begin with)
     671           0 :         do {
     672           0 :             SkPMColor c = *src++;
     673           0 :             SkPMColorAssert(c);
     674           0 :             if (c) {
     675           0 :                 *dst = SkSrcOver32To16(c, *dst);
     676             :             }
     677           0 :             dst += 1;
     678             :         } while (--count != 0);
     679             :     }
     680             : }
     681             : /* SSE2 routine that converts 32-bit src pixels to dithered 16-bit dst pixels.
                     :  *
                     :  * alpha must be 255 (asserted). Nothing is written when count <= 0.
                     :  * x selects the starting column and y the row of the dither pattern; both
                     :  * are reduced with "& 3" when the vector dither values are built.
                     :  *
                     :  * When count >= 8, pixels are converted one at a time with
                     :  * SkDitherRGB32To565() until dst is 16-byte aligned, then 8 at a time with
                     :  * SSE2. Leftover pixels are converted one at a time.
                     :  * NOTE(review): the portable counterpart is presumably
                     :  * S32_D565_Opaque_Dither() in core/SkBlitRow_D16.cpp - confirm.
                     :  */
     682           0 : void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
     683             :                                  const SkPMColor* SK_RESTRICT src,
     684             :                                  int count, U8CPU alpha, int x, int y) {
     685           0 :     SkASSERT(255 == alpha);
     686             : 
     687           0 :     if (count <= 0) {
     688           0 :         return;
     689             :     }
     690             : 
     691           0 :     if (count >= 8) {
     692           0 :         while (((size_t)dst & 0x0F) != 0) {  // scalar steps until dst is 16-byte aligned
     693           0 :             DITHER_565_SCAN(y);  // macro defined elsewhere; presumably sets up the dither row for y
     694           0 :             SkPMColor c = *src++;
     695           0 :             SkPMColorAssert(c);
     696             : 
     697           0 :             unsigned dither = DITHER_VALUE(x);
     698           0 :             *dst++ = SkDitherRGB32To565(c, dither);
     699           0 :             DITHER_INC_X(x);
     700           0 :             count--;
     701             :         }
     702             : 
     703             :         unsigned short dither_value[8];  // one dither value per lane; lanes 4..7 repeat lanes 0..3
     704             :         __m128i dither;
     705             : #ifdef ENABLE_DITHER_MATRIX_4X4
     706             :         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
     707             :         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
     708             :         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
     709             :         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
     710             :         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
     711             : #else
     712           0 :         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];  // four 4-bit entries, extracted by column below
     713           0 :         dither_value[0] = dither_value[4] = (dither_scan
     714           0 :                                              >> (((x) & 3) << 2)) & 0xF;
     715           0 :         dither_value[1] = dither_value[5] = (dither_scan
     716           0 :                                              >> (((x + 1) & 3) << 2)) & 0xF;
     717           0 :         dither_value[2] = dither_value[6] = (dither_scan
     718           0 :                                              >> (((x + 2) & 3) << 2)) & 0xF;
     719           0 :         dither_value[3] = dither_value[7] = (dither_scan
     720           0 :                                              >> (((x + 3) & 3) << 2)) & 0xF;
     721             : #endif
     722           0 :         dither = _mm_loadu_si128((__m128i*) dither_value);
     723             : 
     724           0 :         const __m128i* s = reinterpret_cast<const __m128i*>(src);
     725           0 :         __m128i* d = reinterpret_cast<__m128i*>(dst);
     726             : 
     727           0 :         while (count >= 8) {
     728             :             // Load 8 pixels of src (two unaligned 128-bit loads).
     729           0 :             __m128i src_pixel1 = _mm_loadu_si128(s++);
     730           0 :             __m128i src_pixel2 = _mm_loadu_si128(s++);
     731             : 
     732             :             // Extract R from src.
     733           0 :             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
     734           0 :             sr1 = _mm_srli_epi32(sr1, 24);
     735           0 :             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
     736           0 :             sr2 = _mm_srli_epi32(sr2, 24);
     737           0 :             __m128i sr = _mm_packs_epi32(sr1, sr2);
     738             : 
     739             :             // SkDITHER_R32To565(sr, dither): (sr + dither - (sr >> 5)) >> (SK_R32_BITS - SK_R16_BITS)
     740           0 :             __m128i sr_offset = _mm_srli_epi16(sr, 5);
     741           0 :             sr = _mm_add_epi16(sr, dither);
     742           0 :             sr = _mm_sub_epi16(sr, sr_offset);
     743           0 :             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
     744             : 
     745             :             // Extract G from src.
     746           0 :             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
     747           0 :             sg1 = _mm_srli_epi32(sg1, 24);
     748           0 :             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
     749           0 :             sg2 = _mm_srli_epi32(sg2, 24);
     750           0 :             __m128i sg = _mm_packs_epi32(sg1, sg2);
     751             : 
     752             :             // Green analogue: (sg + (dither >> 1) - (sg >> 6)) >> (SK_G32_BITS - SK_G16_BITS)
     753           0 :             __m128i sg_offset = _mm_srli_epi16(sg, 6);
     754           0 :             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
     755           0 :             sg = _mm_sub_epi16(sg, sg_offset);
     756           0 :             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
     757             : 
     758             :             // Extract B from src.
     759           0 :             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
     760           0 :             sb1 = _mm_srli_epi32(sb1, 24);
     761           0 :             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
     762           0 :             sb2 = _mm_srli_epi32(sb2, 24);
     763           0 :             __m128i sb = _mm_packs_epi32(sb1, sb2);
     764             : 
     765             :             // Blue analogue: (sb + dither - (sb >> 5)) >> (SK_B32_BITS - SK_B16_BITS)
     766           0 :             __m128i sb_offset = _mm_srli_epi16(sb, 5);
     767           0 :             sb = _mm_add_epi16(sb, dither);
     768           0 :             sb = _mm_sub_epi16(sb, sb_offset);
     769           0 :             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
     770             : 
     771             :             // Pack and store 8 16-bit dst pixels (aligned store).
     772           0 :             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
     773           0 :             _mm_store_si128(d++, d_pixel);
     774             : 
     775           0 :             count -= 8;
     776           0 :             x += 8;  // a multiple of 4, so the precomputed dither lanes stay in phase
     777             :         }
     778             : 
     779           0 :         src = reinterpret_cast<const SkPMColor*>(s);  // hand the cursors back to the scalar tail
     780           0 :         dst = reinterpret_cast<uint16_t*>(d);
     781             :     }
     782             : 
     783           0 :     if (count > 0) {  // fewer than 8 pixels remain (or count was < 8 to begin with)
     784           0 :         DITHER_565_SCAN(y);
     785           0 :         do {
     786           0 :             SkPMColor c = *src++;
     787           0 :             SkPMColorAssert(c);
     788             : 
     789           0 :             unsigned dither = DITHER_VALUE(x);
     790           0 :             *dst++ = SkDitherRGB32To565(c, dither);
     791           0 :             DITHER_INC_X(x);
     792             :         } while (--count != 0);
     793             :     }
     794             : }
     795             : 
     796             : /* SSE2 version of S32A_D565_Opaque_Dither()
     797             :  * portable version is in core/SkBlitRow_D16.cpp
                     :  *
                     :  * Blends count 32-bit src pixels over 16-bit dst pixels in place, with
                     :  * dithering. alpha must be 255 (asserted). Nothing is written when
                     :  * count <= 0. x selects the starting column and y the row of the dither
                     :  * pattern; both are reduced with "& 3" when the vector dither values are
                     :  * built.
                     :  *
                     :  * When count >= 8, pixels are blended one at a time until dst is 16-byte
                     :  * aligned, then 8 at a time with SSE2. Leftover pixels are blended one at
                     :  * a time. The pre-alignment loop and the tail loop use the same scalar
                     :  * code, which leaves dst unchanged for a src pixel of 0.
     798             :  */
     799           0 : void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
     800             :                                   const SkPMColor* SK_RESTRICT src,
     801             :                                   int count, U8CPU alpha, int x, int y) {
     802           0 :     SkASSERT(255 == alpha);
     803             : 
     804           0 :     if (count <= 0) {
     805           0 :         return;
     806             :     }
     807             : 
     808           0 :     if (count >= 8) {
     809           0 :         while (((size_t)dst & 0x0F) != 0) {  // scalar steps until dst is 16-byte aligned
     810           0 :             DITHER_565_SCAN(y);  // macro defined elsewhere; presumably sets up the dither row for y
     811           0 :             SkPMColor c = *src++;
     812           0 :             SkPMColorAssert(c);
     813           0 :             if (c) {
     814           0 :                 unsigned a = SkGetPackedA32(c);
     815             : 
     816           0 :                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));  // dither value scaled by src alpha
     817             : 
     818           0 :                 unsigned sr = SkGetPackedR32(c);
     819           0 :                 unsigned sg = SkGetPackedG32(c);
     820           0 :                 unsigned sb = SkGetPackedB32(c);
     821           0 :                 sr = SkDITHER_R32_FOR_565(sr, d);
     822           0 :                 sg = SkDITHER_G32_FOR_565(sg, d);
     823           0 :                 sb = SkDITHER_B32_FOR_565(sb, d);
     824             : 
     825           0 :                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
     826           0 :                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
     827           0 :                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
     828             :                 // now src and dst expanded are in g:11 r:10 x:1 b:10
     829           0 :                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
     830             :             }
     831           0 :             dst += 1;
     832           0 :             DITHER_INC_X(x);
     833           0 :             count--;
     834             :         }
     835             : 
     836             :         unsigned short dither_value[8];  // one dither value per lane; lanes 4..7 repeat lanes 0..3
     837             :         __m128i dither, dither_cur;
     838             : #ifdef ENABLE_DITHER_MATRIX_4X4
     839             :         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
     840             :         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
     841             :         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
     842             :         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
     843             :         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
     844             : #else
     845           0 :         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];  // four 4-bit entries, extracted by column below
     846           0 :         dither_value[0] = dither_value[4] = (dither_scan
     847           0 :                                              >> (((x) & 3) << 2)) & 0xF;
     848           0 :         dither_value[1] = dither_value[5] = (dither_scan
     849           0 :                                              >> (((x + 1) & 3) << 2)) & 0xF;
     850           0 :         dither_value[2] = dither_value[6] = (dither_scan
     851           0 :                                              >> (((x + 2) & 3) << 2)) & 0xF;
     852           0 :         dither_value[3] = dither_value[7] = (dither_scan
     853           0 :                                              >> (((x + 3) & 3) << 2)) & 0xF;
     854             : #endif
     855           0 :         dither = _mm_loadu_si128((__m128i*) dither_value);
     856             : 
     857           0 :         const __m128i* s = reinterpret_cast<const __m128i*>(src);
     858           0 :         __m128i* d = reinterpret_cast<__m128i*>(dst);
     859           0 :         __m128i var256 = _mm_set1_epi16(256);
     860           0 :         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
     861           0 :         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
     862           0 :         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
     863             : 
     864           0 :         while (count >= 8) {
     865             :             // Load 8 pixels of src (unaligned) and dst (aligned).
     866           0 :             __m128i src_pixel1 = _mm_loadu_si128(s++);
     867           0 :             __m128i src_pixel2 = _mm_loadu_si128(s++);
     868           0 :             __m128i dst_pixel = _mm_load_si128(d);
     869             : 
     870             :             // Extract A from src: shift left then right by 24 isolates the byte, then pack to 8 x 16-bit.
     871           0 :             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
     872           0 :             sa1 = _mm_srli_epi32(sa1, 24);
     873           0 :             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
     874           0 :             sa2 = _mm_srli_epi32(sa2, 24);
     875           0 :             __m128i sa = _mm_packs_epi32(sa1, sa2);
     876             : 
     877             :             // Calculate current dither value: (dither * (sa + 1)) >> 8 per lane.
     878           0 :             dither_cur = _mm_mullo_epi16(dither,
     879           0 :                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
     880           0 :             dither_cur = _mm_srli_epi16(dither_cur, 8);
     881             : 
     882             :             // Extract R from src.
     883           0 :             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
     884           0 :             sr1 = _mm_srli_epi32(sr1, 24);
     885           0 :             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
     886           0 :             sr2 = _mm_srli_epi32(sr2, 24);
     887           0 :             __m128i sr = _mm_packs_epi32(sr1, sr2);
     888             : 
     889             :             // sr = SkDITHER_R32_FOR_565(sr, d): sr + d - (sr >> 5)
     890           0 :             __m128i sr_offset = _mm_srli_epi16(sr, 5);
     891           0 :             sr = _mm_add_epi16(sr, dither_cur);
     892           0 :             sr = _mm_sub_epi16(sr, sr_offset);
     893             : 
     894             :             // Expand sr.
     895           0 :             sr = _mm_slli_epi16(sr, 2);
     896             : 
     897             :             // Extract G from src.
     898           0 :             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
     899           0 :             sg1 = _mm_srli_epi32(sg1, 24);
     900           0 :             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
     901           0 :             sg2 = _mm_srli_epi32(sg2, 24);
     902           0 :             __m128i sg = _mm_packs_epi32(sg1, sg2);
     903             : 
     904             :             // sg = SkDITHER_G32_FOR_565(sg, d): sg + (d >> 1) - (sg >> 6)
     905           0 :             __m128i sg_offset = _mm_srli_epi16(sg, 6);
     906           0 :             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
     907           0 :             sg = _mm_sub_epi16(sg, sg_offset);
     908             : 
     909             :             // Expand sg.
     910           0 :             sg = _mm_slli_epi16(sg, 3);
     911             : 
     912             :             // Extract B from src.
     913           0 :             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
     914           0 :             sb1 = _mm_srli_epi32(sb1, 24);
     915           0 :             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
     916           0 :             sb2 = _mm_srli_epi32(sb2, 24);
     917           0 :             __m128i sb = _mm_packs_epi32(sb1, sb2);
     918             : 
     919             :             // sb = SkDITHER_B32_FOR_565(sb, d): sb + d - (sb >> 5)
     920           0 :             __m128i sb_offset = _mm_srli_epi16(sb, 5);
     921           0 :             sb = _mm_add_epi16(sb, dither_cur);
     922           0 :             sb = _mm_sub_epi16(sb, sb_offset);
     923             : 
     924             :             // Expand sb.
     925           0 :             sb = _mm_slli_epi16(sb, 2);
     926             : 
     927             :             // Extract R G B from dst (shift each field down, then mask it).
     928           0 :             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
     929           0 :             dr = _mm_and_si128(dr, r16_mask);
     930           0 :             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
     931           0 :             dg = _mm_and_si128(dg, g16_mask);
     932           0 :             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
     933           0 :             db = _mm_and_si128(db, b16_mask);
     934             : 
     935             :             // SkAlpha255To256(255 - a) >> 3, computed here as (256 - sa) >> 3
     936           0 :             __m128i isa = _mm_sub_epi16(var256, sa);
     937           0 :             isa = _mm_srli_epi16(isa, 3);
     938             : 
     939           0 :             dr = _mm_mullo_epi16(dr, isa);  // per channel: (dst * isa + expanded src) >> 5
     940           0 :             dr = _mm_add_epi16(dr, sr);
     941           0 :             dr = _mm_srli_epi16(dr, 5);
     942             : 
     943           0 :             dg = _mm_mullo_epi16(dg, isa);
     944           0 :             dg = _mm_add_epi16(dg, sg);
     945           0 :             dg = _mm_srli_epi16(dg, 5);
     946             : 
     947           0 :             db = _mm_mullo_epi16(db, isa);
     948           0 :             db = _mm_add_epi16(db, sb);
     949           0 :             db = _mm_srli_epi16(db, 5);
     950             : 
     951             :             // Pack and store 8 16-bit dst pixels (aligned store).
     952           0 :             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
     953           0 :             _mm_store_si128(d++, d_pixel);
     954             : 
     955           0 :             count -= 8;
     956           0 :             x += 8;  // a multiple of 4, so the precomputed dither lanes stay in phase
     957             :         }
     958             : 
     959           0 :         src = reinterpret_cast<const SkPMColor*>(s);  // hand the cursors back to the scalar tail
     960           0 :         dst = reinterpret_cast<uint16_t*>(d);
     961             :     }
     962             : 
     963           0 :     if (count > 0) {  // fewer than 8 pixels remain (or count was < 8 to begin with)
     964           0 :         DITHER_565_SCAN(y);
     965           0 :         do {
     966           0 :             SkPMColor c = *src++;
     967           0 :             SkPMColorAssert(c);
     968           0 :             if (c) {
     969           0 :                 unsigned a = SkGetPackedA32(c);
     970             : 
     971           0 :                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));  // dither value scaled by src alpha
     972             : 
     973           0 :                 unsigned sr = SkGetPackedR32(c);
     974           0 :                 unsigned sg = SkGetPackedG32(c);
     975           0 :                 unsigned sb = SkGetPackedB32(c);
     976           0 :                 sr = SkDITHER_R32_FOR_565(sr, d);
     977           0 :                 sg = SkDITHER_G32_FOR_565(sg, d);
     978           0 :                 sb = SkDITHER_B32_FOR_565(sb, d);
     979             : 
     980           0 :                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
     981           0 :                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
     982           0 :                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
     983             :                 // now src and dst expanded are in g:11 r:10 x:1 b:10
     984           0 :                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
     985             :             }
     986           0 :             dst += 1;
     987           0 :             DITHER_INC_X(x);
     988             :         } while (--count != 0);
     989             :     }
     990             : }

Generated by: LCOV version 1.13