/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i
LoadRemainder_SSE2(const uint8_t* aSrc, size_t aLength)
{
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(px,
                              _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void
StoreRemainder_SSE2(uint8_t* aDst, size_t aLength, const __m128i& aSrc)
{
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
        _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}
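
// Note: for the 1- and 3-pixel cases above the unused upper lanes of the
// loaded vector are zero-filled (_mm_cvtsi32_si128 and _mm_loadl_epi64 zero
// the remaining bytes), so running the full 4-pixel math on them is harmless,
// and StoreRemainder_SSE2 writes back only the valid pixels, so memory past
// the remainder is never touched.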

// Premultiply vector of 4 pixels using splayed math.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
PremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }
  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));

  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}
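
// For reference, a scalar sketch of the per-channel math performed above
// (illustration only, not part of the SSE2 path): the +255 bias and the
// add-back of the high byte approximate a division by 255 using only adds
// and shifts.
#if 0
static uint8_t
PremultiplyChannel_Scalar(uint8_t aColor, uint8_t aAlpha)
{
  uint32_t t = uint32_t(aColor) * aAlpha + 255; // c*a + 255
  t += t >> 8;                                  // fold in the high byte
  return uint8_t(t >> 8);                       // ~= c*a/255
}
#endif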

template<bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                      uint8_t* aDst, int32_t aDstGap,
                      IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of premultiply variants here.
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
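
// A minimal usage sketch (hypothetical call site; the gap parameters are
// assumed to be the per-row byte gap, i.e. stride minus 4*width, as implied
// by the loops above):
#if 0
void
PremultiplyExample(const uint8_t* aSrcData, int32_t aSrcStride,
                   uint8_t* aDstData, int32_t aDstStride, IntSize aSize)
{
  int32_t srcGap = aSrcStride - 4 * aSize.width;
  int32_t dstGap = aDstStride - 4 * aSize.width;
  // <false, false>: keep the R/B order and preserve the alpha channel.
  Premultiply_SSE2<false, false>(aSrcData, srcGap, aDstData, dstGap, aSize);
}
#endif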

// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than those of larger alphas. We take advantage of this by
// shifting the reciprocal down by 8 bits when the alpha value is less than
// 0x20 and by 3 bits otherwise. This is easy to undo later by multiplying the
// color component being unpremultiplied by 0x100 or 8, respectively.
// The 16 bit reciprocal is duplicated into both words of a uint32_t here to
// reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
static const uint32_t sUnpremultiplyTable_SSE2[256] =
{
  0, UNPREMULQ_SSE2(1), UNPREMULQ_SSE2_2(2), UNPREMULQ_SSE2_4(4),
  UNPREMULQ_SSE2_8(8), UNPREMULQ_SSE2_16(16), UNPREMULQ_SSE2_32(32),
  UNPREMULQ_SSE2_32(64), UNPREMULQ_SSE2_32(96), UNPREMULQ_SSE2_32(128),
  UNPREMULQ_SSE2_32(160), UNPREMULQ_SSE2_32(192), UNPREMULQ_SSE2_32(224)
};
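
// Rough sanity check of two table entries (values computed from the macro
// above, shown only for illustration):
//   UNPREMULQ_SSE2(0xFF) = 0x10001 * (0xFF0220 / (0xFF * 8))     = 0x20002000
//     With the runtime scale of 8: c * 8 * 0x2000 >> 16 = c, so fully opaque
//     pixels pass through unchanged.
//   UNPREMULQ_SSE2(0x10) = 0x10001 * (0xFF0220 / (0x10 * 0x100)) = 0x0FF00FF0
//     With the runtime scale of 0x100: c * 0x100 * 0xFF0 >> 16 = (c * 255) >> 4.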

// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template<bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i
UnpremultiplyVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // The reciprocals are duplicated in each uint32_t entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                                   _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);

  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);

  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}
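
// The scalar equivalent of the vector math above, as a sketch for
// illustration only (the table lookup stands in for a true division by
// alpha; alpha itself is left untouched by the vector code):
#if 0
static uint8_t
UnpremultiplyChannel_Scalar(uint8_t aColor, uint8_t aAlpha)
{
  // Undo the precision shift baked into the table: 0x100 for small alphas,
  // 8 otherwise.
  uint32_t scale = aAlpha < 0x20 ? 0x100 : 8;
  // Both halves of the table entry hold the same 16 bit reciprocal.
  uint32_t recip = sUnpremultiplyTable_SSE2[aAlpha] & 0xFFFF;
  return uint8_t((aColor * scale * recip) >> 16); // ~= aColor * 255 / aAlpha
}
#endif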

template<bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                        uint8_t* aDst, int32_t aDstGap,
                        IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of unpremultiply variants here.
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Swizzle a vector of 4 pixels, swapping R and B and optionally forcing the
// alpha channel to opaque.
template<bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2(const __m128i& aSrc)
{
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}

#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, true>(const __m128i& aSrc)
{
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
{
  return aSrc;
}
#endif

template<bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap,
                  uint8_t* aDst, int32_t aDstGap,
                  IntSize aSize)
{
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    // Process all 4-pixel chunks as one vector.
    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
      aSrc += 4 * 4;
      aDst += 4 * 4;
    }

    // Handle any 1-3 remaining pixels.
    if (remainder) {
      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
      px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
      StoreRemainder_SSE2(aDst, remainder, px);
    }

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of swizzle variants here.
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

} // namespace gfx
} // namespace mozilla