Line data Source code
1 : /* This Source Code Form is subject to the terms of the Mozilla Public
2 : * License, v. 2.0. If a copy of the MPL was not distributed with this
3 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 :
5 : #include "Blur.h"
6 :
7 : #include "SSEHelpers.h"
8 :
9 : #include <string.h>
10 :
11 : namespace mozilla {
12 : namespace gfx {
13 :
// Divides each of the four packed 32-bit lanes in aValues by the divisor
// encoded in aDivisor, where every lane of aDivisor holds the 32.32
// fixed-point reciprocal (2^32 / divisor) of the true divisor. The result
// is rounded to nearest by adding 0.5 (1 << 31) before truncation.
MOZ_ALWAYS_INLINE
__m128i Divide(__m128i aValues, __m128i aDivisor)
{
  // Selects the upper 32 bits of each 64-bit lane (where the lane-2/lane-4
  // quotients land after the multiply).
  const __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
  // 0.5 in 32.32 fixed point, replicated into both 64-bit lanes.
  static const union {
    int64_t i64[2];
    __m128i m;
  } roundingAddition = { { int64_t(1) << 31, int64_t(1) << 31 } };

  // _mm_mul_epu32 only multiplies the even (0 and 2) 32-bit lanes, producing
  // two 64-bit products. Shifting aValues right by 32 first exposes the odd
  // lanes (1 and 3) so a second multiply covers them.
  __m128i multiplied31 = _mm_mul_epu32(aValues, aDivisor);
  __m128i multiplied42 = _mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor);

  // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
  // result is rounded.
  __m128i p_3_1 = _mm_srli_epi64(_mm_add_epi64(multiplied31, roundingAddition.m), 32);
  __m128i p4_2_ = _mm_and_si128(_mm_add_epi64(multiplied42, roundingAddition.m), mask);
  // Interleave the two partial results back into lane order 4,3,2,1.
  __m128i p4321 = _mm_or_si128(p_3_1, p4_2_);
  return p4321;
}
33 :
34 : MOZ_ALWAYS_INLINE
35 1140 : __m128i BlurFourPixels(const __m128i& aTopLeft, const __m128i& aTopRight,
36 : const __m128i& aBottomRight, const __m128i& aBottomLeft,
37 : const __m128i& aDivisor)
38 : {
39 4560 : __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(aBottomRight, aTopRight), aBottomLeft), aTopLeft);
40 1140 : return Divide(values, aDivisor);
41 : }
42 :
43 : MOZ_ALWAYS_INLINE
44 6 : void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
45 : int32_t aSourceWidth, int32_t aLeftInflation,
46 : int32_t aRightInflation)
47 : {
48 6 : int32_t currentRowSum = 0;
49 :
50 46 : for (int x = 0; x < aLeftInflation; x++) {
51 40 : currentRowSum += aSource[0];
52 40 : aDest[x] = currentRowSum;
53 : }
54 234 : for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
55 228 : currentRowSum += aSource[(x - aLeftInflation)];
56 228 : aDest[x] = currentRowSum;
57 : }
58 28 : for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
59 22 : currentRowSum += aSource[aSourceWidth - 1];
60 22 : aDest[x] = currentRowSum;
61 : }
62 6 : }
63 :
64 : // This function calculates an integral of four pixels stored in the 4
65 : // 32-bit integers on aPixels. i.e. for { 30, 50, 80, 100 } this returns
66 : // { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
67 : // much testing.
68 : MOZ_ALWAYS_INLINE
69 1406 : __m128i AccumulatePixelSums(__m128i aPixels)
70 : {
71 1406 : __m128i sumPixels = aPixels;
72 1406 : __m128i currentPixels = _mm_slli_si128(aPixels, 4);
73 1406 : sumPixels = _mm_add_epi32(sumPixels, currentPixels);
74 2812 : currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);
75 :
76 1406 : return _mm_add_epi32(sumPixels, currentPixels);
77 : }
78 :
79 : MOZ_ALWAYS_INLINE void
80 3 : GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
81 : int32_t aTopInflation, int32_t aBottomInflation,
82 : uint32_t *aIntegralImage, size_t aIntegralImageStride,
83 : uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
84 : {
85 3 : MOZ_ASSERT(!(aLeftInflation & 3));
86 :
87 3 : uint32_t stride32bit = aIntegralImageStride / 4;
88 :
89 3 : IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
90 6 : aSize.height + aTopInflation + aBottomInflation);
91 :
92 3 : LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
93 :
94 17 : for (int y = 1; y < aTopInflation + 1; y++) {
95 14 : uint32_t *intRow = aIntegralImage + (y * stride32bit);
96 14 : uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
97 14 : uint32_t *intFirstRow = aIntegralImage;
98 :
99 192 : for (int x = 0; x < integralImageSize.width; x += 4) {
100 356 : __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
101 356 : __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
102 178 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
103 : }
104 : }
105 :
106 114 : for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
107 111 : __m128i currentRowSum = _mm_setzero_si128();
108 111 : uint32_t *intRow = aIntegralImage + (y * stride32bit);
109 111 : uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
110 111 : uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
111 :
112 111 : uint32_t pixel = sourceRow[0];
113 296 : for (int x = 0; x < aLeftInflation; x += 4) {
114 370 : __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));
115 :
116 370 : sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
117 :
118 185 : currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
119 :
120 555 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
121 : }
122 1221 : for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
123 1110 : uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));
124 :
125 : // It's important to shuffle here. When we exit this loop currentRowSum
126 : // has to be set to sumPixels, so that the following loop can get the
127 : // correct pixel for the currentRowSum. The highest order pixel in
128 : // currentRowSum could've originated from accumulation in the stride.
129 1110 : currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
130 :
131 5550 : __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
132 2220 : sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
133 :
134 1110 : currentRowSum = sumPixels;
135 :
136 3330 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
137 : }
138 :
139 111 : pixel = sourceRow[aSize.width - 1];
140 111 : int x = (aSize.width + aLeftInflation);
141 111 : if ((aSize.width & 3)) {
142 : // Deal with unaligned portion. Get the correct pixel from currentRowSum,
143 : // see explanation above.
144 111 : uint32_t intCurrentRowSum = ((uint32_t*)¤tRowSum)[(aSize.width % 4) - 1];
145 555 : for (; x < integralImageSize.width; x++) {
146 : // We could be unaligned here!
147 333 : if (!(x & 3)) {
148 : // aligned!
149 222 : currentRowSum = _mm_set1_epi32(intCurrentRowSum);
150 111 : break;
151 : }
152 222 : intCurrentRowSum += pixel;
153 222 : intRow[x] = intPrevRow[x] + intCurrentRowSum;
154 : }
155 : } else {
156 0 : currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
157 : }
158 333 : for (; x < integralImageSize.width; x += 4) {
159 222 : __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));
160 :
161 222 : sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
162 :
163 111 : currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
164 :
165 333 : _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
166 : }
167 : }
168 :
169 3 : if (aBottomInflation) {
170 : // Store the last valid row of our source image in the last row of
171 : // our integral image. This will be overwritten with the correct values
172 : // in the upcoming loop.
173 3 : LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
174 6 : aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
175 :
176 :
177 14 : for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
178 11 : __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
179 11 : __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
180 11 : __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);
181 :
182 150 : for (int x = 0; x < integralImageSize.width; x += 4) {
183 417 : _mm_store_si128(intRow + (x / 4),
184 139 : _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
185 139 : _mm_load_si128(intPrevRow + (x / 4))));
186 : }
187 : }
188 : }
189 3 : }
190 :
/**
 * Attempt to do an in-place box blur using an integral image.
 *
 * aData is the surface being blurred in place. aIntegralImage is
 * caller-provided scratch space with aIntegralImageStride bytes per row,
 * sized for the surface inflated by the lobes (left lobe rounded up to a
 * multiple of 4 for SIMD alignment). The lobes give the blur window's
 * extent on each side of a pixel.
 */
void
AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData,
                           int32_t aLeftLobe,
                           int32_t aRightLobe,
                           int32_t aTopLobe,
                           int32_t aBottomLobe,
                           uint32_t *aIntegralImage,
                           size_t aIntegralImageStride)
{
  IntSize size = GetSize();

  MOZ_ASSERT(size.height > 0);

  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  // looking at an integral image the value of a pixel at 'x,y' is calculated
  // using the value of the integral image values above/below that.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  // A 1x1 box is the identity blur; nothing to do.
  if (boxSize == 1) {
    return;
  }

  // 32.32 fixed-point reciprocal of the box area; Divide() multiplies by
  // this instead of performing a per-pixel integer division.
  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

  uint32_t stride32bit = aIntegralImageStride / 4;
  // Round the left inflation up so integral-image rows stay 16-byte aligned.
  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                             aIntegralImage, aIntegralImageStride, aData,
                             mStride, size);

  __m128i divisor = _mm_set1_epi32(reciprocal);

  // This points to the start of the rectangle within the IntegralImage that overlaps
  // the surface being blurred.
  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;

  // NOTE(review): mSkipRect marks a region whose blurred output is skipped —
  // presumably unused by the caller; the x-jumps below fast-forward past it.
  IntRect skipRect = mSkipRect;
  int32_t stride = mStride;
  uint8_t *data = aData;
  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    // Corner-pointer base rows of the box window for this output row.
    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);

    int32_t x = 0;
    // Process 16 pixels at a time for as long as possible.
    for (; x <= size.width - 16; x += 16) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        // Jump so the next iteration's x += 16 lands just past the skip rect.
        x = skipRect.XMost() - 16;
        // Trigger early jump on coming loop iterations, this will be reset
        // next line anyway.
        inSkipRectY = false;
        continue;
      }

      __m128i topLeft;
      __m128i topRight;
      __m128i bottomRight;
      __m128i bottomLeft;

      // Four groups of four pixels; each group is blurred independently.
      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
      __m128i result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 4));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 4));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 4));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 4));
      __m128i result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 8));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 8));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 8));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 8));
      __m128i result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 12));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 12));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 12));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 12));
      __m128i result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      // Narrow the 16 32-bit results down to 16 bytes and store in place.
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result1, result2), _mm_packs_epi32(result3, result4));

      _mm_storeu_si128((__m128i*)(data + stride * y + x), final);
    }

    // Process the remaining pixels 4 bytes at a time.
    // NOTE(review): the tail stores a full uint32_t even when fewer than 4
    // pixels remain — assumes the row stride leaves room past size.width.
    for (; x < size.width; x += 4) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 4;
        // Trigger early jump on coming loop iterations, this will be reset
        // next line anyway.
        inSkipRectY = false;
        continue;
      }
      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));

      __m128i result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result, _mm_setzero_si128()), _mm_setzero_si128());

      *(uint32_t*)(data + stride * y + x) = _mm_cvtsi128_si32(final);
    }
  }

}
313 :
314 : } // namespace gfx
315 : } // namespace mozilla
|