/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include <emmintrin.h>

// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.

// 16-byte-aligned mask constants for the SSE2 loops below.
// greenMaski selects the green channel of each 32-bit BGRA pixel;
// alphaMaski selects the alpha channel.  The 16-byte alignment is
// required because they are loaded with _mm_load_si128, which faults
// on unaligned addresses.
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#else
// Portable C++11 fallback: previously no branch matched here, leaving the
// arrays undeclared and breaking the functions below on other x86 toolchains.
alignas(16) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
alignas(16) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
28 :
29 : bool
30 0 : gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
31 : const gfxImageSurface* whiteSurf)
32 : {
33 0 : mozilla::gfx::IntSize size = blackSurf->GetSize();
34 :
35 0 : if (size != whiteSurf->GetSize() ||
36 0 : (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
37 0 : blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
38 0 : (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
39 0 : whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
40 0 : return false;
41 :
42 0 : blackSurf->Flush();
43 0 : whiteSurf->Flush();
44 :
45 0 : unsigned char* blackData = blackSurf->Data();
46 0 : unsigned char* whiteData = whiteSurf->Data();
47 :
48 0 : if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
49 0 : (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
50 : // Cannot keep these in alignment.
51 0 : return false;
52 : }
53 :
54 0 : __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
55 0 : __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
56 :
57 0 : for (int32_t i = 0; i < size.height; ++i) {
58 0 : int32_t j = 0;
59 : // Loop single pixels until at 4 byte alignment.
60 0 : while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
61 0 : *((uint32_t*)blackData) =
62 0 : RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
63 : *reinterpret_cast<uint32_t*>(whiteData));
64 0 : blackData += 4;
65 0 : whiteData += 4;
66 0 : j++;
67 : }
68 : // This extra loop allows the compiler to do some more clever registry
69 : // management and makes it about 5% faster than with only the 4 pixel
70 : // at a time loop.
71 0 : for (; j < size.width - 8; j += 8) {
72 0 : __m128i black1 = _mm_load_si128((__m128i*)blackData);
73 0 : __m128i white1 = _mm_load_si128((__m128i*)whiteData);
74 0 : __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
75 0 : __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
76 :
77 : // Execute the same instructions as described in RecoverPixel, only
78 : // using an SSE2 packed saturated subtract.
79 0 : white1 = _mm_subs_epu8(white1, black1);
80 0 : white2 = _mm_subs_epu8(white2, black2);
81 0 : white1 = _mm_subs_epu8(greenMask, white1);
82 0 : white2 = _mm_subs_epu8(greenMask, white2);
83 : // Producing the final black pixel in an XMM register and storing
84 : // that is actually faster than doing a masked store since that
85 : // does an unaligned storage. We have the black pixel in a register
86 : // anyway.
87 0 : black1 = _mm_andnot_si128(alphaMask, black1);
88 0 : black2 = _mm_andnot_si128(alphaMask, black2);
89 0 : white1 = _mm_slli_si128(white1, 2);
90 0 : white2 = _mm_slli_si128(white2, 2);
91 0 : white1 = _mm_and_si128(alphaMask, white1);
92 0 : white2 = _mm_and_si128(alphaMask, white2);
93 0 : black1 = _mm_or_si128(white1, black1);
94 0 : black2 = _mm_or_si128(white2, black2);
95 :
96 : _mm_store_si128((__m128i*)blackData, black1);
97 0 : _mm_store_si128((__m128i*)(blackData + 16), black2);
98 0 : blackData += 32;
99 0 : whiteData += 32;
100 : }
101 0 : for (; j < size.width - 4; j += 4) {
102 0 : __m128i black = _mm_load_si128((__m128i*)blackData);
103 0 : __m128i white = _mm_load_si128((__m128i*)whiteData);
104 :
105 0 : white = _mm_subs_epu8(white, black);
106 0 : white = _mm_subs_epu8(greenMask, white);
107 0 : black = _mm_andnot_si128(alphaMask, black);
108 0 : white = _mm_slli_si128(white, 2);
109 0 : white = _mm_and_si128(alphaMask, white);
110 0 : black = _mm_or_si128(white, black);
111 : _mm_store_si128((__m128i*)blackData, black);
112 0 : blackData += 16;
113 0 : whiteData += 16;
114 : }
115 : // Loop single pixels until we're done.
116 0 : while (j < size.width) {
117 0 : *((uint32_t*)blackData) =
118 0 : RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
119 : *reinterpret_cast<uint32_t*>(whiteData));
120 0 : blackData += 4;
121 0 : whiteData += 4;
122 0 : j++;
123 : }
124 0 : blackData += blackSurf->Stride() - j * 4;
125 0 : whiteData += whiteSurf->Stride() - j * 4;
126 : }
127 :
128 0 : blackSurf->MarkDirty();
129 :
130 0 : return true;
131 : }
132 :
/**
 * Byte offset of the pixel at (aX, aY) from the previous
 * (1 << aAlignToLog2)-byte boundary, for a surface with byte stride
 * aStride.  aX is already in bytes; a result of 0 means "aligned".
 * The defaults make ByteAlignment(log2, x) test a plain byte offset.
 */
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}
138 :
139 : /*static*/ mozilla::gfx::IntRect
140 0 : gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect,
141 : gfxImageSurface* aSurface)
142 : {
143 0 : NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
144 : "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
145 0 : static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
146 : static const int32_t bpp = 4;
147 0 : static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
148 : //
149 : // We're going to create a subimage of the surface with size
150 : // <sw,sh> for alpha recovery, and want a SIMD fast-path. The
151 : // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
152 : // properly aligned for SIMD. So we want to find a rect <x',y',
153 : // w',h'> that's a superset of what needs to be redrawn but is
154 : // properly aligned. Proper alignment is
155 : //
156 : // BPP * (x' + y' * sw) \cong 0 (mod ALIGN)
157 : // BPP * w' \cong BPP * sw (mod ALIGN)
158 : //
159 : // (We assume the pixel at surface <0,0> is already ALIGN'd.)
160 : // That rect (obviously) has to fit within the surface bounds, and
161 : // we should also minimize the extra pixels redrawn only for
162 : // alignment's sake. So we also want
163 : //
164 : // minimize <x',y', w',h'>
165 : // 0 <= x' <= x
166 : // 0 <= y' <= y
167 : // w <= w' <= sw
168 : // h <= h' <= sh
169 : //
170 : // This is a messy integer non-linear programming problem, except
171 : // ... we can assume that ALIGN/BPP is a very small constant. So,
172 : // brute force is viable. The algorithm below will find a
173 : // solution if one exists, but isn't guaranteed to find the
174 : // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
175 : // most 64 iterations below). In what's likely the common case,
176 : // an already-aligned rectangle, it only needs 1 iteration.
177 : //
178 : // Is this alignment worth doing? Recovering alpha will take work
179 : // proportional to w*h (assuming alpha recovery computation isn't
180 : // memory bound). This analysis can lead to O(w+h) extra work
181 : // (with small constants). In exchange, we expect to shave off a
182 : // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
183 : // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
184 : // only really care about the w*h >> w+h case anyway; others
185 : // should be fast enough even with the overhead. (Unless the cost
186 : // of repainting the expanded rect is high, but in that case
187 : // SIMD-ized alpha recovery won't make a difference so this code
188 : // shouldn't be called.)
189 : //
190 0 : mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
191 0 : const int32_t stride = bpp * surfaceSize.width;
192 0 : if (stride != aSurface->Stride()) {
193 0 : NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
194 0 : return aRect;
195 : }
196 :
197 0 : const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
198 0 : const int32_t r = x + w;
199 0 : const int32_t sw = surfaceSize.width;
200 0 : const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
201 :
202 : // The outer two loops below keep the rightmost (|r| above) and
203 : // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
204 : // return only a superset of the original rect. These loops
205 : // search for an aligned top-left pixel by trying to expand <x,y>
206 : // left and up by <dx,dy> pixels, respectively.
207 : //
208 : // Then if a properly-aligned top-left pixel is found, the
209 : // innermost loop tries to find an aligned stride by moving the
210 : // rightmost pixel rightward by dr.
211 : int32_t dx, dy, dr;
212 0 : for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
213 0 : for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
214 0 : if (0 != ByteAlignment(kByteAlignLog2,
215 0 : bpp * (x - dx), y - dy, stride)) {
216 0 : continue;
217 : }
218 0 : for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
219 0 : if (strideAlign == ByteAlignment(kByteAlignLog2,
220 0 : bpp * (w + dr + dx))) {
221 0 : goto FOUND_SOLUTION;
222 : }
223 : }
224 : }
225 : }
226 :
227 : // Didn't find a solution.
228 0 : return aRect;
229 :
230 : FOUND_SOLUTION:
231 0 : mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
232 0 : MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
233 : "'Solution' extends outside surface bounds!");
234 0 : return solution;
235 : }
|