/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include <emmintrin.h>

// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.

// 16-byte-aligned mask constants for the SSE2 loops below.
// greenMaski selects the green channel of each 32-bit BGRA pixel;
// alphaMaski selects the alpha channel.  The 16-byte alignment is
// required because they are loaded with _mm_load_si128, which faults
// on unaligned addresses.
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#else
// Portable C++11 fallback: previously no branch matched here, leaving the
// arrays undeclared and breaking the functions below on other x86 toolchains.
alignas(16) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
alignas(16) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
28 :
29 : bool
30 0 : gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
31 : const gfxImageSurface* whiteSurf)
32 : {
33 0 : mozilla::gfx::IntSize size = blackSurf->GetSize();
34 :
35 0 : if (size != whiteSurf->GetSize() ||
36 0 : (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
37 0 : blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
38 0 : (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
39 0 : whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
40 0 : return false;
41 :
42 0 : blackSurf->Flush();
43 0 : whiteSurf->Flush();
44 :
45 0 : unsigned char* blackData = blackSurf->Data();
46 0 : unsigned char* whiteData = whiteSurf->Data();
47 :
48 0 : if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
49 0 : (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
50 : // Cannot keep these in alignment.
51 0 : return false;
52 : }
53 :
54 0 : __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
55 0 : __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
56 :
57 0 : for (int32_t i = 0; i < size.height; ++i) {
58 0 : int32_t j = 0;
59 : // Loop single pixels until at 4 byte alignment.
60 0 : while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
61 0 : *((uint32_t*)blackData) =
62 0 : RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
63 : *reinterpret_cast<uint32_t*>(whiteData));
64 0 : blackData += 4;
65 0 : whiteData += 4;
66 0 : j++;
67 : }
68 : // This extra loop allows the compiler to do some more clever registry
69 : // management and makes it about 5% faster than with only the 4 pixel
70 : // at a time loop.
71 0 : for (; j < size.width - 8; j += 8) {
72 0 : __m128i black1 = _mm_load_si128((__m128i*)blackData);
73 0 : __m128i white1 = _mm_load_si128((__m128i*)whiteData);
74 0 : __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
75 0 : __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
76 :
77 : // Execute the same instructions as described in RecoverPixel, only
78 : // using an SSE2 packed saturated subtract.
79 0 : white1 = _mm_subs_epu8(white1, black1);
80 0 : white2 = _mm_subs_epu8(white2, black2);
81 0 : white1 = _mm_subs_epu8(greenMask, white1);
82 0 : white2 = _mm_subs_epu8(greenMask, white2);
83 : // Producing the final black pixel in an XMM register and storing
84 : // that is actually faster than doing a masked store since that
85 : // does an unaligned storage. We have the black pixel in a register
86 : // anyway.
87 0 : black1 = _mm_andnot_si128(alphaMask, black1);
88 0 : black2 = _mm_andnot_si128(alphaMask, black2);
89 0 : white1 = _mm_slli_si128(white1, 2);
90 0 : white2 = _mm_slli_si128(white2, 2);
91 0 : white1 = _mm_and_si128(alphaMask, white1);
92 0 : white2 = _mm_and_si128(alphaMask, white2);
93 0 : black1 = _mm_or_si128(white1, black1);
94 0 : black2 = _mm_or_si128(white2, black2);
95 :
96 : _mm_store_si128((__m128i*)blackData, black1);
97 0 : _mm_store_si128((__m128i*)(blackData + 16), black2);
98 0 : blackData += 32;
99 0 : whiteData += 32;
100 : }
101 0 : for (; j < size.width - 4; j += 4) {
102 0 : __m128i black = _mm_load_si128((__m128i*)blackData);
103 0 : __m128i white = _mm_load_si128((__m128i*)whiteData);
104 :
105 0 : white = _mm_subs_epu8(white, black);
106 0 : white = _mm_subs_epu8(greenMask, white);
107 0 : black = _mm_andnot_si128(alphaMask, black);
108 0 : white = _mm_slli_si128(white, 2);
109 0 : white = _mm_and_si128(alphaMask, white);
110 0 : black = _mm_or_si128(white, black);
111 : _mm_store_si128((__m128i*)blackData, black);
112 0 : blackData += 16;
113 0 : whiteData += 16;
114 : }
115 : // Loop single pixels until we're done.
116 0 : while (j < size.width) {
117 0 : *((uint32_t*)blackData) =
118 0 : RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
119 : *reinterpret_cast<uint32_t*>(whiteData));
120 0 : blackData += 4;
121 0 : whiteData += 4;
122 0 : j++;
123 : }
124 0 : blackData += blackSurf->Stride() - j * 4;
125 0 : whiteData += whiteSurf->Stride() - j * 4;
126 : }
127 :
128 0 : blackSurf->MarkDirty();
129 :
130 0 : return true;
131 : }
132 :
/**
 * Byte offset of the pixel at (aX, aY) from the previous
 * (1 << aAlignToLog2)-byte boundary, for a surface with byte stride
 * aStride.  aX is already in bytes; a result of 0 means "aligned".
 * The defaults make ByteAlignment(log2, x) test a plain byte offset.
 */
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}
138 :
139 : /*static*/ mozilla::gfx::IntRect
140 0 : gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect,
141 : gfxImageSurface* aSurface)
142 : {
143 0 : NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
144 : "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
145 0 : static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
146 : static const int32_t bpp = 4;
147 0 : static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
148 : //
149 : // We're going to create a subimage of the surface with size
150 : // <sw,sh> for alpha recovery, and want a SIMD fast-path. The
151 : // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
152 : // properly aligned for SIMD. So we want to find a rect <x',y',
153 : // w',h'> that's a superset of what needs to be redrawn but is
154 : // properly aligned. Proper alignment is
155 : //
156 : // BPP * (x' + y' * sw) \cong 0 (mod ALIGN)
157 : // BPP * w' \cong BPP * sw (mod ALIGN)
158 : //
159 : // (We assume the pixel at surface <0,0> is already ALIGN'd.)
160 : // That rect (obviously) has to fit within the surface bounds, and
161 : // we should also minimize the extra pixels redrawn only for
162 : // alignment's sake. So we also want
163 : //
164 : // minimize <x',y', w',h'>
165 : // 0 <= x' <= x
166 : // 0 <= y' <= y
167 : // w <= w' <= sw
168 : // h <= h' <= sh
169 : //
170 : // This is a messy integer non-linear programming problem, except
171 : // ... we can assume that ALIGN/BPP is a very small constant. So,
172 : // brute force is viable. The algorithm below will find a
173 : // solution if one exists, but isn't guaranteed to find the
174 : // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
175 : // most 64 iterations below). In what's likely the common case,
176 : // an already-aligned rectangle, it only needs 1 iteration.
177 : //
178 : // Is this alignment worth doing? Recovering alpha will take work
179 : // proportional to w*h (assuming alpha recovery computation isn't
180 : // memory bound). This analysis can lead to O(w+h) extra work
181 : // (with small constants). In exchange, we expect to shave off a
182 : // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
183 : // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
184 : // only really care about the w*h >> w+h case anyway; others
185 : // should be fast enough even with the overhead. (Unless the cost
186 : // of repainting the expanded rect is high, but in that case
187 : // SIMD-ized alpha recovery won't make a difference so this code
188 : // shouldn't be called.)
189 : //
190 0 : mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
191 0 : const int32_t stride = bpp * surfaceSize.width;
192 0 : if (stride != aSurface->Stride()) {
193 0 : NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
194 0 : return aRect;
195 : }
196 :
197 0 : const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
198 0 : const int32_t r = x + w;
199 0 : const int32_t sw = surfaceSize.width;
200 0 : const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
201 :
202 : // The outer two loops below keep the rightmost (|r| above) and
203 : // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
204 : // return only a superset of the original rect. These loops
205 : // search for an aligned top-left pixel by trying to expand <x,y>
206 : // left and up by <dx,dy> pixels, respectively.
207 : //
208 : // Then if a properly-aligned top-left pixel is found, the
209 : // innermost loop tries to find an aligned stride by moving the
210 : // rightmost pixel rightward by dr.
211 : int32_t dx, dy, dr;
212 0 : for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
213 0 : for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
214 0 : if (0 != ByteAlignment(kByteAlignLog2,
215 0 : bpp * (x - dx), y - dy, stride)) {
216 0 : continue;
217 : }
218 0 : for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
219 0 : if (strideAlign == ByteAlignment(kByteAlignLog2,
220 0 : bpp * (w + dr + dx))) {
221 0 : goto FOUND_SOLUTION;
222 : }
223 : }
224 : }
225 : }
226 :
227 : // Didn't find a solution.
228 0 : return aRect;
229 :
230 : FOUND_SOLUTION:
231 0 : mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
232 0 : MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
233 : "'Solution' extends outside surface bounds!");
234 0 : return solution;
235 : }
|