Line data Source code
1 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 : * This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "FilterProcessing.h"
7 :
8 : #include "SIMD.h"
9 : #include "SVGTurbulenceRenderer-inl.h"
10 :
11 : namespace mozilla {
12 : namespace gfx {
13 :
14 : template<typename u8x16_t>
15 : inline already_AddRefed<DataSourceSurface>
16 0 : ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface)
17 : {
18 0 : IntSize size = aSurface->GetSize();
19 0 : RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
20 : RefPtr<DataSourceSurface> output =
21 0 : Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
22 0 : uint8_t *inputData = input->GetData();
23 0 : uint8_t *outputData = output->GetData();
24 0 : int32_t inputStride = input->Stride();
25 0 : int32_t outputStride = output->Stride();
26 0 : switch (input->GetFormat()) {
27 : case SurfaceFormat::B8G8R8A8:
28 0 : output = input;
29 0 : break;
30 : case SurfaceFormat::B8G8R8X8:
31 0 : for (int32_t y = 0; y < size.height; y++) {
32 0 : for (int32_t x = 0; x < size.width; x++) {
33 0 : int32_t inputIndex = y * inputStride + 4 * x;
34 0 : int32_t outputIndex = y * outputStride + 4 * x;
35 0 : outputData[outputIndex + 0] = inputData[inputIndex + 0];
36 0 : outputData[outputIndex + 1] = inputData[inputIndex + 1];
37 0 : outputData[outputIndex + 2] = inputData[inputIndex + 2];
38 0 : outputData[outputIndex + 3] = 255;
39 : }
40 : }
41 0 : break;
42 : case SurfaceFormat::R8G8B8A8:
43 0 : for (int32_t y = 0; y < size.height; y++) {
44 0 : for (int32_t x = 0; x < size.width; x++) {
45 0 : int32_t inputIndex = y * inputStride + 4 * x;
46 0 : int32_t outputIndex = y * outputStride + 4 * x;
47 0 : outputData[outputIndex + 2] = inputData[inputIndex + 0];
48 0 : outputData[outputIndex + 1] = inputData[inputIndex + 1];
49 0 : outputData[outputIndex + 0] = inputData[inputIndex + 2];
50 0 : outputData[outputIndex + 3] = inputData[inputIndex + 3];
51 : }
52 : }
53 0 : break;
54 : case SurfaceFormat::R8G8B8X8:
55 0 : for (int32_t y = 0; y < size.height; y++) {
56 0 : for (int32_t x = 0; x < size.width; x++) {
57 0 : int32_t inputIndex = y * inputStride + 4 * x;
58 0 : int32_t outputIndex = y * outputStride + 4 * x;
59 0 : outputData[outputIndex + 2] = inputData[inputIndex + 0];
60 0 : outputData[outputIndex + 1] = inputData[inputIndex + 1];
61 0 : outputData[outputIndex + 0] = inputData[inputIndex + 2];
62 0 : outputData[outputIndex + 3] = 255;
63 : }
64 : }
65 0 : break;
66 : case SurfaceFormat::A8:
67 0 : for (int32_t y = 0; y < size.height; y++) {
68 0 : for (int32_t x = 0; x < size.width; x += 16) {
69 0 : int32_t inputIndex = y * inputStride + x;
70 0 : int32_t outputIndex = y * outputStride + 4 * x;
71 0 : u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
72 : // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
73 : // interleaving with 0000000000000000 twice.
74 0 : u8x16_t zero = simd::FromZero8<u8x16_t>();
75 0 : u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
76 0 : u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
77 0 : u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
78 0 : u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
79 0 : u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
80 0 : u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
81 0 : simd::Store8(&outputData[outputIndex], p1To4);
82 0 : if ((x + 4) * 4 < outputStride) {
83 0 : simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
84 : }
85 0 : if ((x + 8) * 4 < outputStride) {
86 0 : simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
87 : }
88 0 : if ((x + 12) * 4 < outputStride) {
89 0 : simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
90 : }
91 : }
92 : }
93 0 : break;
94 : default:
95 0 : output = nullptr;
96 0 : break;
97 : }
98 0 : return output.forget();
99 : }
100 :
101 : template<typename u8x16_t>
102 : inline void
103 0 : ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
104 : {
105 0 : for (int32_t y = 0; y < size.height; y++) {
106 0 : for (int32_t x = 0; x < size.width; x += 16) {
107 : // Process 16 pixels at a time.
108 : // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA.
109 0 : int32_t sourceIndex = y * sourceStride + 4 * x;
110 0 : int32_t targetIndex = y * alphaStride + x;
111 :
112 0 : u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
113 0 : u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
114 0 : u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
115 0 : u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
116 :
117 0 : bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
118 0 : if (4 * (x + 4) < sourceStride) {
119 0 : bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
120 : }
121 0 : if (4 * (x + 8) < sourceStride) {
122 0 : bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
123 : }
124 0 : if (4 * (x + 12) < sourceStride) {
125 0 : bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
126 : }
127 :
128 0 : u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
129 0 : u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
130 0 : u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
131 0 : u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
132 0 : u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
133 0 : u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
134 0 : u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
135 0 : u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
136 0 : u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
137 0 : u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
138 0 : u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
139 :
140 0 : simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
141 : }
142 : }
143 0 : }
144 :
145 : // This function calculates the result color values for four pixels, but for
146 : // only two color channels - either b & r or g & a. However, the a result will
147 : // not be used.
148 : // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
149 : // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
150 : // alpha of all four pixels (and both aaaa's are the same).
151 : // blendendComponent1 and blendedComponent2 are the out parameters.
152 : template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
153 : inline void
154 0 : BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
155 : i16x8_t dest, const i16x8_t& destAlpha,
156 : i32x4_t& blendedComponent1, i32x4_t& blendedComponent2)
157 : {
158 0 : i16x8_t x255 = simd::FromI16<i16x8_t>(255);
159 :
160 : switch (aBlendMode) {
161 :
162 : case BLEND_MODE_MULTIPLY:
163 : {
164 : // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest);
165 0 : i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
166 0 : i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
167 0 : i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
168 :
169 0 : i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
170 0 : i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
171 0 : blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
172 0 : blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
173 :
174 0 : i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
175 0 : i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
176 0 : blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
177 0 : blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
178 :
179 0 : break;
180 : }
181 :
182 : case BLEND_MODE_SCREEN:
183 : {
184 : // val = 255 * (source + dest) + (0 - dest) * source;
185 0 : i16x8_t sourcePlusDest = simd::Add16(source, dest);
186 0 : i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
187 :
188 0 : i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest);
189 0 : i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source);
190 0 : blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1);
191 0 : blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
192 :
193 0 : i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest);
194 0 : i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source);
195 0 : blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2);
196 0 : blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
197 :
198 0 : break;
199 : }
200 :
201 : case BLEND_MODE_DARKEN:
202 : case BLEND_MODE_LIGHTEN:
203 : {
204 : // Darken:
205 : // val = min((255 - destAlpha) * source + 255 * dest,
206 : // 255 * source + (255 - sourceAlpha) * dest);
207 : //
208 : // Lighten:
209 : // val = max((255 - destAlpha) * source + 255 * dest,
210 : // 255 * source + (255 - sourceAlpha) * dest);
211 :
212 0 : i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
213 0 : i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
214 :
215 0 : i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
216 0 : i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
217 0 : i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
218 0 : i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1);
219 0 : i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1);
220 0 : blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2);
221 0 : blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
222 :
223 0 : i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
224 0 : i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
225 0 : i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
226 0 : i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2);
227 0 : i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2);
228 0 : blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2);
229 0 : blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
230 :
231 0 : break;
232 : }
233 :
234 : }
235 0 : }
236 :
237 : // The alpha channel is subject to a different calculation than the RGB
238 : // channels, and this calculation is the same for all blend modes:
239 : // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
240 : template<typename i16x8_t, typename i32x4_t>
241 : inline i32x4_t
242 0 : BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234)
243 : {
244 : // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
245 : // appropriately. The calculation is rewritten as follows:
246 : // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
247 : // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
248 : // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
249 : // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
250 0 : i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
251 0 : i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
252 0 : i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
253 0 : i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
254 0 : return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
255 : }
256 :
257 : template<typename u8x16_t, typename i16x8_t>
258 : inline void
259 0 : UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
260 : i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234)
261 : {
262 : // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
263 0 : i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
264 0 : i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
265 0 : i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
266 0 : i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
267 0 : bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
268 0 : rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
269 0 : }
270 :
271 : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
272 : inline u8x16_t
273 0 : ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
274 : i32x4_t rrrr1234, const i32x4_t& aaaa1234)
275 : {
276 : // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
277 0 : i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
278 0 : i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
279 0 : i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
280 0 : i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
281 0 : i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
282 0 : i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
283 0 : return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
284 : }
285 :
286 : template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
287 : inline already_AddRefed<DataSourceSurface>
288 0 : ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
289 : {
290 0 : IntSize size = aInput1->GetSize();
291 : RefPtr<DataSourceSurface> target =
292 0 : Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
293 0 : if (!target) {
294 0 : return nullptr;
295 : }
296 :
297 0 : uint8_t* source1Data = aInput1->GetData();
298 0 : uint8_t* source2Data = aInput2->GetData();
299 0 : uint8_t* targetData = target->GetData();
300 0 : int32_t targetStride = target->Stride();
301 0 : int32_t source1Stride = aInput1->Stride();
302 0 : int32_t source2Stride = aInput2->Stride();
303 :
304 0 : for (int32_t y = 0; y < size.height; y++) {
305 0 : for (int32_t x = 0; x < size.width; x += 4) {
306 0 : int32_t targetIndex = y * targetStride + 4 * x;
307 0 : int32_t source1Index = y * source1Stride + 4 * x;
308 0 : int32_t source2Index = y * source2Stride + 4 * x;
309 :
310 0 : u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
311 0 : u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
312 :
313 : // The blending calculation for the RGB channels all need access to the
314 : // alpha channel of their pixel, and the alpha calculation is different,
315 : // so it makes sense to separate by channel.
316 :
317 : i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
318 : i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
319 0 : UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
320 0 : UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
321 0 : i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234);
322 0 : i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234);
323 :
324 : // We only use blendedB, blendedG and blendedR.
325 : i32x4_t blendedB, blendedG, blendedR, blendedA;
326 0 : BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG);
327 0 : BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA);
328 :
329 : // Throw away blendedA and overwrite it with the correct blended alpha.
330 0 : blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234);
331 :
332 0 : u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA);
333 0 : simd::Store8(&targetData[targetIndex], result1234);
334 : }
335 : }
336 :
337 0 : return target.forget();
338 : }
339 :
340 : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
341 : static already_AddRefed<DataSourceSurface>
342 0 : ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
343 : BlendMode aBlendMode)
344 : {
345 0 : switch (aBlendMode) {
346 : case BLEND_MODE_MULTIPLY:
347 0 : return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2);
348 : case BLEND_MODE_SCREEN:
349 0 : return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2);
350 : case BLEND_MODE_DARKEN:
351 0 : return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2);
352 : case BLEND_MODE_LIGHTEN:
353 0 : return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2);
354 : default:
355 0 : return nullptr;
356 : }
357 : }
358 :
359 : template<MorphologyOperator Operator, typename u8x16_t>
360 : static u8x16_t
361 0 : Morph8(u8x16_t a, u8x16_t b)
362 : {
363 : return Operator == MORPHOLOGY_OPERATOR_ERODE ?
364 0 : simd::Min8(a, b) : simd::Max8(a, b);
365 : }
366 :
367 : // Set every pixel to the per-component minimum or maximum of the pixels around
368 : // it that are up to aRadius pixels away from it (horizontally).
369 : template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
370 0 : inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
371 : uint8_t* aDestData, int32_t aDestStride,
372 : const IntRect& aDestRect, int32_t aRadius)
373 : {
374 : static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
375 : op == MORPHOLOGY_OPERATOR_DILATE,
376 : "unexpected morphology operator");
377 :
378 0 : int32_t kernelSize = aRadius + 1 + aRadius;
379 0 : MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
380 0 : MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
381 0 : int32_t completeKernelSizeForFourPixels = kernelSize + 3;
382 0 : MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
383 : completeKernelSizeForFourPixels % 4 == 2);
384 :
385 : // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
386 : // the way we need them to be.
387 :
388 0 : IntRect sourceRect = aDestRect;
389 0 : sourceRect.Inflate(aRadius, 0);
390 :
391 0 : for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
392 0 : int32_t kernelStartX = aDestRect.x - aRadius;
393 0 : for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) {
394 : // We process four pixels (16 color values) at a time.
395 : // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
396 : // source values can be read beyond that because the source is extended
397 : // by aRadius pixels.
398 :
399 0 : int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
400 0 : u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
401 0 : u8x16_t m1234 = p1234;
402 :
403 0 : for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
404 0 : u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ?
405 0 : simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) :
406 0 : simd::FromZero8<u8x16_t>();
407 0 : u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
408 0 : u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
409 0 : m1234 = Morph8<op,u8x16_t>(m1234, p2345);
410 0 : m1234 = Morph8<op,u8x16_t>(m1234, p3456);
411 0 : if (i + 2 < completeKernelSizeForFourPixels) {
412 0 : u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
413 0 : m1234 = Morph8<op,u8x16_t>(m1234, p4567);
414 0 : m1234 = Morph8<op,u8x16_t>(m1234, p5678);
415 : }
416 0 : p1234 = p5678;
417 : }
418 :
419 0 : int32_t destIndex = y * aDestStride + 4 * x;
420 0 : simd::Store8(&aDestData[destIndex], m1234);
421 : }
422 : }
423 0 : }
424 :
425 : template<typename i16x8_t, typename u8x16_t>
426 0 : inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
427 : uint8_t* aDestData, int32_t aDestStride,
428 : const IntRect& aDestRect, int32_t aRadius,
429 : MorphologyOperator aOp)
430 : {
431 0 : if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
432 0 : ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
433 : aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
434 : } else {
435 0 : ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
436 : aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
437 : }
438 0 : }
439 :
440 : // Set every pixel to the per-component minimum or maximum of the pixels around
441 : // it that are up to aRadius pixels away from it (vertically).
442 : template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
443 0 : static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
444 : uint8_t* aDestData, int32_t aDestStride,
445 : const IntRect& aDestRect, int32_t aRadius)
446 : {
447 : static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
448 : op == MORPHOLOGY_OPERATOR_DILATE,
449 : "unexpected morphology operator");
450 :
451 0 : int32_t startY = aDestRect.y - aRadius;
452 0 : int32_t endY = aDestRect.y + aRadius;
453 0 : for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
454 0 : for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) {
455 0 : int32_t sourceIndex = startY * aSourceStride + 4 * x;
456 0 : u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
457 0 : sourceIndex += aSourceStride;
458 0 : for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) {
459 0 : u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
460 0 : u = Morph8<op,u8x16_t>(u, u2);
461 : }
462 :
463 0 : int32_t destIndex = y * aDestStride + 4 * x;
464 0 : simd::Store8(&aDestData[destIndex], u);
465 : }
466 : }
467 0 : }
468 :
469 : template<typename i16x8_t, typename u8x16_t>
470 0 : inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
471 : uint8_t* aDestData, int32_t aDestStride,
472 : const IntRect& aDestRect, int32_t aRadius,
473 : MorphologyOperator aOp)
474 : {
475 0 : if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
476 0 : ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
477 : aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
478 : } else {
479 0 : ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
480 : aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
481 : }
482 0 : }
483 :
484 : template<typename i32x4_t, typename i16x8_t>
485 : static i32x4_t
486 0 : ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias)
487 : {
488 : // int16_t p[8] == { b, g, r, a, b, g, r, a }.
489 : // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
490 : // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
491 : // int32_t bias[4] == { _B, _G, _R, _A }.
492 :
493 0 : i32x4_t sum = bias;
494 :
495 : // int16_t bg[8] = { b, g, b, g, b, g, b, g };
496 0 : i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p));
497 : // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
498 0 : i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
499 0 : sum = simd::Add32(sum, prodsum_bg);
500 :
501 : // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
502 0 : i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p));
503 : // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
504 0 : i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
505 0 : sum = simd::Add32(sum, prodsum_ra);
506 :
507 : // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
508 0 : return sum;
509 : }
510 :
511 : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
512 : static already_AddRefed<DataSourceSurface>
513 0 : ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
514 : {
515 0 : IntSize size = aInput->GetSize();
516 : RefPtr<DataSourceSurface> target =
517 0 : Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
518 0 : if (!target) {
519 0 : return nullptr;
520 : }
521 :
522 0 : uint8_t* sourceData = aInput->GetData();
523 0 : uint8_t* targetData = target->GetData();
524 0 : int32_t sourceStride = aInput->Stride();
525 0 : int32_t targetStride = target->Stride();
526 :
527 0 : const int16_t factor = 128;
528 0 : const Float floatElementMax = INT16_MAX / factor; // 255
529 : MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale");
530 :
531 0 : const Float *floats = &aMatrix._11;
532 :
533 : ptrdiff_t componentOffsets[4] = {
534 : B8G8R8A8_COMPONENT_BYTEOFFSET_R,
535 : B8G8R8A8_COMPONENT_BYTEOFFSET_G,
536 : B8G8R8A8_COMPONENT_BYTEOFFSET_B,
537 : B8G8R8A8_COMPONENT_BYTEOFFSET_A
538 0 : };
539 :
540 : // We store the color matrix in rows_bgra in the following format:
541 : // { bB, bG, bR, bA, gB, gG, gR, gA }.
542 : // { bB, gB, bG, gG, bR, gR, bA, gA }
543 : // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
544 : // which works especially well for our use case.
545 : int16_t rows_bgra[2][8];
546 0 : for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
547 0 : for (size_t colIndex = 0; colIndex < 4; colIndex++) {
548 0 : const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
549 0 : Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax);
550 0 : int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5);
551 0 : int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
552 0 : int8_t g_or_a = componentOffsets[rowIndex] % 2;
553 0 : int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
554 0 : rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement;
555 : }
556 : }
557 :
558 : int32_t rowBias[4];
559 0 : Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
560 0 : for (size_t colIndex = 0; colIndex < 4; colIndex++) {
561 0 : size_t rowIndex = 4;
562 0 : const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
563 0 : Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax);
564 0 : int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
565 0 : rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
566 : }
567 :
568 0 : i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
569 : rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
570 0 : rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
571 :
572 0 : i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
573 : rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
574 0 : rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
575 :
576 : i32x4_t rowsBias_v =
577 0 : simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
578 :
579 0 : for (int32_t y = 0; y < size.height; y++) {
580 0 : for (int32_t x = 0; x < size.width; x += 4) {
581 0 : MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position");
582 0 : MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position");
583 0 : int32_t sourceIndex = y * sourceStride + 4 * x;
584 0 : int32_t targetIndex = y * targetStride + 4 * x;
585 :
586 : // We load 4 pixels, unpack them, process them 1 pixel at a time, and
587 : // finally pack and store the 4 result pixels.
588 :
589 0 : u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
590 :
591 : // Splat needed to get each pixel twice into i16x8
592 0 : i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
593 0 : i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
594 0 : i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
595 0 : i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
596 :
597 0 : i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
598 0 : i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
599 0 : i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
600 0 : i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
601 :
602 : static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor.");
603 0 : u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1),
604 : simd::ShiftRight32<7>(result_p2),
605 : simd::ShiftRight32<7>(result_p3),
606 0 : simd::ShiftRight32<7>(result_p4));
607 0 : simd::Store8(&targetData[targetIndex], result_p1234);
608 : }
609 : }
610 :
611 0 : return target.forget();
612 : }
613 :
614 : // source / dest: bgra bgra
615 : // sourceAlpha / destAlpha: aaaa aaaa
616 : // result: bgra bgra
617 : template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
618 : static inline u16x8_t
619 0 : CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha)
620 : {
621 0 : u16x8_t x255 = simd::FromU16<u16x8_t>(255);
622 :
623 : switch (aCompositeOperator) {
624 :
625 : case COMPOSITE_OPERATOR_OVER:
626 : {
627 : // val = dest * (255 - sourceAlpha) + source * 255;
628 0 : u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
629 :
630 0 : u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
631 0 : u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
632 0 : i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
633 :
634 0 : u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
635 0 : u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
636 0 : i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
637 :
638 0 : return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
639 0 : simd::FastDivideBy255(result2));
640 : }
641 :
642 : case COMPOSITE_OPERATOR_IN:
643 : {
644 : // val = source * destAlpha;
645 0 : return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
646 : }
647 :
648 : case COMPOSITE_OPERATOR_OUT:
649 : {
650 : // val = source * (255 - destAlpha);
651 0 : u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
652 0 : return simd::FastDivideBy255_16(prod);
653 : }
654 :
655 : case COMPOSITE_OPERATOR_ATOP:
656 : {
657 : // val = dest * (255 - sourceAlpha) + source * destAlpha;
658 0 : u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
659 :
660 0 : u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
661 0 : u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
662 0 : i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
663 :
664 0 : u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
665 0 : u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
666 0 : i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
667 :
668 0 : return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
669 0 : simd::FastDivideBy255(result2));
670 : }
671 :
672 : case COMPOSITE_OPERATOR_XOR:
673 : {
674 : // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
675 0 : u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
676 0 : u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
677 :
678 0 : u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
679 : u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
680 0 : twoFiftyFiveMinusDestAlpha);
681 0 : i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
682 :
683 0 : u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
684 : u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
685 0 : twoFiftyFiveMinusDestAlpha);
686 0 : i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
687 :
688 0 : return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
689 0 : simd::FastDivideBy255(result2));
690 : }
691 :
692 : default:
693 : return simd::FromU16<u16x8_t>(0);
694 :
695 : }
696 : }
697 :
698 : template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
699 : static void
700 0 : ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest)
701 : {
702 0 : IntSize size = aDest->GetSize();
703 :
704 0 : uint8_t* sourceData = aSource->GetData();
705 0 : uint8_t* destData = aDest->GetData();
706 0 : uint32_t sourceStride = aSource->Stride();
707 0 : uint32_t destStride = aDest->Stride();
708 :
709 0 : for (int32_t y = 0; y < size.height; y++) {
710 0 : for (int32_t x = 0; x < size.width; x += 4) {
711 0 : uint32_t sourceIndex = y * sourceStride + 4 * x;
712 0 : uint32_t destIndex = y * destStride + 4 * x;
713 :
714 0 : u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
715 0 : u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
716 :
717 0 : u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
718 0 : u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
719 0 : u16x8_t sa12 = simd::Splat16<3,3>(s12);
720 0 : u16x8_t da12 = simd::Splat16<3,3>(d12);
721 0 : u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12);
722 :
723 0 : u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
724 0 : u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
725 0 : u16x8_t sa34 = simd::Splat16<3,3>(s34);
726 0 : u16x8_t da34 = simd::Splat16<3,3>(d34);
727 0 : u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34);
728 :
729 0 : u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
730 0 : simd::Store8(&destData[destIndex], result1234);
731 : }
732 : }
733 0 : }
734 :
735 : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
736 : static void
737 0 : ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest,
738 : CompositeOperator aOperator)
739 : {
740 0 : switch (aOperator) {
741 : case COMPOSITE_OPERATOR_OVER:
742 0 : ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest);
743 0 : break;
744 : case COMPOSITE_OPERATOR_IN:
745 0 : ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest);
746 0 : break;
747 : case COMPOSITE_OPERATOR_OUT:
748 0 : ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest);
749 0 : break;
750 : case COMPOSITE_OPERATOR_ATOP:
751 0 : ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest);
752 0 : break;
753 : case COMPOSITE_OPERATOR_XOR:
754 0 : ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest);
755 0 : break;
756 : default:
757 0 : MOZ_CRASH("GFX: Incomplete switch");
758 : }
759 0 : }
760 :
761 : template<typename u8x16_t>
762 : static void
763 0 : SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride,
764 : uint8_t* channel0Data, uint8_t* channel1Data,
765 : uint8_t* channel2Data, uint8_t* channel3Data,
766 : int32_t channelStride)
767 : {
768 0 : for (int32_t y = 0; y < size.height; y++) {
769 0 : for (int32_t x = 0; x < size.width; x += 16) {
770 : // Process 16 pixels at a time.
771 0 : int32_t sourceIndex = y * sourceStride + 4 * x;
772 0 : int32_t targetIndex = y * channelStride + x;
773 :
774 0 : u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
775 0 : u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
776 0 : u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
777 0 : u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
778 :
779 0 : bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
780 0 : if (4 * (x + 4) < sourceStride) {
781 0 : bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
782 : }
783 0 : if (4 * (x + 8) < sourceStride) {
784 0 : bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
785 : }
786 0 : if (4 * (x + 12) < sourceStride) {
787 0 : bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
788 : }
789 :
790 0 : u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
791 0 : u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
792 0 : u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
793 0 : u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
794 0 : u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
795 0 : u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
796 0 : u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
797 0 : u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
798 0 : u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
799 0 : u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
800 0 : u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
801 0 : u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
802 0 : u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
803 0 : u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
804 0 : u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
805 0 : u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
806 :
807 0 : simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
808 0 : simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
809 0 : simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
810 0 : simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
811 : }
812 : }
813 0 : }
814 :
815 : template<typename u8x16_t>
816 : static void
817 0 : CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
818 : {
819 0 : for (int32_t y = 0; y < size.height; y++) {
820 0 : for (int32_t x = 0; x < size.width; x += 16) {
821 : // Process 16 pixels at a time.
822 0 : int32_t resultIndex = y * resultStride + 4 * x;
823 0 : int32_t channelIndex = y * channelStride + x;
824 :
825 0 : u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
826 0 : u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
827 0 : u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
828 0 : u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
829 :
830 0 : u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
831 0 : u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
832 0 : u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
833 0 : u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
834 :
835 0 : u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
836 0 : u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
837 0 : u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
838 0 : u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
839 :
840 0 : simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
841 0 : if (4 * (x + 4) < resultStride) {
842 0 : simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
843 : }
844 0 : if (4 * (x + 8) < resultStride) {
845 0 : simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
846 : }
847 0 : if (4 * (x + 12) < resultStride) {
848 0 : simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
849 : }
850 : }
851 : }
852 0 : }
853 :
854 :
855 : template<typename i32x4_t, typename u16x8_t, typename u8x16_t>
856 : static void
857 0 : DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
858 : uint8_t* aTargetData, int32_t aTargetStride,
859 : uint8_t* aSourceData, int32_t aSourceStride)
860 : {
861 0 : const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff);
862 0 : for (int32_t y = 0; y < aSize.height; y++) {
863 0 : for (int32_t x = 0; x < aSize.width; x += 4) {
864 0 : int32_t inputIndex = y * aSourceStride + 4 * x;
865 0 : int32_t targetIndex = y * aTargetStride + 4 * x;
866 :
867 0 : u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
868 0 : u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
869 0 : u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
870 :
871 : // Multiply all components with alpha.
872 0 : p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12));
873 0 : p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34));
874 :
875 : // Divide by 255 and pack.
876 0 : u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12),
877 0 : simd::FastDivideBy255_16(p34));
878 :
879 : // Get the original alpha channel value back from p1234.
880 0 : result = simd::Pick(alphaMask, result, p1234);
881 :
882 0 : simd::Store8(&aTargetData[targetIndex], result);
883 : }
884 : }
885 0 : }
886 :
// Lookup table of precomputed factors used for unpremultiplying.
//
// The goal is to evaluate round(r / (alpha / 255.0f)) for any r and alpha in
// constant time. With the factors below,
//   (r * sAlphaFactors[alpha] + 128) >> 8
// roughly gives that result (maximum deviation: 1).
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
//
// Entry 0 is defined as 0. The values were generated with the python code
// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
static const uint16_t sAlphaFactors[256] = {
  /*   0 */ 0, 65280, 32640, 21760, 16320, 13056, 10880, 9326,
  /*   8 */ 8160, 7253, 6528, 5935, 5440, 5022, 4663, 4352,
  /*  16 */ 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838,
  /*  24 */ 2720, 2611, 2511, 2418, 2331, 2251, 2176, 2106,
  /*  32 */ 2040, 1978, 1920, 1865, 1813, 1764, 1718, 1674,
  /*  40 */ 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389,
  /*  48 */ 1360, 1332, 1306, 1280, 1255, 1232, 1209, 1187,
  /*  56 */ 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036,
  /*  64 */ 1020, 1004, 989, 974, 960, 946, 933, 919,
  /*  72 */ 907, 894, 882, 870, 859, 848, 837, 826,
  /*  80 */ 816, 806, 796, 787, 777, 768, 759, 750,
  /*  88 */ 742, 733, 725, 717, 710, 702, 694, 687,
  /*  96 */ 680, 673, 666, 659, 653, 646, 640, 634,
  /* 104 */ 628, 622, 616, 610, 604, 599, 593, 588,
  /* 112 */ 583, 578, 573, 568, 563, 558, 553, 549,
  /* 120 */ 544, 540, 535, 531, 526, 522, 518, 514,
  /* 128 */ 510, 506, 502, 498, 495, 491, 487, 484,
  /* 136 */ 480, 476, 473, 470, 466, 463, 460, 457,
  /* 144 */ 453, 450, 447, 444, 441, 438, 435, 432,
  /* 152 */ 429, 427, 424, 421, 418, 416, 413, 411,
  /* 160 */ 408, 405, 403, 400, 398, 396, 393, 391,
  /* 168 */ 389, 386, 384, 382, 380, 377, 375, 373,
  /* 176 */ 371, 369, 367, 365, 363, 361, 359, 357,
  /* 184 */ 355, 353, 351, 349, 347, 345, 344, 342,
  /* 192 */ 340, 338, 336, 335, 333, 331, 330, 328,
  /* 200 */ 326, 325, 323, 322, 320, 318, 317, 315,
  /* 208 */ 314, 312, 311, 309, 308, 306, 305, 304,
  /* 216 */ 302, 301, 299, 298, 297, 295, 294, 293,
  /* 224 */ 291, 290, 289, 288, 286, 285, 284, 283,
  /* 232 */ 281, 280, 279, 278, 277, 275, 274, 273,
  /* 240 */ 272, 271, 270, 269, 268, 266, 265, 264,
  /* 248 */ 263, 262, 261, 260, 259, 258, 257, 256
};
917 :
918 : template<typename u16x8_t, typename u8x16_t>
919 : static void
920 0 : DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
921 : uint8_t* aTargetData, int32_t aTargetStride,
922 : uint8_t* aSourceData, int32_t aSourceStride)
923 : {
924 0 : for (int32_t y = 0; y < aSize.height; y++) {
925 0 : for (int32_t x = 0; x < aSize.width; x += 4) {
926 0 : int32_t inputIndex = y * aSourceStride + 4 * x;
927 0 : int32_t targetIndex = y * aTargetStride + 4 * x;
928 : union {
929 : u8x16_t p1234;
930 : uint8_t u8[4][4];
931 0 : };
932 0 : p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
933 :
934 : // Prepare the alpha factors.
935 0 : uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
936 0 : uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
937 0 : uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
938 0 : uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
939 0 : u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
940 0 : u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
941 :
942 0 : u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
943 0 : u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
944 :
945 : // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits.
946 0 : p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
947 0 : p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
948 :
949 0 : u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
950 0 : simd::Store8(&aTargetData[targetIndex], result);
951 : }
952 : }
953 0 : }
954 :
955 : template<typename f32x4_t, typename i32x4_t, typename u8x16_t>
956 : static already_AddRefed<DataSourceSurface>
957 0 : RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
958 : int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
959 : {
960 : #define RETURN_TURBULENCE(Type, Stitch) \
961 : SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \
962 : renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
963 : return renderer.Render(aSize, aOffset);
964 :
965 0 : switch (aType) {
966 : case TURBULENCE_TYPE_TURBULENCE:
967 : {
968 0 : if (aStitch) {
969 0 : RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
970 : }
971 0 : RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
972 : }
973 : case TURBULENCE_TYPE_FRACTAL_NOISE:
974 : {
975 0 : if (aStitch) {
976 0 : RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
977 : }
978 0 : RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
979 : }
980 : }
981 0 : return nullptr;
982 : #undef RETURN_TURBULENCE
983 : }
984 :
985 : // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
986 : template<typename i32x4_t, typename i16x8_t>
987 : static MOZ_ALWAYS_INLINE i16x8_t
988 0 : ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2,
989 : const i16x8_t &k1And4, const i16x8_t &k2And3)
990 : {
991 : // Calculate input product: inProd = (in1 * in2) / 255.
992 : i32x4_t inProd_1, inProd_2;
993 0 : simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
994 0 : i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2));
995 :
996 : // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
997 0 : i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
998 0 : i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight);
999 0 : i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight);
1000 0 : i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1001 0 : i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1002 :
1003 : // Calculate k2 * in1 + k3 * in2
1004 0 : i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1005 0 : i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1006 0 : i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1007 0 : i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1008 :
1009 : // Sum everything up and truncate the fractional part.
1010 0 : i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1011 0 : i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1012 0 : return simd::PackAndSaturate32To16(result_1, result_2);
1013 : }
1014 :
1015 : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
1016 : static already_AddRefed<DataSourceSurface>
1017 0 : ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
1018 : Float aK1, Float aK2, Float aK3, Float aK4)
1019 : {
1020 0 : IntSize size = aInput1->GetSize();
1021 : RefPtr<DataSourceSurface> target =
1022 0 : Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1023 0 : if (!target) {
1024 0 : return nullptr;
1025 : }
1026 :
1027 0 : uint8_t* source1Data = aInput1->GetData();
1028 0 : uint8_t* source2Data = aInput2->GetData();
1029 0 : uint8_t* targetData = target->GetData();
1030 0 : uint32_t source1Stride = aInput1->Stride();
1031 0 : uint32_t source2Stride = aInput2->Stride();
1032 0 : uint32_t targetStride = target->Stride();
1033 :
1034 : // The arithmetic combine filter does the following calculation:
1035 : // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1036 : //
1037 : // Or, with in1/2 integers between 0 and 255:
1038 : // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1039 : //
1040 : // We want the whole calculation to happen in integer, with 16-bit factors.
1041 : // So we convert our factors to fixed-point with precision 1.8.7.
1042 : // K4 is premultiplied with 255, and it will be multiplied with 128 later
1043 : // during the actual calculation, because premultiplying it with 255 * 128
1044 : // would overflow int16.
1045 :
1046 0 : i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1047 0 : i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1048 0 : i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1049 0 : i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1050 :
1051 0 : i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1052 0 : i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1053 :
1054 0 : for (int32_t y = 0; y < size.height; y++) {
1055 0 : for (int32_t x = 0; x < size.width; x += 4) {
1056 0 : uint32_t source1Index = y * source1Stride + 4 * x;
1057 0 : uint32_t source2Index = y * source2Stride + 4 * x;
1058 0 : uint32_t targetIndex = y * targetStride + 4 * x;
1059 :
1060 : // Load and unpack.
1061 0 : u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1062 0 : u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1063 0 : i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1064 0 : i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1065 0 : i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1066 0 : i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1067 :
1068 : // Multiply and add.
1069 0 : i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3);
1070 0 : i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3);
1071 :
1072 : // Pack and store.
1073 0 : simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34));
1074 : }
1075 : }
1076 :
1077 0 : return target.forget();
1078 : }
1079 :
1080 : } // namespace gfx
1081 : } // namespace mozilla
|