LCOV - output.info - gfx/2d/FilterProcessingSIMD-inl.h

LCOV - code coverage report

Current view:	top level - gfx/2d - FilterProcessingSIMD-inl.h (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	534	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	59	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       5             : 
       6             : #include "FilterProcessing.h"
       7             : 
       8             : #include "SIMD.h"
       9             : #include "SVGTurbulenceRenderer-inl.h"
      10             : 
      11             : namespace mozilla {
      12             : namespace gfx {
      13             : 
      14             : template<typename u8x16_t>
      15             : inline already_AddRefed<DataSourceSurface>
      16           0 : ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface)
      17             : {
      18           0 :   IntSize size = aSurface->GetSize();
      19           0 :   RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
      20             :   RefPtr<DataSourceSurface> output =
      21           0 :     Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
      22           0 :   uint8_t *inputData = input->GetData();
      23           0 :   uint8_t *outputData = output->GetData();
      24           0 :   int32_t inputStride = input->Stride();
      25           0 :   int32_t outputStride = output->Stride();
      26           0 :   switch (input->GetFormat()) {
      27             :     case SurfaceFormat::B8G8R8A8:
      28           0 :       output = input;
      29           0 :       break;
      30             :     case SurfaceFormat::B8G8R8X8:
      31           0 :       for (int32_t y = 0; y < size.height; y++) {
      32           0 :         for (int32_t x = 0; x < size.width; x++) {
      33           0 :           int32_t inputIndex = y * inputStride + 4 * x;
      34           0 :           int32_t outputIndex = y * outputStride + 4 * x;
      35           0 :           outputData[outputIndex + 0] = inputData[inputIndex + 0];
      36           0 :           outputData[outputIndex + 1] = inputData[inputIndex + 1];
      37           0 :           outputData[outputIndex + 2] = inputData[inputIndex + 2];
      38           0 :           outputData[outputIndex + 3] = 255;
      39             :         }
      40             :       }
      41           0 :       break;
      42             :     case SurfaceFormat::R8G8B8A8:
      43           0 :       for (int32_t y = 0; y < size.height; y++) {
      44           0 :         for (int32_t x = 0; x < size.width; x++) {
      45           0 :           int32_t inputIndex = y * inputStride + 4 * x;
      46           0 :           int32_t outputIndex = y * outputStride + 4 * x;
      47           0 :           outputData[outputIndex + 2] = inputData[inputIndex + 0];
      48           0 :           outputData[outputIndex + 1] = inputData[inputIndex + 1];
      49           0 :           outputData[outputIndex + 0] = inputData[inputIndex + 2];
      50           0 :           outputData[outputIndex + 3] = inputData[inputIndex + 3];
      51             :         }
      52             :       }
      53           0 :       break;
      54             :     case SurfaceFormat::R8G8B8X8:
      55           0 :       for (int32_t y = 0; y < size.height; y++) {
      56           0 :         for (int32_t x = 0; x < size.width; x++) {
      57           0 :           int32_t inputIndex = y * inputStride + 4 * x;
      58           0 :           int32_t outputIndex = y * outputStride + 4 * x;
      59           0 :           outputData[outputIndex + 2] = inputData[inputIndex + 0];
      60           0 :           outputData[outputIndex + 1] = inputData[inputIndex + 1];
      61           0 :           outputData[outputIndex + 0] = inputData[inputIndex + 2];
      62           0 :           outputData[outputIndex + 3] = 255;
      63             :         }
      64             :       }
      65           0 :       break;
      66             :     case SurfaceFormat::A8:
      67           0 :       for (int32_t y = 0; y < size.height; y++) {
      68           0 :         for (int32_t x = 0; x < size.width; x += 16) {
      69           0 :           int32_t inputIndex = y * inputStride + x;
      70           0 :           int32_t outputIndex = y * outputStride + 4 * x;
      71           0 :           u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
      72             :           // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
      73             :           // interleaving with 0000000000000000 twice.
      74           0 :           u8x16_t zero = simd::FromZero8<u8x16_t>();
      75           0 :           u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
      76           0 :           u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
      77           0 :           u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
      78           0 :           u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
      79           0 :           u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
      80           0 :           u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
      81           0 :           simd::Store8(&outputData[outputIndex], p1To4);
      82           0 :           if ((x + 4) * 4 < outputStride) {
      83           0 :             simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
      84             :           }
      85           0 :           if ((x + 8) * 4 < outputStride) {
      86           0 :             simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
      87             :           }
      88           0 :           if ((x + 12) * 4 < outputStride) {
      89           0 :             simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
      90             :           }
      91             :         }
      92             :       }
      93           0 :       break;
      94             :     default:
      95           0 :       output = nullptr;
      96           0 :       break;
      97             :   }
      98           0 :   return output.forget();
      99             : }
     100             : 
     101             : template<typename u8x16_t>
     102             : inline void
     103           0 : ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
     104             : {
     105           0 :   for (int32_t y = 0; y < size.height; y++) {
     106           0 :     for (int32_t x = 0; x < size.width; x += 16) {
     107             :       // Process 16 pixels at a time.
     108             :       // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA.
     109           0 :       int32_t sourceIndex = y * sourceStride + 4 * x;
     110           0 :       int32_t targetIndex = y * alphaStride + x;
     111             : 
     112           0 :       u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
     113           0 :       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
     114           0 :       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
     115           0 :       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
     116             : 
     117           0 :       bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
     118           0 :       if (4 * (x + 4) < sourceStride) {
     119           0 :         bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
     120             :       }
     121           0 :       if (4 * (x + 8) < sourceStride) {
     122           0 :         bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
     123             :       }
     124           0 :       if (4 * (x + 12) < sourceStride) {
     125           0 :         bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
     126             :       }
     127             : 
     128           0 :       u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
     129           0 :       u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
     130           0 :       u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
     131           0 :       u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
     132           0 :       u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
     133           0 :       u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
     134           0 :       u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
     135           0 :       u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
     136           0 :       u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
     137           0 :       u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
     138           0 :       u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
     139             : 
     140           0 :       simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
     141             :     }
     142             :   }
     143           0 : }
     144             : 
     145             : // This function calculates the result color values for four pixels, but for
     146             : // only two color channels - either b & r or g & a. However, the a result will
     147             : // not be used.
     148             : // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
     149             : // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
     150             : // alpha of all four pixels (and both aaaa's are the same).
     151             : // blendendComponent1 and blendedComponent2 are the out parameters.
     152             : template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
     153             : inline void
     154           0 : BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
     155             :                                i16x8_t dest, const i16x8_t& destAlpha,
     156             :                                i32x4_t& blendedComponent1, i32x4_t& blendedComponent2)
     157             : {
     158           0 :   i16x8_t x255 = simd::FromI16<i16x8_t>(255);
     159             : 
     160             :   switch (aBlendMode) {
     161             : 
     162             :     case BLEND_MODE_MULTIPLY:
     163             :     {
     164             :       // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest);
     165           0 :       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
     166           0 :       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
     167           0 :       i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
     168             : 
     169           0 :       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
     170           0 :       i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
     171           0 :       blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
     172           0 :       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
     173             : 
     174           0 :       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
     175           0 :       i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
     176           0 :       blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
     177           0 :       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
     178             : 
     179           0 :       break;
     180             :     }
     181             : 
     182             :     case BLEND_MODE_SCREEN:
     183             :     {
     184             :       // val = 255 * (source + dest) + (0 - dest) * source;
     185           0 :       i16x8_t sourcePlusDest = simd::Add16(source, dest);
     186           0 :       i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
     187             : 
     188           0 :       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest);
     189           0 :       i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source);
     190           0 :       blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1);
     191           0 :       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
     192             : 
     193           0 :       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest);
     194           0 :       i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source);
     195           0 :       blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2);
     196           0 :       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
     197             : 
     198           0 :       break;
     199             :     }
     200             : 
     201             :     case BLEND_MODE_DARKEN:
     202             :     case BLEND_MODE_LIGHTEN:
     203             :     {
     204             :       // Darken:
     205             :       // val = min((255 - destAlpha) * source + 255                 * dest,
     206             :       //           255               * source + (255 - sourceAlpha) * dest);
     207             :       //
     208             :       // Lighten:
     209             :       // val = max((255 - destAlpha) * source + 255                 * dest,
     210             :       //           255               * source + (255 - sourceAlpha) * dest);
     211             : 
     212           0 :       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
     213           0 :       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
     214             : 
     215           0 :       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
     216           0 :       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
     217           0 :       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
     218           0 :       i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1);
     219           0 :       i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1);
     220           0 :       blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2);
     221           0 :       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
     222             : 
     223           0 :       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
     224           0 :       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
     225           0 :       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
     226           0 :       i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2);
     227           0 :       i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2);
     228           0 :       blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2);
     229           0 :       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
     230             : 
     231           0 :       break;
     232             :     }
     233             : 
     234             :   }
     235           0 : }
     236             : 
     237             : // The alpha channel is subject to a different calculation than the RGB
     238             : // channels, and this calculation is the same for all blend modes:
     239             : // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
     240             : template<typename i16x8_t, typename i32x4_t>
     241             : inline i32x4_t
     242           0 : BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234)
     243             : {
     244             :   // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
     245             :   // appropriately. The calculation is rewritten as follows:
     246             :   // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
     247             :   //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
     248             :   //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
     249             :   //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
     250           0 :   i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
     251           0 :   i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
     252           0 :   i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
     253           0 :   i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
     254           0 :   return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
     255             : }
     256             : 
     257             : template<typename u8x16_t, typename i16x8_t>
     258             : inline void
     259           0 : UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
     260             :                            i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234)
     261             : {
     262             :   // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
     263           0 :   i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
     264           0 :   i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
     265           0 :   i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
     266           0 :   i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
     267           0 :   bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
     268           0 :   rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
     269           0 : }
     270             : 
     271             : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
     272             : inline u8x16_t
     273           0 : ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
     274             :                          i32x4_t rrrr1234, const i32x4_t& aaaa1234)
     275             : {
     276             :   // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
     277           0 :   i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
     278           0 :   i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
     279           0 :   i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
     280           0 :   i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
     281           0 :   i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
     282           0 :   i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
     283           0 :   return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
     284             : }
     285             : 
     286             : template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
     287             : inline already_AddRefed<DataSourceSurface>
     288           0 : ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
     289             : {
     290           0 :   IntSize size = aInput1->GetSize();
     291             :   RefPtr<DataSourceSurface> target =
     292           0 :     Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
     293           0 :   if (!target) {
     294           0 :     return nullptr;
     295             :   }
     296             : 
     297           0 :   uint8_t* source1Data = aInput1->GetData();
     298           0 :   uint8_t* source2Data = aInput2->GetData();
     299           0 :   uint8_t* targetData = target->GetData();
     300           0 :   int32_t targetStride = target->Stride();
     301           0 :   int32_t source1Stride = aInput1->Stride();
     302           0 :   int32_t source2Stride = aInput2->Stride();
     303             : 
     304           0 :   for (int32_t y = 0; y < size.height; y++) {
     305           0 :     for (int32_t x = 0; x < size.width; x += 4) {
     306           0 :       int32_t targetIndex = y * targetStride + 4 * x;
     307           0 :       int32_t source1Index = y * source1Stride + 4 * x;
     308           0 :       int32_t source2Index = y * source2Stride + 4 * x;
     309             : 
     310           0 :       u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
     311           0 :       u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
     312             : 
     313             :       // The blending calculation for the RGB channels all need access to the
     314             :       // alpha channel of their pixel, and the alpha calculation is different,
     315             :       // so it makes sense to separate by channel.
     316             : 
     317             :       i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
     318             :       i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
     319           0 :       UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
     320           0 :       UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
     321           0 :       i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234);
     322           0 :       i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234);
     323             : 
     324             :       // We only use blendedB, blendedG and blendedR.
     325             :       i32x4_t blendedB, blendedG, blendedR, blendedA;
     326           0 :       BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG);
     327           0 :       BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA);
     328             : 
     329             :       // Throw away blendedA and overwrite it with the correct blended alpha.
     330           0 :       blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234);
     331             : 
     332           0 :       u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA);
     333           0 :       simd::Store8(&targetData[targetIndex], result1234);
     334             :     }
     335             :   }
     336             : 
     337           0 :   return target.forget();
     338             : }
     339             : 
     340             : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
     341             : static already_AddRefed<DataSourceSurface>
     342           0 : ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
     343             :                       BlendMode aBlendMode)
     344             : {
     345           0 :   switch (aBlendMode) {
     346             :     case BLEND_MODE_MULTIPLY:
     347           0 :       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2);
     348             :     case BLEND_MODE_SCREEN:
     349           0 :       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2);
     350             :     case BLEND_MODE_DARKEN:
     351           0 :       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2);
     352             :     case BLEND_MODE_LIGHTEN:
     353           0 :       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2);
     354             :     default:
     355           0 :       return nullptr;
     356             :   }
     357             : }
     358             : 
     359             : template<MorphologyOperator Operator, typename u8x16_t>
     360             : static u8x16_t
     361           0 : Morph8(u8x16_t a, u8x16_t b)
     362             : {
     363             :   return Operator == MORPHOLOGY_OPERATOR_ERODE ?
     364           0 :     simd::Min8(a, b) : simd::Max8(a, b);
     365             : }
     366             : 
     367             : // Set every pixel to the per-component minimum or maximum of the pixels around
     368             : // it that are up to aRadius pixels away from it (horizontally).
     369             : template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
     370           0 : inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
     371             :                                            uint8_t* aDestData, int32_t aDestStride,
     372             :                                            const IntRect& aDestRect, int32_t aRadius)
     373             : {
     374             :   static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
     375             :                 op == MORPHOLOGY_OPERATOR_DILATE,
     376             :                 "unexpected morphology operator");
     377             : 
     378           0 :   int32_t kernelSize = aRadius + 1 + aRadius;
     379           0 :   MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
     380           0 :   MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
     381           0 :   int32_t completeKernelSizeForFourPixels = kernelSize + 3;
     382           0 :   MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
     383             :              completeKernelSizeForFourPixels % 4 == 2);
     384             : 
     385             :   // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
     386             :   // the way we need them to be.
     387             : 
     388           0 :   IntRect sourceRect = aDestRect;
     389           0 :   sourceRect.Inflate(aRadius, 0);
     390             : 
     391           0 :   for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
     392           0 :     int32_t kernelStartX = aDestRect.x - aRadius;
     393           0 :     for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) {
     394             :       // We process four pixels (16 color values) at a time.
     395             :       // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
     396             :       // source values can be read beyond that because the source is extended
     397             :       // by aRadius pixels.
     398             : 
     399           0 :       int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
     400           0 :       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
     401           0 :       u8x16_t m1234 = p1234;
     402             : 
     403           0 :       for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
     404           0 :         u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ?
     405           0 :           simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) :
     406           0 :           simd::FromZero8<u8x16_t>();
     407           0 :         u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
     408           0 :         u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
     409           0 :         m1234 = Morph8<op,u8x16_t>(m1234, p2345);
     410           0 :         m1234 = Morph8<op,u8x16_t>(m1234, p3456);
     411           0 :         if (i + 2 < completeKernelSizeForFourPixels) {
     412           0 :           u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
     413           0 :           m1234 = Morph8<op,u8x16_t>(m1234, p4567);
     414           0 :           m1234 = Morph8<op,u8x16_t>(m1234, p5678);
     415             :         }
     416           0 :         p1234 = p5678;
     417             :       }
     418             : 
     419           0 :       int32_t destIndex = y * aDestStride + 4 * x;
     420           0 :       simd::Store8(&aDestData[destIndex], m1234);
     421             :     }
     422             :   }
     423           0 : }
     424             : 
     425             : template<typename i16x8_t, typename u8x16_t>
     426           0 : inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
     427             :                                            uint8_t* aDestData, int32_t aDestStride,
     428             :                                            const IntRect& aDestRect, int32_t aRadius,
     429             :                                            MorphologyOperator aOp)
     430             : {
     431           0 :   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
     432           0 :     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
     433             :       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
     434             :   } else {
     435           0 :     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
     436             :       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
     437             :   }
     438           0 : }
     439             : 
     440             : // Set every pixel to the per-component minimum or maximum of the pixels around
     441             : // it that are up to aRadius pixels away from it (vertically).
     442             : template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
     443           0 : static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
     444             :                                          uint8_t* aDestData, int32_t aDestStride,
     445             :                                          const IntRect& aDestRect, int32_t aRadius)
     446             : {
     447             :   static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
     448             :                 op == MORPHOLOGY_OPERATOR_DILATE,
     449             :                 "unexpected morphology operator");
     450             : 
     451           0 :   int32_t startY = aDestRect.y - aRadius;
     452           0 :   int32_t endY = aDestRect.y + aRadius;
     453           0 :   for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
     454           0 :     for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) {
     455           0 :       int32_t sourceIndex = startY * aSourceStride + 4 * x;
     456           0 :       u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
     457           0 :       sourceIndex += aSourceStride;
     458           0 :       for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) {
     459           0 :         u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
     460           0 :         u = Morph8<op,u8x16_t>(u, u2);
     461             :       }
     462             : 
     463           0 :       int32_t destIndex = y * aDestStride + 4 * x;
     464           0 :       simd::Store8(&aDestData[destIndex], u);
     465             :     }
     466             :   }
     467           0 : }
     468             : 
     469             : template<typename i16x8_t, typename u8x16_t>
     470           0 : inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
     471             :                                            uint8_t* aDestData, int32_t aDestStride,
     472             :                                            const IntRect& aDestRect, int32_t aRadius,
     473             :                                            MorphologyOperator aOp)
     474             : {
     475           0 :   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
     476           0 :     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
     477             :       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
     478             :   } else {
     479           0 :     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
     480             :       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
     481             :   }
     482           0 : }
     483             : 
     484             : template<typename i32x4_t, typename i16x8_t>
     485             : static i32x4_t
     486           0 : ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias)
     487             : {
     488             :   // int16_t p[8] == { b, g, r, a, b, g, r, a }.
     489             :   // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
     490             :   // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
     491             :   // int32_t bias[4] == { _B, _G, _R, _A }.
     492             : 
     493           0 :   i32x4_t sum = bias;
     494             : 
     495             :   // int16_t bg[8] = { b, g, b, g, b, g, b, g };
     496           0 :   i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p));
     497             :   // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
     498           0 :   i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
     499           0 :   sum = simd::Add32(sum, prodsum_bg);
     500             : 
     501             :   // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
     502           0 :   i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p));
     503             :   // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
     504           0 :   i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
     505           0 :   sum = simd::Add32(sum, prodsum_ra);
     506             : 
     507             :   // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
     508           0 :   return sum;
     509             : }
     510             : 
     511             : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
     512             : static already_AddRefed<DataSourceSurface>
     513           0 : ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
     514             : {
     515           0 :   IntSize size = aInput->GetSize();
     516             :   RefPtr<DataSourceSurface> target =
     517           0 :     Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
     518           0 :   if (!target) {
     519           0 :     return nullptr;
     520             :   }
     521             : 
     522           0 :   uint8_t* sourceData = aInput->GetData();
     523           0 :   uint8_t* targetData = target->GetData();
     524           0 :   int32_t sourceStride = aInput->Stride();
     525           0 :   int32_t targetStride = target->Stride();
     526             : 
     527           0 :   const int16_t factor = 128;
     528           0 :   const Float floatElementMax = INT16_MAX / factor; // 255
     529             :   MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale");
     530             : 
     531           0 :   const Float *floats = &aMatrix._11;
     532             : 
     533             :   ptrdiff_t componentOffsets[4] = {
     534             :     B8G8R8A8_COMPONENT_BYTEOFFSET_R,
     535             :     B8G8R8A8_COMPONENT_BYTEOFFSET_G,
     536             :     B8G8R8A8_COMPONENT_BYTEOFFSET_B,
     537             :     B8G8R8A8_COMPONENT_BYTEOFFSET_A
     538           0 :   };
     539             : 
     540             :   // We store the color matrix in rows_bgra in the following format:
     541             :   // { bB, bG, bR, bA, gB, gG, gR, gA }.
     542             :   // { bB, gB, bG, gG, bR, gR, bA, gA }
     543             :   // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
     544             :   // which works especially well for our use case.
     545             :   int16_t rows_bgra[2][8];
     546           0 :   for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
     547           0 :     for (size_t colIndex = 0; colIndex < 4; colIndex++) {
     548           0 :       const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
     549           0 :       Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax);
     550           0 :       int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5);
     551           0 :       int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
     552           0 :       int8_t g_or_a = componentOffsets[rowIndex] % 2;
     553           0 :       int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
     554           0 :       rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement;
     555             :     }
     556             :   }
     557             : 
     558             :   int32_t rowBias[4];
     559           0 :   Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
     560           0 :   for (size_t colIndex = 0; colIndex < 4; colIndex++) {
     561           0 :     size_t rowIndex = 4;
     562           0 :     const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
     563           0 :     Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax);
     564           0 :     int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
     565           0 :     rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
     566             :   }
     567             : 
     568           0 :   i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
     569             :     rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
     570           0 :     rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
     571             : 
     572           0 :   i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
     573             :     rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
     574           0 :     rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
     575             : 
     576             :   i32x4_t rowsBias_v =
     577           0 :     simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
     578             : 
     579           0 :   for (int32_t y = 0; y < size.height; y++) {
     580           0 :     for (int32_t x = 0; x < size.width; x += 4) {
     581           0 :       MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position");
     582           0 :       MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position");
     583           0 :       int32_t sourceIndex = y * sourceStride + 4 * x;
     584           0 :       int32_t targetIndex = y * targetStride + 4 * x;
     585             : 
     586             :       // We load 4 pixels, unpack them, process them 1 pixel at a time, and
     587             :       // finally pack and store the 4 result pixels.
     588             : 
     589           0 :       u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
     590             : 
     591             :       // Splat needed to get each pixel twice into i16x8
     592           0 :       i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
     593           0 :       i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
     594           0 :       i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
     595           0 :       i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
     596             : 
     597           0 :       i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
     598           0 :       i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
     599           0 :       i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
     600           0 :       i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
     601             : 
     602             :       static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor.");
     603           0 :       u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1),
     604             :                                                         simd::ShiftRight32<7>(result_p2),
     605             :                                                         simd::ShiftRight32<7>(result_p3),
     606           0 :                                                         simd::ShiftRight32<7>(result_p4));
     607           0 :       simd::Store8(&targetData[targetIndex], result_p1234);
     608             :     }
     609             :   }
     610             : 
     611           0 :   return target.forget();
     612             : }
     613             : 
     614             : // source / dest: bgra bgra
     615             : // sourceAlpha / destAlpha: aaaa aaaa
     616             : // result: bgra bgra
     617             : template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
     618             : static inline u16x8_t
     619           0 : CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha)
     620             : {
     621           0 :   u16x8_t x255 = simd::FromU16<u16x8_t>(255);
     622             : 
     623             :   switch (aCompositeOperator) {
     624             : 
     625             :     case COMPOSITE_OPERATOR_OVER:
     626             :     {
     627             :       // val = dest * (255 - sourceAlpha) + source * 255;
     628           0 :       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
     629             : 
     630           0 :       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
     631           0 :       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
     632           0 :       i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
     633             : 
     634           0 :       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
     635           0 :       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
     636           0 :       i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
     637             : 
     638           0 :       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
     639           0 :                                           simd::FastDivideBy255(result2));
     640             :     }
     641             : 
     642             :     case COMPOSITE_OPERATOR_IN:
     643             :     {
     644             :       // val = source * destAlpha;
     645           0 :       return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
     646             :     }
     647             : 
     648             :     case COMPOSITE_OPERATOR_OUT:
     649             :     {
     650             :       // val = source * (255 - destAlpha);
     651           0 :       u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
     652           0 :       return simd::FastDivideBy255_16(prod);
     653             :     }
     654             : 
     655             :     case COMPOSITE_OPERATOR_ATOP:
     656             :     {
     657             :       // val = dest * (255 - sourceAlpha) + source * destAlpha;
     658           0 :       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
     659             : 
     660           0 :       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
     661           0 :       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
     662           0 :       i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
     663             : 
     664           0 :       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
     665           0 :       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
     666           0 :       i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
     667             : 
     668           0 :       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
     669           0 :                                           simd::FastDivideBy255(result2));
     670             :     }
     671             : 
     672             :     case COMPOSITE_OPERATOR_XOR:
     673             :     {
     674             :       // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
     675           0 :       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
     676           0 :       u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
     677             : 
     678           0 :       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
     679             :       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
     680           0 :                                                      twoFiftyFiveMinusDestAlpha);
     681           0 :       i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
     682             : 
     683           0 :       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
     684             :       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
     685           0 :                                                      twoFiftyFiveMinusDestAlpha);
     686           0 :       i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
     687             : 
     688           0 :       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
     689           0 :                                           simd::FastDivideBy255(result2));
     690             :     }
     691             : 
     692             :     default:
     693             :       return simd::FromU16<u16x8_t>(0);
     694             : 
     695             :   }
     696             : }
     697             : 
     698             : template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
     699             : static void
     700           0 : ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest)
     701             : {
     702           0 :   IntSize size = aDest->GetSize();
     703             : 
     704           0 :   uint8_t* sourceData = aSource->GetData();
     705           0 :   uint8_t* destData = aDest->GetData();
     706           0 :   uint32_t sourceStride = aSource->Stride();
     707           0 :   uint32_t destStride = aDest->Stride();
     708             : 
     709           0 :   for (int32_t y = 0; y < size.height; y++) {
     710           0 :     for (int32_t x = 0; x < size.width; x += 4) {
     711           0 :       uint32_t sourceIndex = y * sourceStride + 4 * x;
     712           0 :       uint32_t destIndex = y * destStride + 4 * x;
     713             : 
     714           0 :       u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
     715           0 :       u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
     716             : 
     717           0 :       u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
     718           0 :       u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
     719           0 :       u16x8_t sa12 = simd::Splat16<3,3>(s12);
     720           0 :       u16x8_t da12 = simd::Splat16<3,3>(d12);
     721           0 :       u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12);
     722             : 
     723           0 :       u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
     724           0 :       u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
     725           0 :       u16x8_t sa34 = simd::Splat16<3,3>(s34);
     726           0 :       u16x8_t da34 = simd::Splat16<3,3>(d34);
     727           0 :       u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34);
     728             : 
     729           0 :       u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
     730           0 :       simd::Store8(&destData[destIndex], result1234);
     731             :     }
     732             :   }
     733           0 : }
     734             : 
     735             : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
     736             : static void
     737           0 : ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest,
     738             :                       CompositeOperator aOperator)
     739             : {
     740           0 :   switch (aOperator) {
     741             :     case COMPOSITE_OPERATOR_OVER:
     742           0 :       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest);
     743           0 :       break;
     744             :     case COMPOSITE_OPERATOR_IN:
     745           0 :       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest);
     746           0 :       break;
     747             :     case COMPOSITE_OPERATOR_OUT:
     748           0 :       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest);
     749           0 :       break;
     750             :     case COMPOSITE_OPERATOR_ATOP:
     751           0 :       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest);
     752           0 :       break;
     753             :     case COMPOSITE_OPERATOR_XOR:
     754           0 :       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest);
     755           0 :       break;
     756             :     default:
     757           0 :       MOZ_CRASH("GFX: Incomplete switch");
     758             :   }
     759           0 : }
     760             : 
     761             : template<typename u8x16_t>
     762             : static void
     763           0 : SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride,
     764             :                            uint8_t* channel0Data, uint8_t* channel1Data,
     765             :                            uint8_t* channel2Data, uint8_t* channel3Data,
     766             :                            int32_t channelStride)
     767             : {
     768           0 :   for (int32_t y = 0; y < size.height; y++) {
     769           0 :     for (int32_t x = 0; x < size.width; x += 16) {
     770             :       // Process 16 pixels at a time.
     771           0 :       int32_t sourceIndex = y * sourceStride + 4 * x;
     772           0 :       int32_t targetIndex = y * channelStride + x;
     773             : 
     774           0 :       u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
     775           0 :       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
     776           0 :       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
     777           0 :       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
     778             : 
     779           0 :       bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
     780           0 :       if (4 * (x + 4) < sourceStride) {
     781           0 :         bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
     782             :       }
     783           0 :       if (4 * (x + 8) < sourceStride) {
     784           0 :         bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
     785             :       }
     786           0 :       if (4 * (x + 12) < sourceStride) {
     787           0 :         bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
     788             :       }
     789             : 
     790           0 :       u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
     791           0 :       u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
     792           0 :       u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
     793           0 :       u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
     794           0 :       u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
     795           0 :       u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
     796           0 :       u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
     797           0 :       u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
     798           0 :       u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
     799           0 :       u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
     800           0 :       u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
     801           0 :       u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
     802           0 :       u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
     803           0 :       u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
     804           0 :       u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
     805           0 :       u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
     806             : 
     807           0 :       simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
     808           0 :       simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
     809           0 :       simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
     810           0 :       simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
     811             :     }
     812             :   }
     813           0 : }
     814             : 
     815             : template<typename u8x16_t>
     816             : static void
     817           0 : CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
     818             : {
     819           0 :   for (int32_t y = 0; y < size.height; y++) {
     820           0 :     for (int32_t x = 0; x < size.width; x += 16) {
     821             :       // Process 16 pixels at a time.
     822           0 :       int32_t resultIndex = y * resultStride + 4 * x;
     823           0 :       int32_t channelIndex = y * channelStride + x;
     824             : 
     825           0 :       u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
     826           0 :       u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
     827           0 :       u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
     828           0 :       u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
     829             : 
     830           0 :       u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
     831           0 :       u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
     832           0 :       u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
     833           0 :       u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
     834             : 
     835           0 :       u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
     836           0 :       u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
     837           0 :       u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
     838           0 :       u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
     839             : 
     840           0 :       simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
     841           0 :       if (4 * (x + 4) < resultStride) {
     842           0 :         simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
     843             :       }
     844           0 :       if (4 * (x + 8) < resultStride) {
     845           0 :         simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
     846             :       }
     847           0 :       if (4 * (x + 12) < resultStride) {
     848           0 :         simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
     849             :       }
     850             :     }
     851             :   }
     852           0 : }
     853             : 
     854             : 
     855             : template<typename i32x4_t, typename u16x8_t, typename u8x16_t>
     856             : static void
     857           0 : DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
     858             :                                     uint8_t* aTargetData, int32_t aTargetStride,
     859             :                                     uint8_t* aSourceData, int32_t aSourceStride)
     860             : {
     861           0 :   const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff);
     862           0 :   for (int32_t y = 0; y < aSize.height; y++) {
     863           0 :     for (int32_t x = 0; x < aSize.width; x += 4) {
     864           0 :       int32_t inputIndex = y * aSourceStride + 4 * x;
     865           0 :       int32_t targetIndex = y * aTargetStride + 4 * x;
     866             : 
     867           0 :       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
     868           0 :       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
     869           0 :       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
     870             : 
     871             :       // Multiply all components with alpha.
     872           0 :       p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12));
     873           0 :       p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34));
     874             : 
     875             :       // Divide by 255 and pack.
     876           0 :       u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12),
     877           0 :                                                   simd::FastDivideBy255_16(p34));
     878             : 
     879             :       // Get the original alpha channel value back from p1234.
     880           0 :       result = simd::Pick(alphaMask, result, p1234);
     881             : 
     882           0 :       simd::Store8(&aTargetData[targetIndex], result);
     883             :     }
     884             :   }
     885           0 : }
     886             : 
     887             : // We use a table of precomputed factors for unpremultiplying.
     888             : // We want to compute round(r / (alpha / 255.0f)) for arbitrary values of
     889             : // r (a premultiplied color component) and alpha in constant time. This table
     890             : // of factors has the property that (r * sAlphaFactors[alpha] + 128) >> 8
     891             : // roughly gives the result we want (with a maximum deviation of 1).
     892             : //
     893             : // sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha) for alpha > 0.
     894             : // sAlphaFactors[0] == 0; the largest factor, 65280 (alpha == 1), fits in uint16_t.
     895             : // This table has been created using the python code
     896             : // ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
     897             : static const uint16_t sAlphaFactors[256] = {
     898             :   0, 65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528, 5935,
     899             :   5440, 5022, 4663, 4352, 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838, 2720,
     900             :   2611, 2511, 2418, 2331, 2251, 2176, 2106, 2040, 1978, 1920, 1865, 1813, 1764,
     901             :   1718, 1674, 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389, 1360, 1332, 1306,
     902             :   1280, 1255, 1232, 1209, 1187, 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036,
     903             :   1020, 1004, 989, 974, 960, 946, 933, 919, 907, 894, 882, 870, 859, 848, 837,
     904             :   826, 816, 806, 796, 787, 777, 768, 759, 750, 742, 733, 725, 717, 710, 702,
     905             :   694, 687, 680, 673, 666, 659, 653, 646, 640, 634, 628, 622, 616, 610, 604,
     906             :   599, 593, 588, 583, 578, 573, 568, 563, 558, 553, 549, 544, 540, 535, 531,
     907             :   526, 522, 518, 514, 510, 506, 502, 498, 495, 491, 487, 484, 480, 476, 473,
     908             :   470, 466, 463, 460, 457, 453, 450, 447, 444, 441, 438, 435, 432, 429, 427,
     909             :   424, 421, 418, 416, 413, 411, 408, 405, 403, 400, 398, 396, 393, 391, 389,
     910             :   386, 384, 382, 380, 377, 375, 373, 371, 369, 367, 365, 363, 361, 359, 357,
     911             :   355, 353, 351, 349, 347, 345, 344, 342, 340, 338, 336, 335, 333, 331, 330,
     912             :   328, 326, 325, 323, 322, 320, 318, 317, 315, 314, 312, 311, 309, 308, 306,
     913             :   305, 304, 302, 301, 299, 298, 297, 295, 294, 293, 291, 290, 289, 288, 286,
     914             :   285, 284, 283, 281, 280, 279, 278, 277, 275, 274, 273, 272, 271, 270, 269,
     915             :   268, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256
     916             : };
     917             : 
                       // Unpremultiplies pixel data, four 4-byte pixels (16 bytes) per iteration.
                       // For every pixel a factor is looked up in sAlphaFactors using the byte at
                       // B8G8R8A8_COMPONENT_BYTEOFFSET_A. The first three 16-bit lanes of the pixel
                       // are multiplied by that factor and the fourth lane by 1 << 8; every lane is
                       // then rounded with (v + 128) >> 8. A factor of 1 << 8 leaves a lane's value
                       // unchanged.
                       // NOTE(review): this presumably relies on the alpha byte being the fourth
                       // byte of each pixel - confirm against the constant's definition.
                       //
                       // aSize:         width/height, in pixels, of the area to process.
                       // aTargetData:   destination buffer; aTargetStride is its row pitch in bytes.
                       // aSourceData:   source buffer; aSourceStride is its row pitch in bytes.
                       //
                       // NOTE(review): x advances in steps of 4, so when aSize.width is not a
                       // multiple of 4 the last load/store of a row covers pixels beyond
                       // aSize.width; presumably callers guarantee enough row padding - confirm.
     918             : template<typename u16x8_t, typename u8x16_t>
     919             : static void
     920           0 : DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
     921             :                                  uint8_t* aTargetData, int32_t aTargetStride,
     922             :                                  uint8_t* aSourceData, int32_t aSourceStride)
     923             : {
     924           0 :   for (int32_t y = 0; y < aSize.height; y++) {
     925           0 :     for (int32_t x = 0; x < aSize.width; x += 4) {
     926           0 :       int32_t inputIndex = y * aSourceStride + 4 * x;
     927           0 :       int32_t targetIndex = y * aTargetStride + 4 * x;
                             // Overlay the loaded vector with a u8[pixel][byte] view so that the
                             // alpha byte of each of the four pixels can be read individually.
     928             :       union {
     929             :         u8x16_t p1234;
     930             :         uint8_t u8[4][4];
     931           0 :       };
     932           0 :       p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
     933             : 
     934             :       // Prepare the alpha factors: one table lookup per pixel, keyed by its alpha byte.
     935           0 :       uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
     936           0 :       uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
     937           0 :       uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
     938           0 :       uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
     939           0 :       u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
     940           0 :       u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
     941             : 
                             // Widen the bytes to 16-bit lanes: p12 for the first two pixels, p34
                             // for the last two (presumably zero-extended - see SIMD.h).
     942           0 :       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
     943           0 :       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
     944             : 
     945             :       // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits.
                             // NOTE(review): Mul16 presumably keeps only the low 16 bits of each
                             // product, so this relies on color <= alpha (valid premultiplied
                             // input) to stay within 16 bits - confirm.
     946           0 :       p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
     947           0 :       p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
     948             : 
                             // Narrow back to bytes with saturation and write the four pixels out.
     949           0 :       u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
     950           0 :       simd::Store8(&aTargetData[targetIndex], result);
     951             :     }
     952             :   }
     953           0 : }
     954             : 
                       // Maps the runtime values of aType and aStitch onto the matching
                       // compile-time instantiation of SVGTurbulenceRenderer, constructs it from
                       // (aBaseFrequency, aSeed, aNumOctaves, aTileRect) and returns whatever its
                       // Render(aSize, aOffset) call produces.
                       // Returns nullptr when aType matches neither case of the switch.
     955             : template<typename f32x4_t, typename i32x4_t, typename u8x16_t>
     956             : static already_AddRefed<DataSourceSurface>
     957           0 : RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
     958             :                       int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
     959             : {
                       // RETURN_TURBULENCE expands to two statements (a local renderer declaration
                       // followed by a return), so it is only used inside braced blocks below.
     960             : #define RETURN_TURBULENCE(Type, Stitch) \
     961             :   SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \
     962             :     renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
     963             :   return renderer.Render(aSize, aOffset);
     964             : 
     965           0 :   switch (aType) {
     966             :     case TURBULENCE_TYPE_TURBULENCE:
     967             :     {
     968           0 :       if (aStitch) {
     969           0 :         RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
     970             :       }
     971           0 :       RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
     972             :     }
     973             :     case TURBULENCE_TYPE_FRACTAL_NOISE:
     974             :     {
     975           0 :       if (aStitch) {
     976           0 :         RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
     977             :       }
     978           0 :       RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
     979             :     }
     980             :   }
                         // Only reached when aType is not one of the cases handled above.
     981           0 :   return nullptr;
     982             : #undef RETURN_TURBULENCE
     983             : }
     984             : 
     985             : // Evaluates k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 in fixed point, per 16-bit lane.
                       //
                       // in1, in2: the two inputs, one channel value per 16-bit lane (the caller
                       //           in this file passes two pixels' worth of widened bytes).
                       // k1And4:   the k1 and k4 factors interleaved into one vector.
                       // k2And3:   the k2 and k3 factors interleaved into one vector.
                       //
                       // The factors are scaled by 128 (k4 gets its factor of 128 here, through the
                       // constant interleaved with inProd), so the final >> 7 removes the
                       // fractional bits before the result is packed back to 16-bit lanes with
                       // saturation.
     986             : template<typename i32x4_t, typename i16x8_t>
     987             : static MOZ_ALWAYS_INLINE i16x8_t
     988           0 : ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2,
     989             :                            const i16x8_t &k1And4, const i16x8_t &k2And3)
     990             : {
     991             :   // Calculate input product: inProd = (in1 * in2) / 255.
     992             :   i32x4_t inProd_1, inProd_2;
     993           0 :   simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
     994           0 :   i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2));
     995             : 
     996             :   // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
                         // NOTE(review): MulAdd16x8x2To32x4 presumably multiplies corresponding
                         // 16-bit lanes and sums each adjacent pair into one 32-bit lane (like
                         // SSE2 pmaddwd), which is why inProd is interleaved with the constant
                         // 128 - confirm in SIMD.h.
     997           0 :   i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
     998           0 :   i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight);
     999           0 :   i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight);
    1000           0 :   i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
    1001           0 :   i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
    1002             : 
    1003             :   // Calculate k2 * in1 + k3 * in2
    1004           0 :   i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
    1005           0 :   i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
    1006           0 :   i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
    1007           0 :   i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
    1008             : 
    1009             :   // Sum everything up and truncate the fractional part.
    1010           0 :   i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
    1011           0 :   i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
    1012           0 :   return simd::PackAndSaturate32To16(result_1, result_2);
    1013             : }
    1014             : 
                       // Applies result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 to every byte
                       // of the two input surfaces, four pixels (16 bytes) per iteration, and
                       // returns the result in a newly created B8G8R8A8 surface that has the size
                       // of aInput1.
                       //
                       // aInput1, aInput2: the input surfaces; their data pointers and strides are
                       //                   read directly.
                       // aK1..aK4:         the filter coefficients; they are clamped and converted
                       //                   to 16-bit fixed point below.
                       // Returns nullptr if the target surface could not be created.
                       //
                       // NOTE(review): nothing here checks that aInput2 is at least as large as
                       // aInput1, that GetData() returned non-null, or what format the inputs
                       // have; presumably the callers guarantee all of this - confirm.
                       // NOTE(review): x advances in steps of 4, so rows whose width is not a
                       // multiple of 4 are accessed beyond size.width; presumably the row strides
                       // provide enough padding - confirm.
    1015             : template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
    1016             : static already_AddRefed<DataSourceSurface>
    1017           0 : ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
    1018             :                             Float aK1, Float aK2, Float aK3, Float aK4)
    1019             : {
    1020           0 :   IntSize size = aInput1->GetSize();
    1021             :   RefPtr<DataSourceSurface> target =
    1022           0 :   Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
    1023           0 :   if (!target) {
    1024           0 :     return nullptr;
    1025             :   }
    1026             : 
    1027           0 :   uint8_t* source1Data = aInput1->GetData();
    1028           0 :   uint8_t* source2Data = aInput2->GetData();
    1029           0 :   uint8_t* targetData = target->GetData();
    1030           0 :   uint32_t source1Stride = aInput1->Stride();
    1031           0 :   uint32_t source2Stride = aInput2->Stride();
    1032           0 :   uint32_t targetStride = target->Stride();
    1033             : 
    1034             :   // The arithmetic combine filter does the following calculation:
    1035             :   // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
    1036             :   //
    1037             :   // Or, with in1/2 integers between 0 and 255:
    1038             :   // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
    1039             :   //
    1040             :   // We want the whole calculation to happen in integer, with 16-bit factors.
    1041             :   // So we convert our factors to fixed-point with precision 1.8.7.
    1042             :   // K4 is premultiplied with 255, and it will be multiplied with 128 later
    1043             :   // during the actual calculation, because premultiplying it with 255 * 128
    1044             :   // would overflow int16.
    1045             : 
                         // floorf(v + 0.5f) rounds to the nearest integer. The clamps keep the
                         // scaled values inside the int16 range: 255 * 128 == 128 * 255 == 32640.
    1046           0 :   i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
    1047           0 :   i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
    1048           0 :   i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
    1049           0 :   i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
    1050             : 
                         // Pair up the factors in the form ArithmeticCombineTwoPixels takes them.
    1051           0 :   i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
    1052           0 :   i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
    1053             : 
    1054           0 :   for (int32_t y = 0; y < size.height; y++) {
    1055           0 :     for (int32_t x = 0; x < size.width; x += 4) {
    1056           0 :       uint32_t source1Index = y * source1Stride + 4 * x;
    1057           0 :       uint32_t source2Index = y * source2Stride + 4 * x;
    1058           0 :       uint32_t targetIndex = y * targetStride + 4 * x;
    1059             : 
    1060             :       // Load and unpack.
    1061           0 :       u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
    1062           0 :       u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
    1063           0 :       i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
    1064           0 :       i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
    1065           0 :       i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
    1066           0 :       i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
    1067             : 
    1068             :       // Multiply and add.
    1069           0 :       i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3);
    1070           0 :       i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3);
    1071             : 
    1072             :       // Pack and store.
    1073           0 :       simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34));
    1074             :     }
    1075             :   }
    1076             : 
    1077           0 :   return target.forget();
    1078             : }
    1079             : 
    1080             : } // namespace gfx
    1081             : } // namespace mozilla

Generated by: LCOV version 1.13