/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder lets us write the sum of two
 * numbers as:
 * R = S + 2C, or, in the case of a and b, (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * pairwise carries, i.e. (a & b) | (a & c) | (b & c), since the sum of 3
 * bits can only ever produce a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1 or, provided
 * overflow and underflow are avoided, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4 input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or, reversing the derivation:
 * avg = (((sum >> 1) + carry + (d >> 1)) >> 1)
 * avg = ((((a + b + c) >> 1) + (d >> 1)) >> 1)
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average into a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
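
/* A worked instance of the identity above on one byte (values chosen purely
 * for illustration): with a = 10 and b = 13, f(a, b) = 12 (rounded) while
 * g(a, b) = 11 (truncated). Taking the 8-bit complements ~a = 245 and
 * ~b = 242 gives f(~a, ~b) = 244, and ~244 = 11 == g(a, b). The hypothetical
 * helper below spells this out for a single byte lane; it is only a sketch of
 * the trick and is not used by the SIMD code in this file.
 */
MOZ_ALWAYS_INLINE uint32_t TruncatedAvgOfByte_Example(uint32_t a, uint32_t b)
{
  // Model of what _mm_avg_epu8 does to one byte lane: a rounded average,
  // applied here to the complemented inputs.
  uint32_t rounded = ((0xff & ~a) + (0xff & ~b) + 1) >> 1;
  // Complementing the result turns the rounded average into the truncated
  // one: ~f(~a, ~b) == (a + b) >> 1 for 0 <= a, b <= 255.
  return 0xff & ~rounded;
}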

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here because MSVC does not allow passing more than
 * three __m128i arguments on the stack, nor does it allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010; it does -not- inline
 * with just the inline directive.
 */
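// avg_sse2_8x2 consumes eight 32-bit pixels from each of two rows (a/b hold
// the upper row, c/d the lower row) and returns the four pixels that are the
// truncated averages of the 2x2 blocks, using the sum/carry scheme described
// at the top of this file.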
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile-time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)));

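  // shuf1 gathers the even-indexed 32-bit lanes of its two inputs and shuf2
  // the odd-indexed ones, so after the four shuffles below *a and *c hold the
  // left pixel of each horizontal pair while *b and *d hold the right pixel,
  // lane for lane, for the upper and lower row respectively.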
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

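// avg_sse2_4x2_4x1 returns the truncated per-byte average of two registers of
// four 32-bit pixels each; with a holding a row of pixels and b the row below
// it, this collapses a 4x2 block into the corresponding 4x1 output row.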
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

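// avg_sse2_8x1_4x1 takes eight horizontally consecutive 32-bit pixels (split
// across a and b) and returns the four truncated averages of each adjacent
// pair: the shuffles separate the odd- and even-indexed pixels before the
// per-byte average.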
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

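// Scalar counterpart of the 2x2 averaging above: the truncated average of
// four 32-bit pixels, computed per byte within a 32-bit word using the same
// sum/carry decomposition.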
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-based average instruction we have to mask off the low bit
  // of each byte before shifting, so that it cannot spill into the byte below
  // it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
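
/* A worked instance of the masked average used by Avg2 and Avg2x2, with
 * arbitrary example values: averaging a = 0xff010203 and b = 0x01030507
 * per byte gives
 *   sum                     = a ^ b = 0xfe020704
 *   carry                   = a & b = 0x01010003
 *   (sum & 0xfefefefe) >> 1 = 0xfe020604 >> 1 = 0x7f010302
 * (the mask clears the low bit of each byte so the shift cannot move it into
 * the neighbouring byte), and
 *   result = 0x7f010302 + 0x01010003 = 0x80020305,
 * which is the truncated per-byte average: (0xff + 0x01) >> 1 = 0x80,
 * (0x01 + 0x03) >> 1 = 0x02, (0x02 + 0x05) >> 1 = 0x03,
 * (0x03 + 0x07) >> 1 = 0x05.
 */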

namespace mozilla {
namespace gfx {

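// Halves the image in both dimensions: each destination pixel is the
// truncated average of a 2x2 block of source pixels. The bulk of each row is
// processed eight source pixels at a time with SSE2, picking aligned or
// unaligned loads per row; any remaining pixels fall back to the scalar
// Avg2x2.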
void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)upperRow + 1);
        __m128i c = _mm_load_si128((__m128i*)lowerRow);
        __m128i d = _mm_load_si128((__m128i*)lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we would only have to do this in the last row, since
    // overrunning by up to 8 pixels in an earlier row appears to be harmless:
    // it doesn't touch invalid memory, even when reading from and writing to
    // the same surface. In practice we only do this when doing an additional
    // downscale pass, and in that situation we have unused stride to write
    // into harmlessly. I do not believe the additional code complexity would
    // be worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

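// Halves the image vertically only: each destination pixel is the truncated
// average of a source pixel and the pixel directly below it; the width is
// left unchanged.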
void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // The lower row is not 16-byte aligned, so it has to be loaded
      // unaligned.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous function apply.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

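// Halves the image horizontally only: each destination pixel is the truncated
// average of two horizontally adjacent source pixels; the height is left
// unchanged.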
void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous functions apply.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

} // namespace gfx
} // namespace mozilla