LCOV - code coverage report
Current view: top level - media/libpng/intel - filter_sse2_intrinsics.c (source / functions) Hit Total Coverage
Test: output.info Lines: 59 151 39.1 %
Date: 2017-07-14 16:53:18 Functions: 7 12 58.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : 
       2             : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
       3             :  *
       4             :  * Copyright (c) 2016-2017 Glenn Randers-Pehrson
       5             :  * Written by Mike Klein and Matt Sarett
       6             :  * Derived from arm/filter_neon_intrinsics.c
       7             :  *
       8             :  * Last changed in libpng 1.6.29 [March 16, 2017]
       9             :  *
      10             :  * This code is released under the libpng license.
      11             :  * For conditions of distribution and use, see the disclaimer
      12             :  * and license in png.h
      13             :  */
      14             : 
      15             : #include "../pngpriv.h"
      16             : 
      17             : #ifdef PNG_READ_SUPPORTED
      18             : 
      19             : #if PNG_INTEL_SSE_IMPLEMENTATION > 0
      20             : 
      21             : #include <immintrin.h>
      22             : 
      23             : /* Functions in this file use at most 3 pixels (a,b,c) to predict the 4th (d).
      24             :  * They're positioned like this:
      25             :  *    prev:  c b
      26             :  *    row:   a d
      27             :  * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
      28             :  * whichever of a, b, or c is closest to p=a+b-c.
      29             :  */
      30             : 
      31       34310 : static __m128i load4(const void* p) {
      32       68620 :    return _mm_cvtsi32_si128(*(const int*)p);
      33             : }
      34             : 
      35       21204 : static void store4(void* p, __m128i v) {
      36       21204 :    *(int*)p = _mm_cvtsi128_si32(v);
      37       21204 : }
      38             : 
      39           0 : static __m128i load3(const void* p) {
      40             :    /* We'll load 2 bytes, then 1 byte,
      41             :     * then OR them together, and finally load into SSE.
      42             :     */
      43           0 :    const png_uint_16* p01 = p;
      44           0 :    const png_byte*    p2  = (const png_byte*)(p01+1);
      45             : 
      46           0 :    png_uint_32 v012 = (png_uint_32)(*p01)
      47           0 :                     | (png_uint_32)(*p2) << 16;
      48           0 :    return load4(&v012);
      49             : }
      50             : 
      51           0 : static void store3(void* p, __m128i v) {
      52             :    /* We'll pull from SSE as a 32-bit int, then write
      53             :     * its bottom two bytes, then its third byte.
      54             :     */
      55             :    png_uint_32 v012;
      56           0 :    store4(&v012, v);
      57             : 
      58           0 :    png_uint_16* p01 = p;
      59           0 :    png_byte*    p2  = (png_byte*)(p01+1);
      60           0 :    *p01 = v012;
      61           0 :    *p2  = v012 >> 16;
      62           0 : }
      63             : 
      64           0 : void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
      65             :    png_const_bytep prev)
      66             : {
      67             :    /* The Sub filter predicts each pixel as the previous pixel, a.
      68             :     * There is no pixel to the left of the first pixel.  It's encoded directly.
      69             :     * That works with our main loop if we just say that left pixel was zero.
      70             :     */
      71             :    png_debug(1, "in png_read_filter_row_sub3_sse2");
      72           0 :    __m128i a, d = _mm_setzero_si128();
      73             : 
      74           0 :    int rb = row_info->rowbytes;
      75           0 :    while (rb >= 4) {
      76           0 :       a = d; d = load4(row);
      77           0 :       d = _mm_add_epi8(d, a);
      78           0 :       store3(row, d);
      79             : 
      80           0 :       row += 3;
      81           0 :       rb  -= 3;
      82             :    }
      83           0 :    if (rb > 0) {
      84           0 :       a = d; d = load3(row);
      85           0 :       d = _mm_add_epi8(d, a);
      86           0 :       store3(row, d);
      87             : 
      88           0 :       row += 3;
      89           0 :       rb  -= 3;
      90             :    }
      91           0 : }
      92             : 
      93         261 : void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
      94             :    png_const_bytep prev)
      95             : {
      96             :    /* The Sub filter predicts each pixel as the previous pixel, a.
      97             :     * There is no pixel to the left of the first pixel.  It's encoded directly.
      98             :     * That works with our main loop if we just say that left pixel was zero.
      99             :     */
     100             :    png_debug(1, "in png_read_filter_row_sub4_sse2");
     101         261 :    __m128i a, d = _mm_setzero_si128();
     102             : 
     103         261 :    int rb = row_info->rowbytes;
     104        8620 :    while (rb > 0) {
     105        8098 :       a = d; d = load4(row);
     106        8098 :       d = _mm_add_epi8(d, a);
     107        8098 :       store4(row, d);
     108             : 
     109        8098 :       row += 4;
     110        8098 :       rb  -= 4;
     111             :    }
     112         261 : }
     113             : 
     114           0 : void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
     115             :    png_const_bytep prev)
     116             : {
     117             :    /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     118             :     * There's no pixel to the left of the first pixel.  Luckily, it's
     119             :     * predicted to be half of the pixel above it.  So again, this works
     120             :     * perfectly with our loop if we make sure a starts at zero.
     121             :     */
     122             :    png_debug(1, "in png_read_filter_row_avg3_sse2");
     123           0 :    const __m128i zero = _mm_setzero_si128();
     124             :    __m128i    b;
     125           0 :    __m128i a, d = zero;
     126             : 
     127           0 :    int rb = row_info->rowbytes;
     128           0 :    while (rb >= 4) {
     129           0 :              b = load4(prev);
     130           0 :       a = d; d = load4(row );
     131             : 
     132             :       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
     133           0 :       __m128i avg = _mm_avg_epu8(a,b);
     134             :       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     135           0 :       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
     136             :                                             _mm_set1_epi8(1)));
     137           0 :       d = _mm_add_epi8(d, avg);
     138           0 :       store3(row, d);
     139             : 
     140           0 :       prev += 3;
     141           0 :       row  += 3;
     142           0 :       rb   -= 3;
     143             :    }
     144           0 :    if (rb > 0) {
     145           0 :              b = load3(prev);
     146           0 :       a = d; d = load3(row );
     147             : 
     148             :       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
     149           0 :       __m128i avg = _mm_avg_epu8(a,b);
     150             :       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     151           0 :       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
     152             :                                             _mm_set1_epi8(1)));
     153             : 
     154           0 :       d = _mm_add_epi8(d, avg);
     155           0 :       store3(row, d);
     156             : 
     157           0 :       prev += 3;
     158           0 :       row  += 3;
     159           0 :       rb   -= 3;
     160             :    }
     161           0 : }
     162             : 
     163          57 : void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
     164             :    png_const_bytep prev)
     165             : {
     166             :    /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     167             :     * There's no pixel to the left of the first pixel.  Luckily, it's
     168             :     * predicted to be half of the pixel above it.  So again, this works
     169             :     * perfectly with our loop if we make sure a starts at zero.
     170             :     */
     171             :    png_debug(1, "in png_read_filter_row_avg4_sse2");
     172          57 :    const __m128i zero = _mm_setzero_si128();
     173             :    __m128i    b;
     174          57 :    __m128i a, d = zero;
     175             : 
     176          57 :    int rb = row_info->rowbytes;
     177        1917 :    while (rb > 0) {
     178        1803 :              b = load4(prev);
     179        1803 :       a = d; d = load4(row );
     180             : 
     181             :       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
     182        1803 :       __m128i avg = _mm_avg_epu8(a,b);
     183             :       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     184        7212 :       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
     185             :                                             _mm_set1_epi8(1)));
     186             : 
     187        1803 :       d = _mm_add_epi8(d, avg);
     188        1803 :       store4(row, d);
     189             : 
     190        1803 :       prev += 4;
     191        1803 :       row  += 4;
     192        1803 :       rb   -= 4;
     193             :    }
     194          57 : }
     195             : 
     196             : /* Returns |x| for 16-bit lanes. */
     197       33909 : static __m128i abs_i16(__m128i x) {
     198             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
     199             :    return _mm_abs_epi16(x);
     200             : #else
     201             :    /* Read this all as, return x<0 ? -x : x.
     202             :    * To negate two's complement, you flip all the bits then add 1.
     203             :     */
     204       67818 :    __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
     205             : 
     206             :    /* Flip negative lanes. */
     207       33909 :    x = _mm_xor_si128(x, is_negative);
     208             : 
     209             :    /* +1 to negative lanes, else +0. */
     210       33909 :    x = _mm_sub_epi16(x, is_negative);
     211       33909 :    return x;
     212             : #endif
     213             : }
     214             : 
     215             : /* Bytewise c ? t : e. */
     216       22606 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
     217             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
     218             :    return _mm_blendv_epi8(e,t,c);
     219             : #else
     220       67818 :    return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
     221             : #endif
     222             : }
     223             : 
     224           0 : void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
     225             :    png_const_bytep prev)
     226             : {
     227             :    /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     228             :     * and two pixels from the previous row, b and c:
     229             :     *   prev: c b
     230             :     *   row:  a d
     231             :     * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     232             :     * p=a+b-c.
     233             :     *
     234             :     * The first pixel has no left context, and so uses an Up filter, p = b.
     235             :     * This works naturally with our main loop's p = a+b-c if we force a and c
     236             :     * to zero.
     237             :     * Here we zero b and d, which become c and a respectively at the start of
     238             :     * the loop.
     239             :     */
     240             :    png_debug(1, "in png_read_filter_row_paeth3_sse2");
     241           0 :    const __m128i zero = _mm_setzero_si128();
     242           0 :    __m128i c, b = zero,
     243           0 :            a, d = zero;
     244             : 
     245           0 :    int rb = row_info->rowbytes;
     246           0 :    while (rb >= 4) {
     247             :       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     248             :        * intermediates.
     249             :        */
     250           0 :       c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
     251           0 :       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
     252             : 
     253             :       /* (p-a) == (a+b-c - a) == (b-c) */
     254           0 :       __m128i pa = _mm_sub_epi16(b,c);
     255             : 
     256             :       /* (p-b) == (a+b-c - b) == (a-c) */
     257           0 :       __m128i pb = _mm_sub_epi16(a,c);
     258             : 
     259             :       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     260           0 :       __m128i pc = _mm_add_epi16(pa,pb);
     261             : 
     262           0 :       pa = abs_i16(pa);  /* |p-a| */
     263           0 :       pb = abs_i16(pb);  /* |p-b| */
     264           0 :       pc = abs_i16(pc);  /* |p-c| */
     265             : 
     266           0 :       __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     267             : 
     268             :       /* Paeth breaks ties favoring a over b over c. */
     269           0 :       __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     270             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
     271             :                                                                      c));
     272             : 
     273             :       /* Note `_epi8`: we need addition to wrap modulo 256. */
     274           0 :       d = _mm_add_epi8(d, nearest);
     275           0 :       store3(row, _mm_packus_epi16(d,d));
     276             : 
     277           0 :       prev += 3;
     278           0 :       row  += 3;
     279           0 :       rb   -= 3;
     280             :    }
     281           0 :    if (rb > 0) {
     282             :       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     283             :        * intermediates.
     284             :        */
     285           0 :       c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
     286           0 :       a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
     287             : 
     288             :       /* (p-a) == (a+b-c - a) == (b-c) */
     289           0 :       __m128i pa = _mm_sub_epi16(b,c);
     290             : 
     291             :       /* (p-b) == (a+b-c - b) == (a-c) */
     292           0 :       __m128i pb = _mm_sub_epi16(a,c);
     293             : 
     294             :       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     295           0 :       __m128i pc = _mm_add_epi16(pa,pb);
     296             : 
     297           0 :       pa = abs_i16(pa);  /* |p-a| */
     298           0 :       pb = abs_i16(pb);  /* |p-b| */
     299           0 :       pc = abs_i16(pc);  /* |p-c| */
     300             : 
     301           0 :       __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     302             : 
     303             :       /* Paeth breaks ties favoring a over b over c. */
     304           0 :       __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     305             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
     306             :                                                                      c));
     307             : 
     308             :       /* Note `_epi8`: we need addition to wrap modulo 256. */
     309           0 :       d = _mm_add_epi8(d, nearest);
     310           0 :       store3(row, _mm_packus_epi16(d,d));
     311             : 
     312           0 :       prev += 3;
     313           0 :       row  += 3;
     314           0 :       rb   -= 3;
     315             :    }
     316           0 : }
     317             : 
     318         358 : void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
     319             :    png_const_bytep prev)
     320             : {
     321             :    /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     322             :     * and two pixels from the previous row, b and c:
     323             :     *   prev: c b
     324             :     *   row:  a d
     325             :     * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     326             :     * p=a+b-c.
     327             :     *
     328             :     * The first pixel has no left context, and so uses an Up filter, p = b.
     329             :     * This works naturally with our main loop's p = a+b-c if we force a and c
     330             :     * to zero.
     331             :     * Here we zero b and d, which become c and a respectively at the start of
     332             :     * the loop.
     333             :     */
     334             :    png_debug(1, "in png_read_filter_row_paeth4_sse2");
     335         358 :    const __m128i zero = _mm_setzero_si128();
     336         358 :    __m128i c, b = zero,
     337         358 :            a, d = zero;
     338             : 
     339         358 :    int rb = row_info->rowbytes;
     340       12019 :    while (rb > 0) {
     341             :       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     342             :        * intermediates.
     343             :        */
     344       22606 :       c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
     345       22606 :       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
     346             : 
     347             :       /* (p-a) == (a+b-c - a) == (b-c) */
     348       11303 :       __m128i pa = _mm_sub_epi16(b,c);
     349             : 
     350             :       /* (p-b) == (a+b-c - b) == (a-c) */
     351       11303 :       __m128i pb = _mm_sub_epi16(a,c);
     352             : 
     353             :       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     354       11303 :       __m128i pc = _mm_add_epi16(pa,pb);
     355             : 
     356       11303 :       pa = abs_i16(pa);  /* |p-a| */
     357       11303 :       pb = abs_i16(pb);  /* |p-b| */
     358       11303 :       pc = abs_i16(pc);  /* |p-c| */
     359             : 
     360       22606 :       __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     361             : 
     362             :       /* Paeth breaks ties favoring a over b over c. */
     363       22606 :       __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     364             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
     365             :                                                                      c));
     366             : 
     367             :       /* Note `_epi8`: we need addition to wrap modulo 256. */
     368       11303 :       d = _mm_add_epi8(d, nearest);
     369       11303 :       store4(row, _mm_packus_epi16(d,d));
     370             : 
     371       11303 :       prev += 4;
     372       11303 :       row  += 4;
     373       11303 :       rb   -= 4;
     374             :    }
     375         358 : }
     376             : 
     377             : #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
     378             : #endif /* READ */

Generated by: LCOV version 1.13