LCOV - code coverage report
Current view: top level - gfx/qcms - transform-sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 60 118 50.8 %
Date: 2017-07-14 16:53:18 Functions: 1 2 50.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : #include <emmintrin.h>
       2             : 
       3             : #include "qcmsint.h"
       4             : 
       5             : /* Clamp/scale constants pre-replicated across four float lanes so a single load fills an XMM register (instead of a load-scalar/shufps sequence). FLOATSCALE is PRECACHE_OUTPUT_SIZE as a float; CLAMPMAXVAL is (PRECACHE_OUTPUT_SIZE - 1) / PRECACHE_OUTPUT_SIZE. ALIGN and PRECACHE_OUTPUT_SIZE are defined outside this file (presumably in qcmsint.h; ALIGN presumably requests 16-byte alignment -- confirm). */
       6             : #define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
       7             : #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
       8             : static const ALIGN float floatScaleX4[4] =
       9             :     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
      10             : static const ALIGN float clampMaxValueX4[4] =
      11             :     { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
      12             : /* Converts `length` pixels of 3 bytes each from src into dest: per-channel input gamma table lookup, multiply-accumulate against the three rows of transform->matrix using SSE2, clamp and scale, then per-channel output table lookup. */
      13           0 : void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
      14             :                                           unsigned char *src, /* input pixels; 3 bytes are consumed per pixel */
      15             :                                           unsigned char *dest, /* output pixels; advanced by RGB_OUTPUT_COMPONENTS bytes per pixel */
      16             :                                           size_t length) /* number of pixels to convert; 0 returns without touching src or dest */
      17             : {
      18             :     unsigned int i; /* NOTE(review): may be narrower than size_t `length`; confirm callers never pass more than UINT_MAX pixels */
      19           0 :     float (*mat)[4] = transform->matrix;
      20             :     char input_back[32]; /* raw backing store for the aligned scratch slot computed below */
      21             :     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
      22             :      * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
      23             :      * because they don't work on stack variables. gcc 4.4 does do the right thing
      24             :      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
      25           0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); /* rounding &input_back[16] down to a 16-byte boundary leaves 16 usable bytes inside input_back */
      26             :     /* share input and output locations to save having to keep the
      27             :      * locations in separate registers; in this function only `output` is dereferenced */
      28           0 :     uint32_t const * output = (uint32_t*)input;
      29             : 
      30             :     /* cache the per-channel input gamma tables (floats indexed by source byte) so *transform is not dereferenced in the loop */
      31           0 :     const float *igtbl_r = transform->input_gamma_table_r;
      32           0 :     const float *igtbl_g = transform->input_gamma_table_g;
      33           0 :     const float *igtbl_b = transform->input_gamma_table_b;
      34             : 
      35             :     /* cache the per-channel output table bytes, which are indexed by the converted integer results */
      36           0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
      37           0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
      38           0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
      39             : 
      40             :     /* the three matrix rows are loop-invariant; _mm_load_ps is an aligned load, so this assumes transform->matrix is 16-byte aligned (declared outside this file -- confirm) */
      41           0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
      42           0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
      43           0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
      44             : 
      45             :     /* loop-invariant clamp bounds (0 and CLAMPMAXVAL) and scale factor (FLOATSCALE) */
      46           0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
      47           0 :     const __m128 min   = _mm_setzero_ps();
      48           0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
      49             : 
      50             :     /* working variables */
      51             :     __m128 vec_r, vec_g, vec_b, result;
      52             : 
      53             :     /* nothing to convert; this also keeps the length-- below from wrapping around */
      54           0 :     if (!length)
      55           0 :         return;
      56             : 
      57             :     /* one pixel is handled outside of the loop, so the loop's preload of the next pixel never reads past the last pixel */
      58           0 :     length--;
      59             : 
      60             :     /* setup for transforming 1st pixel: load its gamma table values into the low lane of each vector */
      61           0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      62           0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      63           0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      64           0 :     src += 3;
      65             : 
      66             :     /* transform all but final pixel */
      67             : 
      68           0 :     for (i=0; i<length; i++)
      69             :     {
      70             :         /* broadcast each channel's low-lane gamma value to all four lanes */
      71           0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
      72           0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
      73           0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
      74             : 
      75             :         /* gamma * matrix: scale each matrix row by its channel value */
      76           0 :         vec_r = _mm_mul_ps(vec_r, mat0);
      77           0 :         vec_g = _mm_mul_ps(vec_g, mat1);
      78           0 :         vec_b = _mm_mul_ps(vec_b, mat2);
      79             : 
      80             :         /* sum the three products, clamp to [0, CLAMPMAXVAL], then scale by FLOATSCALE */
      81           0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
      82           0 :         vec_r  = _mm_max_ps(min, vec_r);
      83           0 :         vec_r  = _mm_min_ps(max, vec_r);
      84           0 :         result = _mm_mul_ps(vec_r, scale);
      85             : 
      86             :         /* convert to 32-bit integers and store all four lanes to the aligned scratch slot */
      87           0 :         _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
      88             : 
      89             :         /* load the next pixel's gamma values (the final pixel on the last pass) while the store completes */
      90           0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      91           0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      92           0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      93           0 :         src += 3;
      94             : 
      95             :         /* scratch lanes 0..2 index the R, G and B output tables; lane 3 is never read */
      96           0 :         dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
      97           0 :         dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
      98           0 :         dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
      99           0 :         dest += RGB_OUTPUT_COMPONENTS;
     100             :     }
     101             : 
     102             :     /* handle final (maybe only) pixel: same steps as the loop body, without the preload or pointer advance */
     103             : 
     104           0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     105           0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     106           0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     107             : 
     108           0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     109           0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     110           0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     111             : 
     112           0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     113           0 :     vec_r  = _mm_max_ps(min, vec_r);
     114           0 :     vec_r  = _mm_min_ps(max, vec_r);
     115           0 :     result = _mm_mul_ps(vec_r, scale);
     116             : 
     117           0 :     _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
     118             : 
     119           0 :     dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
     120           0 :     dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     121           0 :     dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     122             : }
     123             : /* Converts `length` pixels of 4 bytes each from src into dest: the first three bytes go through per-channel input gamma lookup, an SSE2 multiply-accumulate against the three rows of transform->matrix, clamp/scale and per-channel output table lookup; the fourth (alpha) byte is copied through unchanged. */
     124          42 : void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
     125             :                                            unsigned char *src, /* input pixels; 4 bytes are consumed per pixel */
     126             :                                            unsigned char *dest, /* output pixels; advanced by RGBA_OUTPUT_COMPONENTS bytes per pixel */
     127             :                                            size_t length) /* number of pixels to convert; 0 returns without touching src or dest */
     128             : {
     129             :     unsigned int i; /* NOTE(review): may be narrower than size_t `length`; confirm callers never pass more than UINT_MAX pixels */
     130          42 :     float (*mat)[4] = transform->matrix;
     131             :     char input_back[32]; /* raw backing store for the aligned scratch slot computed below */
     132             :     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     133             :      * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     134             :      * because they don't work on stack variables. gcc 4.4 does do the right thing
     135             :      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
     136          42 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); /* rounding &input_back[16] down to a 16-byte boundary leaves 16 usable bytes inside input_back */
     137             :     /* share input and output locations to save having to keep the
     138             :      * locations in separate registers; in this function only `output` is dereferenced */
     139          42 :     uint32_t const * output = (uint32_t*)input;
     140             : 
     141             :     /* cache the per-channel input gamma tables (floats indexed by source byte) so *transform is not dereferenced in the loop */
     142          42 :     const float *igtbl_r = transform->input_gamma_table_r;
     143          42 :     const float *igtbl_g = transform->input_gamma_table_g;
     144          42 :     const float *igtbl_b = transform->input_gamma_table_b;
     145             : 
     146             :     /* cache the per-channel output table bytes, which are indexed by the converted integer results */
     147          42 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
     148          42 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
     149          42 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
     150             : 
     151             :     /* the three matrix rows are loop-invariant; _mm_load_ps is an aligned load, so this assumes transform->matrix is 16-byte aligned (declared outside this file -- confirm) */
     152          42 :     const __m128 mat0  = _mm_load_ps(mat[0]);
     153          84 :     const __m128 mat1  = _mm_load_ps(mat[1]);
     154          84 :     const __m128 mat2  = _mm_load_ps(mat[2]);
     155             : 
     156             :     /* loop-invariant clamp bounds (0 and CLAMPMAXVAL) and scale factor (FLOATSCALE) */
     157          42 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
     158          42 :     const __m128 min   = _mm_setzero_ps();
     159          42 :     const __m128 scale = _mm_load_ps(floatScaleX4);
     160             : 
     161             :     /* working variables */
     162             :     __m128 vec_r, vec_g, vec_b, result;
     163             :     unsigned char alpha; /* alpha byte of the pixel currently being converted */
     164             : 
     165             :     /* nothing to convert; this also keeps the length-- below from wrapping around */
     166          42 :     if (!length)
     167           0 :         return;
     168             : 
     169             :     /* one pixel is handled outside of the loop, so the loop's preload of the next pixel never reads past the last pixel */
     170          42 :     length--;
     171             : 
     172             :     /* setup for transforming 1st pixel: load its gamma table values into the low lane of each vector and remember its alpha */
     173          84 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     174          84 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     175          84 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     176          42 :     alpha = src[3];
     177          42 :     src += 4;
     178             : 
     179             :     /* transform all but final pixel */
     180             : 
     181         502 :     for (i=0; i<length; i++)
     182             :     {
     183             :         /* broadcast each channel's low-lane gamma value to all four lanes */
     184         460 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     185         460 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     186         460 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     187             : 
     188             :         /* gamma * matrix: scale each matrix row by its channel value */
     189         460 :         vec_r = _mm_mul_ps(vec_r, mat0);
     190         460 :         vec_g = _mm_mul_ps(vec_g, mat1);
     191         460 :         vec_b = _mm_mul_ps(vec_b, mat2);
     192             : 
     193             :         /* alpha passes through unchanged: write this pixel's alpha, then read the next pixel's (src already points at the next pixel) */
     194         460 :         dest[OUTPUT_A_INDEX] = alpha;
     195         460 :         alpha   = src[3];
     196             : 
     197             :         /* sum the three products, clamp to [0, CLAMPMAXVAL], then scale by FLOATSCALE */
     198         920 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     199         460 :         vec_r  = _mm_max_ps(min, vec_r);
     200         460 :         vec_r  = _mm_min_ps(max, vec_r);
     201         460 :         result = _mm_mul_ps(vec_r, scale);
     202             : 
     203             :         /* convert to 32-bit integers and store all four lanes to the aligned scratch slot */
     204         460 :         _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
     205             : 
     206             :         /* load the next pixel's gamma values (the final pixel on the last pass) while the store completes */
     207         920 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     208         920 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     209         920 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     210         460 :         src += 4;
     211             : 
     212             :         /* scratch lanes 0..2 index the R, G and B output tables; lane 3 is never read */
     213         460 :         dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
     214         460 :         dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     215         460 :         dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     216         460 :         dest += RGBA_OUTPUT_COMPONENTS;
     217             :     }
     218             : 
     219             :     /* handle final (maybe only) pixel: same steps as the loop body, without the preload or pointer advance */
     220             : 
     221          42 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     222          42 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     223          42 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     224             : 
     225          42 :     vec_r = _mm_mul_ps(vec_r, mat0);
     226          42 :     vec_g = _mm_mul_ps(vec_g, mat1);
     227          42 :     vec_b = _mm_mul_ps(vec_b, mat2);
     228             : 
     229          42 :     dest[OUTPUT_A_INDEX] = alpha;
     230             : 
     231          84 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     232          42 :     vec_r  = _mm_max_ps(min, vec_r);
     233          42 :     vec_r  = _mm_min_ps(max, vec_r);
     234          42 :     result = _mm_mul_ps(vec_r, scale);
     235             : 
     236          42 :     _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
     237             : 
     238          42 :     dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
     239          42 :     dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     240          42 :     dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     241             : }
     242             : 
     243             : 

Generated by: LCOV version 1.13