LCOV - code coverage report
Current view: top level - gfx/qcms - transform-sse1.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 126 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 2 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : #include <xmmintrin.h>
       2             : 
       3             : #include "qcmsint.h"
       4             : 
       5             : /* Constants replicated across all four lanes so a single aligned load fills an XMM register, instead of a load-scalar/shufps sequence */
       6             : #define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE) /* factor that turns a clamped value into an output-table index */
       7             : #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) /* upper clamp: scaled by FLOATSCALE it gives PRECACHE_OUTPUT_SIZE - 1 */
       8             : static const ALIGN float floatScaleX4[4] =
       9             :     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
      10             : static const ALIGN float clampMaxValueX4[4] =
      11             :     { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; /* both tables are read with _mm_load_ps; ALIGN presumably gives the 16-byte alignment that needs -- confirm in qcmsint.h */
      12             : 
      13           0 : void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform, /* supplies the matrix, the input gamma tables and the output tables */
      14             :                                           unsigned char *src,  /* packed input; bytes 0/1/2 of each pixel index the r/g/b gamma tables, 3 bytes consumed per pixel */
      15             :                                           unsigned char *dest, /* output; advanced by RGB_OUTPUT_COMPONENTS bytes per pixel */
      16             :                                           size_t length)       /* number of pixels to convert; 0 is a no-op */
      17             : { /* Per pixel: gamma lookup per channel, multiply by the matrix rows and sum, clamp, scale, then look the result up in the output tables. */
      18             :     size_t i; /* same width as length, so the counter cannot wrap before reaching the bound */
      19           0 :     float (*mat)[4] = transform->matrix;
      20             :     char input_back[32];
      21             :     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
      22             :      * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
      23             :      * because they don't work on stack variables. gcc 4.4 does do the right thing
      24             :      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
      25           0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); /* rounds down into input_back[1..16], leaving 16 usable bytes */
      26             :     /* share input and output locations to save having to keep the
      27             :      * locations in separate registers */
      28           0 :     uint32_t const * output = (uint32_t*)input;
      29             : 
      30             :     /* deref *transform now to avoid it in loop */
      31           0 :     const float *igtbl_r = transform->input_gamma_table_r;
      32           0 :     const float *igtbl_g = transform->input_gamma_table_g;
      33           0 :     const float *igtbl_b = transform->input_gamma_table_b;
      34             : 
      35             :     /* deref *transform now to avoid it in loop */
      36           0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
      37           0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
      38           0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
      39             : 
      40             :     /* input matrix values never change */
      41           0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
      42           0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
      43           0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
      44             : 
      45             :     /* these values don't change, either */
      46           0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
      47           0 :     const __m128 min   = _mm_setzero_ps();
      48           0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
      49             : 
      50             :     /* working variables */
      51             :     __m128 vec_r, vec_g, vec_b, result;
      52             : 
      53             :     /* nothing to convert for an empty buffer */
      54           0 :     if (!length)
      55           0 :         return;
      56             : 
      57             :     /* one pixel is handled outside of the loop */
      58           0 :     length--;
      59             : 
      60             :     /* setup for transforming 1st pixel */
      61           0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      62           0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      63           0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      64           0 :     src += 3;
      65             : 
      66             :     /* transform all but final pixel */
      67             : 
      68           0 :     for (i=0; i<length; i++)
      69             :     {
      70             :         /* broadcast each gamma value from lane 0 to all four lanes */
      71           0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
      72           0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
      73           0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
      74             : 
      75             :         /* gamma * matrix */
      76           0 :         vec_r = _mm_mul_ps(vec_r, mat0);
      77           0 :         vec_g = _mm_mul_ps(vec_g, mat1);
      78           0 :         vec_b = _mm_mul_ps(vec_b, mat2);
      79             : 
      80             :         /* sum the three products, clamp to [0, CLAMPMAXVAL], scale to table indices */
      81           0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
      82           0 :         vec_r  = _mm_max_ps(min, vec_r);
      83           0 :         vec_r  = _mm_min_ps(max, vec_r);
      84           0 :         result = _mm_mul_ps(vec_r, scale);
      85             : 
      86             :         /* store calc'd output tables indices */
      87           0 :         *((__m64 *)&output[0]) = _mm_cvtps_pi32(result); /* lanes 0-1 -> two int32 */
      88           0 :         result = _mm_movehl_ps(result, result);          /* bring lanes 2-3 down to 0-1 */
      89           0 :         *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ;
      90             : 
      91             :         /* load for next loop while store completes */
      92           0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      93           0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      94           0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      95           0 :         src += 3;
      96             : 
      97             :         /* use calc'd indices to output RGB values */
      98           0 :         dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
      99           0 :         dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     100           0 :         dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     101           0 :         dest += RGB_OUTPUT_COMPONENTS;
     102             :     }
     103             : 
     104             :     /* handle final (maybe only) pixel; its gamma values are already loaded, so nothing more is read from src */
     105             : 
     106           0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     107           0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     108           0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     109             : 
     110           0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     111           0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     112           0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     113             : 
     114           0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     115           0 :     vec_r  = _mm_max_ps(min, vec_r);
     116           0 :     vec_r  = _mm_min_ps(max, vec_r);
     117           0 :     result = _mm_mul_ps(vec_r, scale);
     118             : 
     119           0 :     *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
     120           0 :     result = _mm_movehl_ps(result, result);
     121           0 :     *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
     122             : 
     123           0 :     dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
     124           0 :     dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     125           0 :     dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     126             : 
     127             :     _mm_empty(); /* reset the MMX state used by the __m64 conversions before returning */
     128             : }
     129             : 
     130           0 : void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform, /* supplies the matrix, the input gamma tables and the output tables */
     131             :                                            unsigned char *src,  /* packed input; bytes 0/1/2 index the r/g/b gamma tables, byte 3 is alpha, 4 bytes consumed per pixel */
     132             :                                            unsigned char *dest, /* output; advanced by 4 bytes per pixel */
     133             :                                            size_t length)       /* number of pixels to convert; 0 is a no-op */
     134             : { /* Per pixel: gamma lookup per channel, multiply by the matrix rows and sum, clamp, scale, then look the result up in the output tables; alpha is copied as is. */
     135             :     size_t i; /* same width as length, so the counter cannot wrap before reaching the bound */
     136           0 :     float (*mat)[4] = transform->matrix;
     137             :     char input_back[32];
     138             :     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     139             :      * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     140             :      * because they don't work on stack variables. gcc 4.4 does do the right thing
     141             :      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
     142           0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); /* rounds down into input_back[1..16], leaving 16 usable bytes */
     143             :     /* share input and output locations to save having to keep the
     144             :      * locations in separate registers */
     145           0 :     uint32_t const * output = (uint32_t*)input;
     146             : 
     147             :     /* deref *transform now to avoid it in loop */
     148           0 :     const float *igtbl_r = transform->input_gamma_table_r;
     149           0 :     const float *igtbl_g = transform->input_gamma_table_g;
     150           0 :     const float *igtbl_b = transform->input_gamma_table_b;
     151             : 
     152             :     /* deref *transform now to avoid it in loop */
     153           0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
     154           0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
     155           0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
     156             : 
     157             :     /* input matrix values never change */
     158           0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
     159           0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
     160           0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
     161             : 
     162             :     /* these values don't change, either */
     163           0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
     164           0 :     const __m128 min   = _mm_setzero_ps();
     165           0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
     166             : 
     167             :     /* working variables */
     168             :     __m128 vec_r, vec_g, vec_b, result;
     169             :     unsigned char alpha;
     170             : 
     171             :     /* nothing to convert for an empty buffer */
     172           0 :     if (!length)
     173           0 :         return;
     174             : 
     175             :     /* one pixel is handled outside of the loop */
     176           0 :     length--;
     177             : 
     178             :     /* setup for transforming 1st pixel */
     179           0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     180           0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     181           0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     182           0 :     alpha = src[3]; /* held until the matching dest pixel is written */
     183           0 :     src += 4;
     184             : 
     185             :     /* transform all but final pixel */
     186             : 
     187           0 :     for (i=0; i<length; i++)
     188             :     {
     189             :         /* broadcast each gamma value from lane 0 to all four lanes */
     190           0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     191           0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     192           0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     193             : 
     194             :         /* gamma * matrix */
     195           0 :         vec_r = _mm_mul_ps(vec_r, mat0);
     196           0 :         vec_g = _mm_mul_ps(vec_g, mat1);
     197           0 :         vec_b = _mm_mul_ps(vec_b, mat2);
     198             : 
     199             :         /* store alpha for this pixel; load alpha for next */
     200           0 :         dest[OUTPUT_A_INDEX] = alpha;
     201           0 :         alpha   = src[3];
     202             : 
     203             :         /* sum the three products, clamp to [0, CLAMPMAXVAL], scale to table indices */
     204           0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     205           0 :         vec_r  = _mm_max_ps(min, vec_r);
     206           0 :         vec_r  = _mm_min_ps(max, vec_r);
     207           0 :         result = _mm_mul_ps(vec_r, scale);
     208             : 
     209             :         /* store calc'd output tables indices */
     210           0 :         *((__m64 *)&output[0]) = _mm_cvtps_pi32(result); /* lanes 0-1 -> two int32 */
     211           0 :         result = _mm_movehl_ps(result, result);          /* bring lanes 2-3 down to 0-1 */
     212           0 :         *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
     213             : 
     214             :         /* load gamma values for next loop while store completes */
     215           0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     216           0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     217           0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     218           0 :         src += 4;
     219             : 
     220             :         /* use calc'd indices to output RGB values */
     221           0 :         dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
     222           0 :         dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     223           0 :         dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     224           0 :         dest += 4;
     225             :     }
     226             : 
     227             :     /* handle final (maybe only) pixel; its gamma values and alpha are already loaded, so nothing more is read from src */
     228             : 
     229           0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     230           0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     231           0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     232             : 
     233           0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     234           0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     235           0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     236             : 
     237           0 :     dest[OUTPUT_A_INDEX] = alpha;
     238             : 
     239           0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     240           0 :     vec_r  = _mm_max_ps(min, vec_r);
     241           0 :     vec_r  = _mm_min_ps(max, vec_r);
     242           0 :     result = _mm_mul_ps(vec_r, scale);
     243             : 
     244           0 :     *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
     245           0 :     result = _mm_movehl_ps(result, result);
     246           0 :     *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
     247             : 
     248           0 :     dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
     249           0 :     dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
     250           0 :     dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
     251             : 
     252             :     _mm_empty(); /* reset the MMX state used by the __m64 conversions before returning */
     253             : }

Generated by: LCOV version 1.13