#include <emmintrin.h>

#include "qcmsint.h"

/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
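/* FLOATSCALE maps a clamped value onto the index range of the precached
 * output tables; CLAMPMAXVAL is (N-1)/N so that, after scaling by N and
 * rounding, an index can never exceed PRECACHE_OUTPUT_SIZE - 1 */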

void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(16))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;
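    /* &input_back[16] is 16 bytes from either end of the 32-byte buffer, so
     * masking off the low four address bits rounds down to a 16-byte boundary
     * that still leaves 16 usable bytes inside input_back; 'output' reads the
     * same memory back once the integer store below has completed */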

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;
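    /* the loop below is software-pipelined: each iteration starts the gamma
     * lookups for the next pixel while the current pixel's indices are being
     * stored, so the first lookup happens before the loop and the last pixel
     * is finished after it */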

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
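        /* a shuffle immediate of 0 broadcasts lane 0 into all four lanes,
         * turning each gamma-corrected channel into a splatted scalar that
         * can scale one matrix column */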

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);
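        /* the sum above is the matrix-vector product
         *   r*mat[0] + g*mat[1] + b*mat[2],
         * clamped to [0, (N-1)/N] and scaled by N so that the conversion
         * below (_mm_cvtps_epi32, round-to-nearest by default) yields
         * in-range table indices; the fourth lane is unused padding */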

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;
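        /* output_table_{r,g,b} are byte LUTs precomputed when the transform
         * was built, so each quantized index maps straight to a final 8-bit
         * device value without any per-pixel gamma evaluation */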

        /* use calc'd indices to output RGB values */
        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
        dest += RGB_OUTPUT_COMPONENTS;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
}

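/* Identical pipeline to the RGB variant above, except that pixels are four
 * bytes wide and the alpha byte is copied through unchanged; like the colour
 * channels, alpha is read one pixel ahead so the copy overlaps the vector
 * math. */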
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(16))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next */
        dest[OUTPUT_A_INDEX] = alpha;
        alpha = src[3];

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
        dest += RGBA_OUTPUT_COMPONENTS;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[OUTPUT_A_INDEX] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
}