Line data Source code
1 : /*
2 : * Copyright © 2008 Rodrigo Kumpera
3 : * Copyright © 2008 André Tupinambá
4 : *
5 : * Permission to use, copy, modify, distribute, and sell this software and its
6 : * documentation for any purpose is hereby granted without fee, provided that
7 : * the above copyright notice appear in all copies and that both that
8 : * copyright notice and this permission notice appear in supporting
9 : * documentation, and that the name of Red Hat not be used in advertising or
10 : * publicity pertaining to distribution of the software without specific,
11 : * written prior permission. Red Hat makes no representations about the
12 : * suitability of this software for any purpose. It is provided "as is"
13 : * without express or implied warranty.
14 : *
15 : * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 : * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 : * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 : * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 : * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 : * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 : * SOFTWARE.
23 : *
24 : * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 : * André Tupinambá (andrelrt@gmail.com)
26 : *
27 : * Based on work by Owen Taylor and Søren Sandmann
28 : */
29 : #ifdef HAVE_CONFIG_H
30 : #include <config.h>
31 : #endif
32 :
33 : #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
34 : #include <emmintrin.h> /* for SSE2 intrinsics */
35 : #include "pixman-private.h"
36 : #include "pixman-combine32.h"
37 : #include "pixman-inlines.h"
38 :
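/* 128-bit constants shared by the helpers below. They are declared here
 * but filled in elsewhere (presumably once, when the SSE2 implementation
 * is created, outside this section). From their uses in this file:
 * mask_00ff holds 0x00ff, and mask_0080/mask_0101 hold 0x0080/0x0101,
 * in every 16-bit lane (they drive the divide-by-255 trick in
 * pix_multiply_*); mask_ff000000 selects the alpha byte of each
 * a8r8g8b8 pixel.
 */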
39 : static __m128i mask_0080;
40 : static __m128i mask_00ff;
41 : static __m128i mask_0101;
42 : static __m128i mask_ffff;
43 : static __m128i mask_ff000000;
44 : static __m128i mask_alpha;
45 :
46 : static __m128i mask_565_r;
47 : static __m128i mask_565_g1, mask_565_g2;
48 : static __m128i mask_565_b;
49 : static __m128i mask_red;
50 : static __m128i mask_green;
51 : static __m128i mask_blue;
52 :
53 : static __m128i mask_565_fix_rb;
54 : static __m128i mask_565_fix_g;
55 :
56 : static __m128i mask_565_rb;
57 : static __m128i mask_565_pack_multiplier;
58 :
59 : static force_inline __m128i
60 : unpack_32_1x128 (uint32_t data)
61 : {
62 0 : return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
63 : }
64 :
65 : static force_inline void
66 : unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
67 : {
68 0 : *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
69 0 : *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
70 : }
71 :
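/* Expand r5g6b5 to 8888 in each 32-bit lane, replicating the top bits
 * of every field into the freshly opened low bits so that 0x1f widens
 * to 0xff rather than 0xf8. A scalar sketch of the same computation:
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 *
 * mask_565_fix_rb and mask_565_fix_g select the bits to replicate.
 */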
72 : static force_inline __m128i
73 : unpack_565_to_8888 (__m128i lo)
74 : {
75 : __m128i r, g, b, rb, t;
76 :
77 0 : r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
78 0 : g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
79 0 : b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
80 :
81 0 : rb = _mm_or_si128 (r, b);
82 0 : t = _mm_and_si128 (rb, mask_565_fix_rb);
83 0 : t = _mm_srli_epi32 (t, 5);
84 0 : rb = _mm_or_si128 (rb, t);
85 :
86 0 : t = _mm_and_si128 (g, mask_565_fix_g);
87 0 : t = _mm_srli_epi32 (t, 6);
88 0 : g = _mm_or_si128 (g, t);
89 :
90 0 : return _mm_or_si128 (rb, g);
91 : }
92 :
93 : static force_inline void
94 : unpack_565_128_4x128 (__m128i data,
95 : __m128i* data0,
96 : __m128i* data1,
97 : __m128i* data2,
98 : __m128i* data3)
99 : {
100 : __m128i lo, hi;
101 :
102 0 : lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
103 0 : hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
104 :
105 0 : lo = unpack_565_to_8888 (lo);
106 0 : hi = unpack_565_to_8888 (hi);
107 :
108 : unpack_128_2x128 (lo, data0, data1);
109 : unpack_128_2x128 (hi, data2, data3);
110 : }
111 :
112 : static force_inline uint16_t
113 : pack_565_32_16 (uint32_t pixel)
114 : {
115 0 : return (uint16_t) (((pixel >> 8) & 0xf800) |
116 0 : ((pixel >> 5) & 0x07e0) |
117 0 : ((pixel >> 3) & 0x001f));
118 : }
119 :
120 : static force_inline __m128i
121 : pack_2x128_128 (__m128i lo, __m128i hi)
122 : {
123 0 : return _mm_packus_epi16 (lo, hi);
124 : }
125 :
126 : static force_inline __m128i
127 : pack_565_2packedx128_128 (__m128i lo, __m128i hi)
128 : {
129 0 : __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
130 0 : __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
131 :
132 0 : __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
133 0 : __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
134 :
135 0 : __m128i g0 = _mm_and_si128 (lo, mask_green);
136 0 : __m128i g1 = _mm_and_si128 (hi, mask_green);
137 :
138 0 : t0 = _mm_or_si128 (t0, g0);
139 0 : t1 = _mm_or_si128 (t1, g1);
140 :
141 : /* Simulate _mm_packus_epi32 (SSE4.1 only): the 565 result sits 5 bits high here, so shift it to the top of the lane, arithmetic-shift back to sign-extend, and let the signed pack pass the 16 bits through unchanged */
142 0 : t0 = _mm_slli_epi32 (t0, 16 - 5);
143 0 : t1 = _mm_slli_epi32 (t1, 16 - 5);
144 0 : t0 = _mm_srai_epi32 (t0, 16);
145 0 : t1 = _mm_srai_epi32 (t1, 16);
146 0 : return _mm_packs_epi32 (t0, t1);
147 : }
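/* pack_565_2packedx128_128 keeps the pixels packed: _mm_madd_epi16
 * multiplies the top 5 bits of red and blue by per-lane constants
 * (mask_565_pack_multiplier, presumably chosen so both products land in
 * their 565 slots of a 32-bit lane in one instruction), green is OR'ed
 * in separately, and the shift/pack tail emulates the missing SSE4.1
 * _mm_packus_epi32 as noted above.
 */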
148 :
149 : static force_inline __m128i
150 : pack_565_2x128_128 (__m128i lo, __m128i hi)
151 : {
152 : __m128i data;
153 : __m128i r, g1, g2, b;
154 :
155 0 : data = pack_2x128_128 (lo, hi);
156 :
157 0 : r = _mm_and_si128 (data, mask_565_r);
158 0 : g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 0 : g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 0 : b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161 :
162 0 : return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 : }
164 :
165 : static force_inline __m128i
166 : pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 : {
168 0 : return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 : pack_565_2x128_128 (*xmm2, *xmm3));
170 : }
171 :
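/* _mm_movemask_epi8 returns one bit per byte. In a8r8g8b8 pixels the
 * alpha bytes sit at byte offsets 3, 7, 11 and 15, so masking the
 * result with 0x8888 inspects exactly the four alpha channels:
 * is_opaque tests them all against 0xff, is_transparent against 0x00,
 * and is_zero checks the entire vector.
 */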
172 : static force_inline int
173 : is_opaque (__m128i x)
174 : {
175 0 : __m128i ffs = _mm_cmpeq_epi8 (x, x);
176 :
177 0 : return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 : }
179 :
180 : static force_inline int
181 : is_zero (__m128i x)
182 : {
183 0 : return _mm_movemask_epi8 (
184 0 : _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 : }
186 :
187 : static force_inline int
188 : is_transparent (__m128i x)
189 : {
190 0 : return (_mm_movemask_epi8 (
191 0 : _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 : }
193 :
194 : static force_inline __m128i
195 : expand_pixel_32_1x128 (uint32_t data)
196 : {
197 0 : return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 : }
199 :
200 : static force_inline __m128i
201 : expand_alpha_1x128 (__m128i data)
202 : {
203 0 : return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 : _MM_SHUFFLE (3, 3, 3, 3)),
205 : _MM_SHUFFLE (3, 3, 3, 3));
206 : }
207 :
208 : static force_inline void
209 : expand_alpha_2x128 (__m128i data_lo,
210 : __m128i data_hi,
211 : __m128i* alpha_lo,
212 : __m128i* alpha_hi)
213 : {
214 : __m128i lo, hi;
215 :
216 0 : lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 0 : hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218 :
219 0 : *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 0 : *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 : }
222 :
223 : static force_inline void
224 : expand_alpha_rev_2x128 (__m128i data_lo,
225 : __m128i data_hi,
226 : __m128i* alpha_lo,
227 : __m128i* alpha_hi)
228 : {
229 : __m128i lo, hi;
230 :
231 0 : lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 0 : hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 0 : *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 0 : *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 : }
236 :
237 : static force_inline void
238 : pix_multiply_2x128 (__m128i* data_lo,
239 : __m128i* data_hi,
240 : __m128i* alpha_lo,
241 : __m128i* alpha_hi,
242 : __m128i* ret_lo,
243 : __m128i* ret_hi)
244 : {
245 : __m128i lo, hi;
246 :
247 0 : lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 0 : hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 0 : lo = _mm_adds_epu16 (lo, mask_0080);
250 0 : hi = _mm_adds_epu16 (hi, mask_0080);
251 0 : *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 0 : *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 : }
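/* pix_multiply_* compute the exact, correctly rounded per-channel
 * product x * a / 255 without a division, using the classic identity
 *
 *     t = x * a;
 *     result = ((t + 0x80) * 0x101) >> 16;    // == t / 255, rounded
 *
 * mask_0080 supplies the +0x80 bias and the _mm_mulhi_epu16 with
 * mask_0101 performs the * 0x101 >> 16 step in a single instruction.
 */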
254 :
255 : static force_inline void
256 : pix_add_multiply_2x128 (__m128i* src_lo,
257 : __m128i* src_hi,
258 : __m128i* alpha_dst_lo,
259 : __m128i* alpha_dst_hi,
260 : __m128i* dst_lo,
261 : __m128i* dst_hi,
262 : __m128i* alpha_src_lo,
263 : __m128i* alpha_src_hi,
264 : __m128i* ret_lo,
265 : __m128i* ret_hi)
266 : {
267 : __m128i t1_lo, t1_hi;
268 : __m128i t2_lo, t2_hi;
269 :
270 : pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271 : pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272 :
273 0 : *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274 0 : *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 : }
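/* Computes src * alpha_dst + dst * alpha_src with unsigned saturation;
 * this is the common shape of the ATOP and XOR combiners further down,
 * which differ only in which alpha terms are negated beforehand.
 */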
276 :
277 : static force_inline void
278 : negate_2x128 (__m128i data_lo,
279 : __m128i data_hi,
280 : __m128i* neg_lo,
281 : __m128i* neg_hi)
282 : {
283 0 : *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284 0 : *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 : }
286 :
287 : static force_inline void
288 : invert_colors_2x128 (__m128i data_lo,
289 : __m128i data_hi,
290 : __m128i* inv_lo,
291 : __m128i* inv_hi)
292 : {
293 : __m128i lo, hi;
294 :
295 0 : lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296 0 : hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297 0 : *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298 0 : *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 : }
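/* The (3, 0, 1, 2) shuffle swaps the red and blue 16-bit lanes within
 * each pixel while leaving green and alpha in place, converting between
 * ARGB and ABGR channel order for the *_rev_non_pre paths below.
 */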
300 :
301 : static force_inline void
302 : over_2x128 (__m128i* src_lo,
303 : __m128i* src_hi,
304 : __m128i* alpha_lo,
305 : __m128i* alpha_hi,
306 : __m128i* dst_lo,
307 : __m128i* dst_hi)
308 : {
309 : __m128i t1, t2;
310 :
311 0 : negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312 :
313 : pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314 :
315 0 : *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316 0 : *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 : }
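/* Porter-Duff OVER for premultiplied pixels:
 *
 *     dst = src + (1 - alpha_src) * dst
 *
 * negate_2x128 turns each alpha into 255 - a, pix_multiply_2x128
 * renormalizes by 255, and the final per-byte add saturates.
 */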
318 :
319 : static force_inline void
320 : over_rev_non_pre_2x128 (__m128i src_lo,
321 : __m128i src_hi,
322 : __m128i* dst_lo,
323 : __m128i* dst_hi)
324 : {
325 : __m128i lo, hi;
326 : __m128i alpha_lo, alpha_hi;
327 :
328 0 : expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329 :
330 0 : lo = _mm_or_si128 (alpha_lo, mask_alpha);
331 0 : hi = _mm_or_si128 (alpha_hi, mask_alpha);
332 :
333 0 : invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334 :
335 : pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336 :
337 : over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 : }
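/* OVER with a non-premultiplied source: the color channels are first
 * premultiplied by the source alpha (mask_alpha, presumably all ones in
 * the alpha lane, keeps the alpha byte itself unscaled), with
 * invert_colors_2x128 swapping red and blue for the reversed channel
 * order these paths handle; an ordinary OVER then finishes the job.
 */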
339 :
340 : static force_inline void
341 : in_over_2x128 (__m128i* src_lo,
342 : __m128i* src_hi,
343 : __m128i* alpha_lo,
344 : __m128i* alpha_hi,
345 : __m128i* mask_lo,
346 : __m128i* mask_hi,
347 : __m128i* dst_lo,
348 : __m128i* dst_hi)
349 : {
350 : __m128i s_lo, s_hi;
351 : __m128i a_lo, a_hi;
352 :
353 : pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354 : pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355 :
356 : over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 : }
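/* (src IN mask) OVER dst, the core of masked compositing:
 *
 *     dst = src * mask + (1 - alpha_src * mask) * dst
 */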
358 :
359 : /* load 4 pixels from a 16-byte aligned address */
360 : static force_inline __m128i
361 : load_128_aligned (__m128i* src)
362 : {
363 0 : return _mm_load_si128 (src);
364 : }
365 :
366 : /* load 4 pixels from an unaligned address */
367 : static force_inline __m128i
368 : load_128_unaligned (const __m128i* src)
369 : {
370 5632 : return _mm_loadu_si128 (src);
371 : }
372 :
373 : /* save 4 pixels to write-combining memory at a 16-byte
374 : * aligned address
375 : */
376 : static force_inline void
377 : save_128_write_combining (__m128i* dst,
378 : __m128i data)
379 : {
380 : _mm_stream_si128 (dst, data);
381 : }
382 :
383 : /* save 4 pixels at a 16-byte aligned address */
384 : static force_inline void
385 : save_128_aligned (__m128i* dst,
386 : __m128i data)
387 : {
388 : _mm_store_si128 (dst, data);
389 : }
390 :
391 : /* save 4 pixels at an unaligned address */
392 : static force_inline void
393 : save_128_unaligned (__m128i* dst,
394 : __m128i data)
395 : {
396 : _mm_storeu_si128 (dst, data);
397 : }
398 :
399 : static force_inline __m128i
400 : load_32_1x128 (uint32_t data)
401 : {
402 0 : return _mm_cvtsi32_si128 (data);
403 : }
404 :
405 : static force_inline __m128i
406 : expand_alpha_rev_1x128 (__m128i data)
407 : {
408 0 : return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
409 : }
410 :
411 : static force_inline __m128i
412 : expand_pixel_8_1x128 (uint8_t data)
413 : {
414 0 : return _mm_shufflelo_epi16 (
415 : unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
416 : }
417 :
418 : static force_inline __m128i
419 : pix_multiply_1x128 (__m128i data,
420 : __m128i alpha)
421 : {
422 0 : return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
423 : mask_0080),
424 : mask_0101);
425 : }
426 :
427 : static force_inline __m128i
428 : pix_add_multiply_1x128 (__m128i* src,
429 : __m128i* alpha_dst,
430 : __m128i* dst,
431 : __m128i* alpha_src)
432 : {
433 0 : __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
434 0 : __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
435 :
436 0 : return _mm_adds_epu8 (t1, t2);
437 : }
438 :
439 : static force_inline __m128i
440 : negate_1x128 (__m128i data)
441 : {
442 0 : return _mm_xor_si128 (data, mask_00ff);
443 : }
444 :
445 : static force_inline __m128i
446 : invert_colors_1x128 (__m128i data)
447 : {
448 0 : return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
449 : }
450 :
451 : static force_inline __m128i
452 : over_1x128 (__m128i src, __m128i alpha, __m128i dst)
453 : {
454 0 : return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
455 : }
456 :
457 : static force_inline __m128i
458 : in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
459 : {
460 0 : return over_1x128 (pix_multiply_1x128 (*src, *mask),
461 : pix_multiply_1x128 (*alpha, *mask),
462 : *dst);
463 : }
464 :
465 : static force_inline __m128i
466 : over_rev_non_pre_1x128 (__m128i src, __m128i dst)
467 : {
468 0 : __m128i alpha = expand_alpha_1x128 (src);
469 :
470 0 : return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
471 : _mm_or_si128 (alpha, mask_alpha)),
472 : alpha,
473 : dst);
474 : }
475 :
476 : static force_inline uint32_t
477 : pack_1x128_32 (__m128i data)
478 : {
479 0 : return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
480 : }
481 :
482 : static force_inline __m128i
483 : expand565_16_1x128 (uint16_t pixel)
484 : {
485 0 : __m128i m = _mm_cvtsi32_si128 (pixel);
486 :
487 0 : m = unpack_565_to_8888 (m);
488 :
489 0 : return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
490 : }
491 :
492 : static force_inline uint32_t
493 : core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
494 : {
495 : uint8_t a;
496 : __m128i xmms;
497 :
498 0 : a = src >> 24;
499 :
500 0 : if (a == 0xff)
501 : {
502 0 : return src;
503 : }
504 0 : else if (src)
505 : {
506 0 : xmms = unpack_32_1x128 (src);
507 0 : return pack_1x128_32 (
508 : over_1x128 (xmms, expand_alpha_1x128 (xmms),
509 : unpack_32_1x128 (dst)));
510 : }
511 :
512 0 : return dst;
513 : }
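/* Three-way fast path for a single OVER: a fully opaque source simply
 * replaces the destination, a zero source leaves it untouched, and only
 * the remaining case pays for the full unpack/blend/pack sequence.
 */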
514 :
515 : static force_inline uint32_t
516 : combine1 (const uint32_t *ps, const uint32_t *pm)
517 : {
518 0 : uint32_t s = *ps;
519 :
520 0 : if (pm)
521 : {
522 : __m128i ms, mm;
523 :
524 0 : mm = unpack_32_1x128 (*pm);
525 0 : mm = expand_alpha_1x128 (mm);
526 :
527 0 : ms = unpack_32_1x128 (s);
528 0 : ms = pix_multiply_1x128 (ms, mm);
529 :
530 0 : s = pack_1x128_32 (ms);
531 : }
532 :
533 0 : return s;
534 : }
535 :
536 : static force_inline __m128i
537 : combine4 (const __m128i *ps, const __m128i *pm)
538 : {
539 : __m128i xmm_src_lo, xmm_src_hi;
540 : __m128i xmm_msk_lo, xmm_msk_hi;
541 : __m128i s;
542 :
543 0 : if (pm)
544 : {
545 0 : xmm_msk_lo = load_128_unaligned (pm);
546 :
547 0 : if (is_transparent (xmm_msk_lo))
548 0 : return _mm_setzero_si128 ();
549 : }
550 :
551 0 : s = load_128_unaligned (ps);
552 :
553 0 : if (pm)
554 : {
555 : unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
556 0 : unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
557 :
558 0 : expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
559 :
560 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
561 : &xmm_msk_lo, &xmm_msk_hi,
562 : &xmm_src_lo, &xmm_src_hi);
563 :
564 0 : s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
565 : }
566 :
567 0 : return s;
568 : }
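/* combine1/combine4 fetch one or four source pixels for the "unified"
 * combiners: with a NULL mask the raw source is returned; otherwise each
 * pixel is pre-scaled by the mask's expanded alpha. combine4 also bails
 * out early with zero when the whole mask group is transparent, skipping
 * the source load entirely.
 */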
569 :
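/* The scanline combiners below all share one shape: a scalar head loop
 * runs until pd reaches a 16-byte boundary, a vector body then handles
 * four pixels per iteration (aligned dst loads/stores, unaligned src and
 * mask loads), and a scalar tail mops up the remainder.
 */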
570 : static force_inline void
571 : core_combine_over_u_sse2_mask (uint32_t * pd,
572 : const uint32_t* ps,
573 : const uint32_t* pm,
574 : int w)
575 : {
576 : uint32_t s, d;
577 :
578 : /* Align dst on a 16-byte boundary */
579 0 : while (w && ((uintptr_t)pd & 15))
580 : {
581 0 : d = *pd;
582 0 : s = combine1 (ps, pm);
583 :
584 0 : if (s)
585 0 : *pd = core_combine_over_u_pixel_sse2 (s, d);
586 0 : pd++;
587 0 : ps++;
588 0 : pm++;
589 0 : w--;
590 : }
591 :
592 0 : while (w >= 4)
593 : {
594 0 : __m128i mask = load_128_unaligned ((__m128i *)pm);
595 :
596 0 : if (!is_zero (mask))
597 : {
598 : __m128i src;
599 : __m128i src_hi, src_lo;
600 : __m128i mask_hi, mask_lo;
601 : __m128i alpha_hi, alpha_lo;
602 :
603 0 : src = load_128_unaligned ((__m128i *)ps);
604 :
605 0 : if (is_opaque (_mm_and_si128 (src, mask)))
606 : {
607 : save_128_aligned ((__m128i *)pd, src);
608 : }
609 : else
610 : {
611 0 : __m128i dst = load_128_aligned ((__m128i *)pd);
612 : __m128i dst_hi, dst_lo;
613 :
614 : unpack_128_2x128 (mask, &mask_lo, &mask_hi);
615 : unpack_128_2x128 (src, &src_lo, &src_hi);
616 :
617 0 : expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
618 : pix_multiply_2x128 (&src_lo, &src_hi,
619 : &mask_lo, &mask_hi,
620 : &src_lo, &src_hi);
621 :
622 : unpack_128_2x128 (dst, &dst_lo, &dst_hi);
623 :
624 0 : expand_alpha_2x128 (src_lo, src_hi,
625 : &alpha_lo, &alpha_hi);
626 :
627 : over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
628 : &dst_lo, &dst_hi);
629 :
630 0 : save_128_aligned (
631 : (__m128i *)pd,
632 : pack_2x128_128 (dst_lo, dst_hi));
633 : }
634 : }
635 :
636 0 : pm += 4;
637 0 : ps += 4;
638 0 : pd += 4;
639 0 : w -= 4;
640 : }
641 0 : while (w)
642 : {
643 0 : d = *pd;
644 0 : s = combine1 (ps, pm);
645 :
646 0 : if (s)
647 0 : *pd = core_combine_over_u_pixel_sse2 (s, d);
648 0 : pd++;
649 0 : ps++;
650 0 : pm++;
651 :
652 0 : w--;
653 : }
654 : }
655 :
656 : static force_inline void
657 : core_combine_over_u_sse2_no_mask (uint32_t * pd,
658 : const uint32_t* ps,
659 : int w)
660 : {
661 : uint32_t s, d;
662 :
663 : /* Align dst on a 16-byte boundary */
664 0 : while (w && ((uintptr_t)pd & 15))
665 : {
666 0 : d = *pd;
667 0 : s = *ps;
668 :
669 0 : if (s)
670 0 : *pd = core_combine_over_u_pixel_sse2 (s, d);
671 0 : pd++;
672 0 : ps++;
673 0 : w--;
674 : }
675 :
676 0 : while (w >= 4)
677 : {
678 : __m128i src;
679 : __m128i src_hi, src_lo, dst_hi, dst_lo;
680 : __m128i alpha_hi, alpha_lo;
681 :
682 0 : src = load_128_unaligned ((__m128i *)ps);
683 :
684 0 : if (!is_zero (src))
685 : {
686 0 : if (is_opaque (src))
687 : {
688 : save_128_aligned ((__m128i *)pd, src);
689 : }
690 : else
691 : {
692 0 : __m128i dst = load_128_aligned ((__m128i *)pd);
693 :
694 : unpack_128_2x128 (src, &src_lo, &src_hi);
695 : unpack_128_2x128 (dst, &dst_lo, &dst_hi);
696 :
697 0 : expand_alpha_2x128 (src_lo, src_hi,
698 : &alpha_lo, &alpha_hi);
699 : over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
700 : &dst_lo, &dst_hi);
701 :
702 0 : save_128_aligned (
703 : (__m128i *)pd,
704 : pack_2x128_128 (dst_lo, dst_hi));
705 : }
706 : }
707 :
708 0 : ps += 4;
709 0 : pd += 4;
710 0 : w -= 4;
711 : }
712 0 : while (w)
713 : {
714 0 : d = *pd;
715 0 : s = *ps;
716 :
717 0 : if (s)
718 0 : *pd = core_combine_over_u_pixel_sse2 (s, d);
719 0 : pd++;
720 0 : ps++;
721 :
722 0 : w--;
723 : }
724 : }
725 :
726 : static force_inline void
727 0 : sse2_combine_over_u (pixman_implementation_t *imp,
728 : pixman_op_t op,
729 : uint32_t * pd,
730 : const uint32_t * ps,
731 : const uint32_t * pm,
732 : int w)
733 : {
734 0 : if (pm)
735 : core_combine_over_u_sse2_mask (pd, ps, pm, w);
736 : else
737 : core_combine_over_u_sse2_no_mask (pd, ps, w);
738 0 : }
739 :
740 : static void
741 0 : sse2_combine_over_reverse_u (pixman_implementation_t *imp,
742 : pixman_op_t op,
743 : uint32_t * pd,
744 : const uint32_t * ps,
745 : const uint32_t * pm,
746 : int w)
747 : {
748 : uint32_t s, d;
749 :
750 : __m128i xmm_dst_lo, xmm_dst_hi;
751 : __m128i xmm_src_lo, xmm_src_hi;
752 : __m128i xmm_alpha_lo, xmm_alpha_hi;
753 :
754 : /* Align dst on a 16-byte boundary */
755 0 : while (w &&
756 0 : ((uintptr_t)pd & 15))
757 : {
758 0 : d = *pd;
759 0 : s = combine1 (ps, pm);
760 :
761 0 : *pd++ = core_combine_over_u_pixel_sse2 (d, s);
762 0 : w--;
763 0 : ps++;
764 0 : if (pm)
765 0 : pm++;
766 : }
767 :
768 0 : while (w >= 4)
769 : {
770 : /* Load the source unaligned: only dst is known to be
771 : * 16-byte aligned at this point.
772 : */
773 0 : xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
774 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
775 :
776 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
777 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
778 :
779 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
780 : &xmm_alpha_lo, &xmm_alpha_hi);
781 :
782 : over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
783 : &xmm_alpha_lo, &xmm_alpha_hi,
784 : &xmm_src_lo, &xmm_src_hi);
785 :
786 : /* rebuild the 4 pixel data and save */
787 0 : save_128_aligned ((__m128i*)pd,
788 : pack_2x128_128 (xmm_src_lo, xmm_src_hi));
789 :
790 0 : w -= 4;
791 0 : ps += 4;
792 0 : pd += 4;
793 :
794 0 : if (pm)
795 0 : pm += 4;
796 : }
797 :
798 0 : while (w)
799 : {
800 0 : d = *pd;
801 0 : s = combine1 (ps, pm);
802 :
803 0 : *pd++ = core_combine_over_u_pixel_sse2 (d, s);
804 0 : ps++;
805 0 : w--;
806 0 : if (pm)
807 0 : pm++;
808 : }
809 0 : }
810 :
811 : static force_inline uint32_t
812 : core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
813 : {
814 0 : uint32_t maska = src >> 24;
815 :
816 0 : if (maska == 0)
817 : {
818 0 : return 0;
819 : }
820 0 : else if (maska != 0xff)
821 : {
822 0 : return pack_1x128_32 (
823 : pix_multiply_1x128 (unpack_32_1x128 (dst),
824 : expand_alpha_1x128 (unpack_32_1x128 (src))));
825 : }
826 :
827 0 : return dst;
828 : }
829 :
830 : static void
831 0 : sse2_combine_in_u (pixman_implementation_t *imp,
832 : pixman_op_t op,
833 : uint32_t * pd,
834 : const uint32_t * ps,
835 : const uint32_t * pm,
836 : int w)
837 : {
838 : uint32_t s, d;
839 :
840 : __m128i xmm_src_lo, xmm_src_hi;
841 : __m128i xmm_dst_lo, xmm_dst_hi;
842 :
843 0 : while (w && ((uintptr_t)pd & 15))
844 : {
845 0 : s = combine1 (ps, pm);
846 0 : d = *pd;
847 :
848 0 : *pd++ = core_combine_in_u_pixel_sse2 (d, s);
849 0 : w--;
850 0 : ps++;
851 0 : if (pm)
852 0 : pm++;
853 : }
854 :
855 0 : while (w >= 4)
856 : {
857 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858 0 : xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
859 :
860 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
862 :
863 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865 : &xmm_dst_lo, &xmm_dst_hi,
866 : &xmm_dst_lo, &xmm_dst_hi);
867 :
868 0 : save_128_aligned ((__m128i*)pd,
869 : pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
870 :
871 0 : ps += 4;
872 0 : pd += 4;
873 0 : w -= 4;
874 0 : if (pm)
875 0 : pm += 4;
876 : }
877 :
878 0 : while (w)
879 : {
880 0 : s = combine1 (ps, pm);
881 0 : d = *pd;
882 :
883 0 : *pd++ = core_combine_in_u_pixel_sse2 (d, s);
884 0 : w--;
885 0 : ps++;
886 0 : if (pm)
887 0 : pm++;
888 : }
889 0 : }
890 :
891 : static void
892 0 : sse2_combine_in_reverse_u (pixman_implementation_t *imp,
893 : pixman_op_t op,
894 : uint32_t * pd,
895 : const uint32_t * ps,
896 : const uint32_t * pm,
897 : int w)
898 : {
899 : uint32_t s, d;
900 :
901 : __m128i xmm_src_lo, xmm_src_hi;
902 : __m128i xmm_dst_lo, xmm_dst_hi;
903 :
904 0 : while (w && ((uintptr_t)pd & 15))
905 : {
906 0 : s = combine1 (ps, pm);
907 0 : d = *pd;
908 :
909 0 : *pd++ = core_combine_in_u_pixel_sse2 (s, d);
910 0 : ps++;
911 0 : w--;
912 0 : if (pm)
913 0 : pm++;
914 : }
915 :
916 0 : while (w >= 4)
917 : {
918 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
919 0 : xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
920 :
921 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
922 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
923 :
924 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
925 : pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
926 : &xmm_src_lo, &xmm_src_hi,
927 : &xmm_dst_lo, &xmm_dst_hi);
928 :
929 0 : save_128_aligned (
930 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
931 :
932 0 : ps += 4;
933 0 : pd += 4;
934 0 : w -= 4;
935 0 : if (pm)
936 0 : pm += 4;
937 : }
938 :
939 0 : while (w)
940 : {
941 0 : s = combine1 (ps, pm);
942 0 : d = *pd;
943 :
944 0 : *pd++ = core_combine_in_u_pixel_sse2 (s, d);
945 0 : w--;
946 0 : ps++;
947 0 : if (pm)
948 0 : pm++;
949 : }
950 0 : }
951 :
952 : static void
953 0 : sse2_combine_out_reverse_u (pixman_implementation_t *imp,
954 : pixman_op_t op,
955 : uint32_t * pd,
956 : const uint32_t * ps,
957 : const uint32_t * pm,
958 : int w)
959 : {
960 0 : while (w && ((uintptr_t)pd & 15))
961 : {
962 0 : uint32_t s = combine1 (ps, pm);
963 0 : uint32_t d = *pd;
964 :
965 0 : *pd++ = pack_1x128_32 (
966 : pix_multiply_1x128 (
967 : unpack_32_1x128 (d), negate_1x128 (
968 : expand_alpha_1x128 (unpack_32_1x128 (s)))));
969 :
970 0 : if (pm)
971 0 : pm++;
972 0 : ps++;
973 0 : w--;
974 : }
975 :
976 0 : while (w >= 4)
977 : {
978 : __m128i xmm_src_lo, xmm_src_hi;
979 : __m128i xmm_dst_lo, xmm_dst_hi;
980 :
981 0 : xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
982 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
983 :
984 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
985 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
986 :
987 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988 0 : negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989 :
990 : pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
991 : &xmm_src_lo, &xmm_src_hi,
992 : &xmm_dst_lo, &xmm_dst_hi);
993 :
994 0 : save_128_aligned (
995 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
996 :
997 0 : ps += 4;
998 0 : pd += 4;
999 0 : if (pm)
1000 0 : pm += 4;
1001 :
1002 0 : w -= 4;
1003 : }
1004 :
1005 0 : while (w)
1006 : {
1007 0 : uint32_t s = combine1 (ps, pm);
1008 0 : uint32_t d = *pd;
1009 :
1010 0 : *pd++ = pack_1x128_32 (
1011 : pix_multiply_1x128 (
1012 : unpack_32_1x128 (d), negate_1x128 (
1013 : expand_alpha_1x128 (unpack_32_1x128 (s)))));
1014 0 : ps++;
1015 0 : if (pm)
1016 0 : pm++;
1017 0 : w--;
1018 : }
1019 0 : }
1020 :
1021 : static void
1022 0 : sse2_combine_out_u (pixman_implementation_t *imp,
1023 : pixman_op_t op,
1024 : uint32_t * pd,
1025 : const uint32_t * ps,
1026 : const uint32_t * pm,
1027 : int w)
1028 : {
1029 0 : while (w && ((uintptr_t)pd & 15))
1030 : {
1031 0 : uint32_t s = combine1 (ps, pm);
1032 0 : uint32_t d = *pd;
1033 :
1034 0 : *pd++ = pack_1x128_32 (
1035 : pix_multiply_1x128 (
1036 : unpack_32_1x128 (s), negate_1x128 (
1037 : expand_alpha_1x128 (unpack_32_1x128 (d)))));
1038 0 : w--;
1039 0 : ps++;
1040 0 : if (pm)
1041 0 : pm++;
1042 : }
1043 :
1044 0 : while (w >= 4)
1045 : {
1046 : __m128i xmm_src_lo, xmm_src_hi;
1047 : __m128i xmm_dst_lo, xmm_dst_hi;
1048 :
1049 0 : xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1050 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1051 :
1052 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1053 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1054 :
1055 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1056 0 : negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1057 :
1058 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1059 : &xmm_dst_lo, &xmm_dst_hi,
1060 : &xmm_dst_lo, &xmm_dst_hi);
1061 :
1062 0 : save_128_aligned (
1063 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1064 :
1065 0 : ps += 4;
1066 0 : pd += 4;
1067 0 : w -= 4;
1068 0 : if (pm)
1069 0 : pm += 4;
1070 : }
1071 :
1072 0 : while (w)
1073 : {
1074 0 : uint32_t s = combine1 (ps, pm);
1075 0 : uint32_t d = *pd;
1076 :
1077 0 : *pd++ = pack_1x128_32 (
1078 : pix_multiply_1x128 (
1079 : unpack_32_1x128 (s), negate_1x128 (
1080 : expand_alpha_1x128 (unpack_32_1x128 (d)))));
1081 0 : w--;
1082 0 : ps++;
1083 0 : if (pm)
1084 0 : pm++;
1085 : }
1086 0 : }
1087 :
1088 : static force_inline uint32_t
1089 : core_combine_atop_u_pixel_sse2 (uint32_t src,
1090 : uint32_t dst)
1091 : {
1092 0 : __m128i s = unpack_32_1x128 (src);
1093 0 : __m128i d = unpack_32_1x128 (dst);
1094 :
1095 0 : __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1096 0 : __m128i da = expand_alpha_1x128 (d);
1097 :
1098 0 : return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1099 : }
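/* ATOP: dst = src * alpha_dst + dst * (1 - alpha_src) */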
1100 :
1101 : static void
1102 0 : sse2_combine_atop_u (pixman_implementation_t *imp,
1103 : pixman_op_t op,
1104 : uint32_t * pd,
1105 : const uint32_t * ps,
1106 : const uint32_t * pm,
1107 : int w)
1108 : {
1109 : uint32_t s, d;
1110 :
1111 : __m128i xmm_src_lo, xmm_src_hi;
1112 : __m128i xmm_dst_lo, xmm_dst_hi;
1113 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1114 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1115 :
1116 0 : while (w && ((uintptr_t)pd & 15))
1117 : {
1118 0 : s = combine1 (ps, pm);
1119 0 : d = *pd;
1120 :
1121 0 : *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1122 0 : w--;
1123 0 : ps++;
1124 0 : if (pm)
1125 0 : pm++;
1126 : }
1127 :
1128 0 : while (w >= 4)
1129 : {
1130 0 : xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1131 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1132 :
1133 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1134 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1135 :
1136 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1137 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1138 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1139 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1140 :
1141 0 : negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1142 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1143 :
1144 : pix_add_multiply_2x128 (
1145 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1146 : &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1147 : &xmm_dst_lo, &xmm_dst_hi);
1148 :
1149 0 : save_128_aligned (
1150 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1151 :
1152 0 : ps += 4;
1153 0 : pd += 4;
1154 0 : w -= 4;
1155 0 : if (pm)
1156 0 : pm += 4;
1157 : }
1158 :
1159 0 : while (w)
1160 : {
1161 0 : s = combine1 (ps, pm);
1162 0 : d = *pd;
1163 :
1164 0 : *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1165 0 : w--;
1166 0 : ps++;
1167 0 : if (pm)
1168 0 : pm++;
1169 : }
1170 0 : }
1171 :
1172 : static force_inline uint32_t
1173 : core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1174 : uint32_t dst)
1175 : {
1176 0 : __m128i s = unpack_32_1x128 (src);
1177 0 : __m128i d = unpack_32_1x128 (dst);
1178 :
1179 0 : __m128i sa = expand_alpha_1x128 (s);
1180 0 : __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1181 :
1182 0 : return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1183 : }
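/* ATOP_REVERSE: dst = src * (1 - alpha_dst) + dst * alpha_src */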
1184 :
1185 : static void
1186 0 : sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1187 : pixman_op_t op,
1188 : uint32_t * pd,
1189 : const uint32_t * ps,
1190 : const uint32_t * pm,
1191 : int w)
1192 : {
1193 : uint32_t s, d;
1194 :
1195 : __m128i xmm_src_lo, xmm_src_hi;
1196 : __m128i xmm_dst_lo, xmm_dst_hi;
1197 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1198 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1199 :
1200 0 : while (w && ((uintptr_t)pd & 15))
1201 : {
1202 0 : s = combine1 (ps, pm);
1203 0 : d = *pd;
1204 :
1205 0 : *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1206 0 : ps++;
1207 0 : w--;
1208 0 : if (pm)
1209 0 : pm++;
1210 : }
1211 :
1212 0 : while (w >= 4)
1213 : {
1214 0 : xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1215 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1216 :
1217 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1218 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1219 :
1220 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1221 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1222 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1223 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1224 :
1225 0 : negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1226 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1227 :
1228 : pix_add_multiply_2x128 (
1229 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1230 : &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1231 : &xmm_dst_lo, &xmm_dst_hi);
1232 :
1233 0 : save_128_aligned (
1234 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1235 :
1236 0 : ps += 4;
1237 0 : pd += 4;
1238 0 : w -= 4;
1239 0 : if (pm)
1240 0 : pm += 4;
1241 : }
1242 :
1243 0 : while (w)
1244 : {
1245 0 : s = combine1 (ps, pm);
1246 0 : d = *pd;
1247 :
1248 0 : *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1249 0 : ps++;
1250 0 : w--;
1251 0 : if (pm)
1252 0 : pm++;
1253 : }
1254 0 : }
1255 :
1256 : static force_inline uint32_t
1257 : core_combine_xor_u_pixel_sse2 (uint32_t src,
1258 : uint32_t dst)
1259 : {
1260 0 : __m128i s = unpack_32_1x128 (src);
1261 0 : __m128i d = unpack_32_1x128 (dst);
1262 :
1263 0 : __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1264 0 : __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1265 :
1266 0 : return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1267 : }
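/* XOR: dst = src * (1 - alpha_dst) + dst * (1 - alpha_src) */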
1268 :
1269 : static void
1270 0 : sse2_combine_xor_u (pixman_implementation_t *imp,
1271 : pixman_op_t op,
1272 : uint32_t * dst,
1273 : const uint32_t * src,
1274 : const uint32_t * mask,
1275 : int width)
1276 : {
1277 0 : int w = width;
1278 : uint32_t s, d;
1279 0 : uint32_t* pd = dst;
1280 0 : const uint32_t* ps = src;
1281 0 : const uint32_t* pm = mask;
1282 :
1283 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1284 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1285 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1286 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1287 :
1288 0 : while (w && ((uintptr_t)pd & 15))
1289 : {
1290 0 : s = combine1 (ps, pm);
1291 0 : d = *pd;
1292 :
1293 0 : *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1294 0 : w--;
1295 0 : ps++;
1296 0 : if (pm)
1297 0 : pm++;
1298 : }
1299 :
1300 0 : while (w >= 4)
1301 : {
1302 0 : xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1303 0 : xmm_dst = load_128_aligned ((__m128i*) pd);
1304 :
1305 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1306 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1307 :
1308 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1309 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1310 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1311 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1312 :
1313 0 : negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1314 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1315 0 : negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1316 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1317 :
1318 : pix_add_multiply_2x128 (
1319 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1320 : &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1321 : &xmm_dst_lo, &xmm_dst_hi);
1322 :
1323 0 : save_128_aligned (
1324 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1325 :
1326 0 : ps += 4;
1327 0 : pd += 4;
1328 0 : w -= 4;
1329 0 : if (pm)
1330 0 : pm += 4;
1331 : }
1332 :
1333 0 : while (w)
1334 : {
1335 0 : s = combine1 (ps, pm);
1336 0 : d = *pd;
1337 :
1338 0 : *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1339 0 : w--;
1340 0 : ps++;
1341 0 : if (pm)
1342 0 : pm++;
1343 : }
1344 0 : }
1345 :
1346 : static force_inline void
1347 0 : sse2_combine_add_u (pixman_implementation_t *imp,
1348 : pixman_op_t op,
1349 : uint32_t * dst,
1350 : const uint32_t * src,
1351 : const uint32_t * mask,
1352 : int width)
1353 : {
1354 0 : int w = width;
1355 : uint32_t s, d;
1356 0 : uint32_t* pd = dst;
1357 0 : const uint32_t* ps = src;
1358 0 : const uint32_t* pm = mask;
1359 :
1360 0 : while (w && (uintptr_t)pd & 15)
1361 : {
1362 0 : s = combine1 (ps, pm);
1363 0 : d = *pd;
1364 :
1365 0 : ps++;
1366 0 : if (pm)
1367 0 : pm++;
1368 0 : *pd++ = _mm_cvtsi128_si32 (
1369 : _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1370 0 : w--;
1371 : }
1372 :
1373 0 : while (w >= 4)
1374 : {
1375 : __m128i s;
1376 :
1377 0 : s = combine4 ((__m128i*)ps, (__m128i*)pm);
1378 :
1379 0 : save_128_aligned (
1380 : (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1381 :
1382 0 : pd += 4;
1383 0 : ps += 4;
1384 0 : if (pm)
1385 0 : pm += 4;
1386 0 : w -= 4;
1387 : }
1388 :
1389 0 : while (w--)
1390 : {
1391 0 : s = combine1 (ps, pm);
1392 0 : d = *pd;
1393 :
1394 0 : ps++;
1395 0 : *pd++ = _mm_cvtsi128_si32 (
1396 : _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1397 0 : if (pm)
1398 0 : pm++;
1399 : }
1400 0 : }
1401 :
1402 : static force_inline uint32_t
1403 : core_combine_saturate_u_pixel_sse2 (uint32_t src,
1404 : uint32_t dst)
1405 : {
1406 0 : __m128i ms = unpack_32_1x128 (src);
1407 0 : __m128i md = unpack_32_1x128 (dst);
1408 0 : uint32_t sa = src >> 24;
1409 0 : uint32_t da = ~dst >> 24;
1410 :
1411 0 : if (sa > da)
1412 : {
1413 0 : ms = pix_multiply_1x128 (
1414 0 : ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1415 : }
1416 :
1417 0 : return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1418 : }
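/* SATURATE: when the source alpha exceeds the destination's remaining
 * headroom (~dst >> 24), the source is first scaled by DIV_UN8 (da, sa)
 * so the subsequent saturating add cannot overshoot; otherwise it
 * degenerates to a plain saturating add, as in the vector path below.
 */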
1419 :
1420 : static void
1421 0 : sse2_combine_saturate_u (pixman_implementation_t *imp,
1422 : pixman_op_t op,
1423 : uint32_t * pd,
1424 : const uint32_t * ps,
1425 : const uint32_t * pm,
1426 : int w)
1427 : {
1428 : uint32_t s, d;
1429 :
1430 : uint32_t pack_cmp;
1431 : __m128i xmm_src, xmm_dst;
1432 :
1433 0 : while (w && (uintptr_t)pd & 15)
1434 : {
1435 0 : s = combine1 (ps, pm);
1436 0 : d = *pd;
1437 :
1438 0 : *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1439 0 : w--;
1440 0 : ps++;
1441 0 : if (pm)
1442 0 : pm++;
1443 : }
1444 :
1445 0 : while (w >= 4)
1446 : {
1447 0 : xmm_dst = load_128_aligned ((__m128i*)pd);
1448 0 : xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1449 :
1450 0 : pack_cmp = _mm_movemask_epi8 (
1451 : _mm_cmpgt_epi32 (
1452 : _mm_srli_epi32 (xmm_src, 24),
1453 : _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1454 :
1455 : /* if any source alpha is greater than the corresponding ~dst alpha */
1456 0 : if (pack_cmp)
1457 : {
1458 0 : s = combine1 (ps++, pm);
1459 0 : d = *pd;
1460 0 : *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1461 0 : if (pm)
1462 0 : pm++;
1463 :
1464 0 : s = combine1 (ps++, pm);
1465 0 : d = *pd;
1466 0 : *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1467 0 : if (pm)
1468 0 : pm++;
1469 :
1470 0 : s = combine1 (ps++, pm);
1471 0 : d = *pd;
1472 0 : *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1473 0 : if (pm)
1474 0 : pm++;
1475 :
1476 0 : s = combine1 (ps++, pm);
1477 0 : d = *pd;
1478 0 : *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1479 0 : if (pm)
1480 0 : pm++;
1481 : }
1482 : else
1483 : {
1484 0 : save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1485 :
1486 0 : pd += 4;
1487 0 : ps += 4;
1488 0 : if (pm)
1489 0 : pm += 4;
1490 : }
1491 :
1492 0 : w -= 4;
1493 : }
1494 :
1495 0 : while (w--)
1496 : {
1497 0 : s = combine1 (ps, pm);
1498 0 : d = *pd;
1499 :
1500 0 : *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1501 0 : ps++;
1502 0 : if (pm)
1503 0 : pm++;
1504 : }
1505 0 : }
1506 :
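/* The *_ca ("component alpha") combiners below receive a full 32-bit
 * mask per pixel and apply it channel by channel, instead of collapsing
 * it to a single alpha value as the unified combiners above do.
 */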
1507 : static void
1508 0 : sse2_combine_src_ca (pixman_implementation_t *imp,
1509 : pixman_op_t op,
1510 : uint32_t * pd,
1511 : const uint32_t * ps,
1512 : const uint32_t * pm,
1513 : int w)
1514 : {
1515 : uint32_t s, m;
1516 :
1517 : __m128i xmm_src_lo, xmm_src_hi;
1518 : __m128i xmm_mask_lo, xmm_mask_hi;
1519 : __m128i xmm_dst_lo, xmm_dst_hi;
1520 :
1521 0 : while (w && (uintptr_t)pd & 15)
1522 : {
1523 0 : s = *ps++;
1524 0 : m = *pm++;
1525 0 : *pd++ = pack_1x128_32 (
1526 : pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1527 0 : w--;
1528 : }
1529 :
1530 0 : while (w >= 4)
1531 : {
1532 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1533 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1534 :
1535 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1536 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1537 :
1538 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1539 : &xmm_mask_lo, &xmm_mask_hi,
1540 : &xmm_dst_lo, &xmm_dst_hi);
1541 :
1542 0 : save_128_aligned (
1543 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1544 :
1545 0 : ps += 4;
1546 0 : pd += 4;
1547 0 : pm += 4;
1548 0 : w -= 4;
1549 : }
1550 :
1551 0 : while (w)
1552 : {
1553 0 : s = *ps++;
1554 0 : m = *pm++;
1555 0 : *pd++ = pack_1x128_32 (
1556 : pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1557 0 : w--;
1558 : }
1559 0 : }
1560 :
1561 : static force_inline uint32_t
1562 : core_combine_over_ca_pixel_sse2 (uint32_t src,
1563 : uint32_t mask,
1564 : uint32_t dst)
1565 : {
1566 0 : __m128i s = unpack_32_1x128 (src);
1567 0 : __m128i expAlpha = expand_alpha_1x128 (s);
1568 0 : __m128i unpk_mask = unpack_32_1x128 (mask);
1569 0 : __m128i unpk_dst = unpack_32_1x128 (dst);
1570 :
1571 0 : return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1572 : }
1573 :
1574 : static void
1575 0 : sse2_combine_over_ca (pixman_implementation_t *imp,
1576 : pixman_op_t op,
1577 : uint32_t * pd,
1578 : const uint32_t * ps,
1579 : const uint32_t * pm,
1580 : int w)
1581 : {
1582 : uint32_t s, m, d;
1583 :
1584 : __m128i xmm_alpha_lo, xmm_alpha_hi;
1585 : __m128i xmm_src_lo, xmm_src_hi;
1586 : __m128i xmm_dst_lo, xmm_dst_hi;
1587 : __m128i xmm_mask_lo, xmm_mask_hi;
1588 :
1589 0 : while (w && (uintptr_t)pd & 15)
1590 : {
1591 0 : s = *ps++;
1592 0 : m = *pm++;
1593 0 : d = *pd;
1594 :
1595 0 : *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1596 0 : w--;
1597 : }
1598 :
1599 0 : while (w >= 4)
1600 : {
1601 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1602 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1603 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1604 :
1605 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1606 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1607 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1608 :
1609 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1610 : &xmm_alpha_lo, &xmm_alpha_hi);
1611 :
1612 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1613 : &xmm_alpha_lo, &xmm_alpha_hi,
1614 : &xmm_mask_lo, &xmm_mask_hi,
1615 : &xmm_dst_lo, &xmm_dst_hi);
1616 :
1617 0 : save_128_aligned (
1618 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1619 :
1620 0 : ps += 4;
1621 0 : pd += 4;
1622 0 : pm += 4;
1623 0 : w -= 4;
1624 : }
1625 :
1626 0 : while (w)
1627 : {
1628 0 : s = *ps++;
1629 0 : m = *pm++;
1630 0 : d = *pd;
1631 :
1632 0 : *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1633 0 : w--;
1634 : }
1635 0 : }
1636 :
1637 : static force_inline uint32_t
1638 : core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1639 : uint32_t mask,
1640 : uint32_t dst)
1641 : {
1642 0 : __m128i d = unpack_32_1x128 (dst);
1643 :
1644 0 : return pack_1x128_32 (
1645 : over_1x128 (d, expand_alpha_1x128 (d),
1646 : pix_multiply_1x128 (unpack_32_1x128 (src),
1647 : unpack_32_1x128 (mask))));
1648 : }
1649 :
1650 : static void
1651 0 : sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1652 : pixman_op_t op,
1653 : uint32_t * pd,
1654 : const uint32_t * ps,
1655 : const uint32_t * pm,
1656 : int w)
1657 : {
1658 : uint32_t s, m, d;
1659 :
1660 : __m128i xmm_alpha_lo, xmm_alpha_hi;
1661 : __m128i xmm_src_lo, xmm_src_hi;
1662 : __m128i xmm_dst_lo, xmm_dst_hi;
1663 : __m128i xmm_mask_lo, xmm_mask_hi;
1664 :
1665 0 : while (w && (uintptr_t)pd & 15)
1666 : {
1667 0 : s = *ps++;
1668 0 : m = *pm++;
1669 0 : d = *pd;
1670 :
1671 0 : *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1672 0 : w--;
1673 : }
1674 :
1675 0 : while (w >= 4)
1676 : {
1677 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1678 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1679 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1680 :
1681 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1682 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1683 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1684 :
1685 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1686 : &xmm_alpha_lo, &xmm_alpha_hi);
1687 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1688 : &xmm_mask_lo, &xmm_mask_hi,
1689 : &xmm_mask_lo, &xmm_mask_hi);
1690 :
1691 : over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1692 : &xmm_alpha_lo, &xmm_alpha_hi,
1693 : &xmm_mask_lo, &xmm_mask_hi);
1694 :
1695 0 : save_128_aligned (
1696 : (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1697 :
1698 0 : ps += 4;
1699 0 : pd += 4;
1700 0 : pm += 4;
1701 0 : w -= 4;
1702 : }
1703 :
1704 0 : while (w)
1705 : {
1706 0 : s = *ps++;
1707 0 : m = *pm++;
1708 0 : d = *pd;
1709 :
1710 0 : *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1711 0 : w--;
1712 : }
1713 0 : }
1714 :
1715 : static void
1716 0 : sse2_combine_in_ca (pixman_implementation_t *imp,
1717 : pixman_op_t op,
1718 : uint32_t * pd,
1719 : const uint32_t * ps,
1720 : const uint32_t * pm,
1721 : int w)
1722 : {
1723 : uint32_t s, m, d;
1724 :
1725 : __m128i xmm_alpha_lo, xmm_alpha_hi;
1726 : __m128i xmm_src_lo, xmm_src_hi;
1727 : __m128i xmm_dst_lo, xmm_dst_hi;
1728 : __m128i xmm_mask_lo, xmm_mask_hi;
1729 :
1730 0 : while (w && (uintptr_t)pd & 15)
1731 : {
1732 0 : s = *ps++;
1733 0 : m = *pm++;
1734 0 : d = *pd;
1735 :
1736 0 : *pd++ = pack_1x128_32 (
1737 : pix_multiply_1x128 (
1738 : pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1739 : expand_alpha_1x128 (unpack_32_1x128 (d))));
1740 :
1741 0 : w--;
1742 : }
1743 :
1744 0 : while (w >= 4)
1745 : {
1746 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1747 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1748 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1749 :
1750 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1751 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1752 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1753 :
1754 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1755 : &xmm_alpha_lo, &xmm_alpha_hi);
1756 :
1757 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1758 : &xmm_mask_lo, &xmm_mask_hi,
1759 : &xmm_dst_lo, &xmm_dst_hi);
1760 :
1761 : pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1762 : &xmm_alpha_lo, &xmm_alpha_hi,
1763 : &xmm_dst_lo, &xmm_dst_hi);
1764 :
1765 0 : save_128_aligned (
1766 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1767 :
1768 0 : ps += 4;
1769 0 : pd += 4;
1770 0 : pm += 4;
1771 0 : w -= 4;
1772 : }
1773 :
1774 0 : while (w)
1775 : {
1776 0 : s = *ps++;
1777 0 : m = *pm++;
1778 0 : d = *pd;
1779 :
1780 0 : *pd++ = pack_1x128_32 (
1781 : pix_multiply_1x128 (
1782 : pix_multiply_1x128 (
1783 : unpack_32_1x128 (s), unpack_32_1x128 (m)),
1784 : expand_alpha_1x128 (unpack_32_1x128 (d))));
1785 :
1786 0 : w--;
1787 : }
1788 0 : }
1789 :
1790 : static void
1791 0 : sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1792 : pixman_op_t op,
1793 : uint32_t * pd,
1794 : const uint32_t * ps,
1795 : const uint32_t * pm,
1796 : int w)
1797 : {
1798 : uint32_t s, m, d;
1799 :
1800 : __m128i xmm_alpha_lo, xmm_alpha_hi;
1801 : __m128i xmm_src_lo, xmm_src_hi;
1802 : __m128i xmm_dst_lo, xmm_dst_hi;
1803 : __m128i xmm_mask_lo, xmm_mask_hi;
1804 :
1805 0 : while (w && (uintptr_t)pd & 15)
1806 : {
1807 0 : s = *ps++;
1808 0 : m = *pm++;
1809 0 : d = *pd;
1810 :
1811 0 : *pd++ = pack_1x128_32 (
1812 : pix_multiply_1x128 (
1813 : unpack_32_1x128 (d),
1814 : pix_multiply_1x128 (unpack_32_1x128 (m),
1815 : expand_alpha_1x128 (unpack_32_1x128 (s)))));
1816 0 : w--;
1817 : }
1818 :
1819 0 : while (w >= 4)
1820 : {
1821 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824 :
1825 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828 :
1829 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830 : &xmm_alpha_lo, &xmm_alpha_hi);
1831 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832 : &xmm_alpha_lo, &xmm_alpha_hi,
1833 : &xmm_alpha_lo, &xmm_alpha_hi);
1834 :
1835 : pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836 : &xmm_alpha_lo, &xmm_alpha_hi,
1837 : &xmm_dst_lo, &xmm_dst_hi);
1838 :
1839 0 : save_128_aligned (
1840 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1841 :
1842 0 : ps += 4;
1843 0 : pd += 4;
1844 0 : pm += 4;
1845 0 : w -= 4;
1846 : }
1847 :
1848 0 : while (w)
1849 : {
1850 0 : s = *ps++;
1851 0 : m = *pm++;
1852 0 : d = *pd;
1853 :
1854 0 : *pd++ = pack_1x128_32 (
1855 : pix_multiply_1x128 (
1856 : unpack_32_1x128 (d),
1857 : pix_multiply_1x128 (unpack_32_1x128 (m),
1858 : expand_alpha_1x128 (unpack_32_1x128 (s)))));
1859 0 : w--;
1860 : }
1861 0 : }
1862 :
1863 : static void
1864 0 : sse2_combine_out_ca (pixman_implementation_t *imp,
1865 : pixman_op_t op,
1866 : uint32_t * pd,
1867 : const uint32_t * ps,
1868 : const uint32_t * pm,
1869 : int w)
1870 : {
1871 : uint32_t s, m, d;
1872 :
1873 : __m128i xmm_alpha_lo, xmm_alpha_hi;
1874 : __m128i xmm_src_lo, xmm_src_hi;
1875 : __m128i xmm_dst_lo, xmm_dst_hi;
1876 : __m128i xmm_mask_lo, xmm_mask_hi;
1877 :
1878 0 : while (w && (uintptr_t)pd & 15)
1879 : {
1880 0 : s = *ps++;
1881 0 : m = *pm++;
1882 0 : d = *pd;
1883 :
1884 0 : *pd++ = pack_1x128_32 (
1885 : pix_multiply_1x128 (
1886 : pix_multiply_1x128 (
1887 : unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888 : negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1889 0 : w--;
1890 : }
1891 :
1892 0 : while (w >= 4)
1893 : {
1894 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1897 :
1898 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1901 :
1902 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903 : &xmm_alpha_lo, &xmm_alpha_hi);
1904 0 : negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905 : &xmm_alpha_lo, &xmm_alpha_hi);
1906 :
1907 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908 : &xmm_mask_lo, &xmm_mask_hi,
1909 : &xmm_dst_lo, &xmm_dst_hi);
1910 : pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911 : &xmm_alpha_lo, &xmm_alpha_hi,
1912 : &xmm_dst_lo, &xmm_dst_hi);
1913 :
1914 0 : save_128_aligned (
1915 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1916 :
1917 0 : ps += 4;
1918 0 : pd += 4;
1919 0 : pm += 4;
1920 0 : w -= 4;
1921 : }
1922 :
1923 0 : while (w)
1924 : {
1925 0 : s = *ps++;
1926 0 : m = *pm++;
1927 0 : d = *pd;
1928 :
1929 0 : *pd++ = pack_1x128_32 (
1930 : pix_multiply_1x128 (
1931 : pix_multiply_1x128 (
1932 : unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933 : negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1934 :
1935 0 : w--;
1936 : }
1937 0 : }
1938 :
1939 : static void
1940 0 : sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1941 : pixman_op_t op,
1942 : uint32_t * pd,
1943 : const uint32_t * ps,
1944 : const uint32_t * pm,
1945 : int w)
1946 : {
1947 : uint32_t s, m, d;
1948 :
1949 : __m128i xmm_alpha_lo, xmm_alpha_hi;
1950 : __m128i xmm_src_lo, xmm_src_hi;
1951 : __m128i xmm_dst_lo, xmm_dst_hi;
1952 : __m128i xmm_mask_lo, xmm_mask_hi;
1953 :
1954 0 : while (w && (uintptr_t)pd & 15)
1955 : {
1956 0 : s = *ps++;
1957 0 : m = *pm++;
1958 0 : d = *pd;
1959 :
1960 0 : *pd++ = pack_1x128_32 (
1961 : pix_multiply_1x128 (
1962 : unpack_32_1x128 (d),
1963 : negate_1x128 (pix_multiply_1x128 (
1964 : unpack_32_1x128 (m),
1965 : expand_alpha_1x128 (unpack_32_1x128 (s))))));
1966 0 : w--;
1967 : }
1968 :
1969 0 : while (w >= 4)
1970 : {
1971 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1974 :
1975 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1978 :
1979 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980 : &xmm_alpha_lo, &xmm_alpha_hi);
1981 :
1982 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983 : &xmm_alpha_lo, &xmm_alpha_hi,
1984 : &xmm_mask_lo, &xmm_mask_hi);
1985 :
1986 0 : negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987 : &xmm_mask_lo, &xmm_mask_hi);
1988 :
1989 : pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990 : &xmm_mask_lo, &xmm_mask_hi,
1991 : &xmm_dst_lo, &xmm_dst_hi);
1992 :
1993 0 : save_128_aligned (
1994 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1995 :
1996 0 : ps += 4;
1997 0 : pd += 4;
1998 0 : pm += 4;
1999 0 : w -= 4;
2000 : }
2001 :
2002 0 : while (w)
2003 : {
2004 0 : s = *ps++;
2005 0 : m = *pm++;
2006 0 : d = *pd;
2007 :
2008 0 : *pd++ = pack_1x128_32 (
2009 : pix_multiply_1x128 (
2010 : unpack_32_1x128 (d),
2011 : negate_1x128 (pix_multiply_1x128 (
2012 : unpack_32_1x128 (m),
2013 : expand_alpha_1x128 (unpack_32_1x128 (s))))));
2014 0 : w--;
2015 : }
2016 0 : }
2017 :
2018 : static force_inline uint32_t
2019 : core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020 : uint32_t mask,
2021 : uint32_t dst)
2022 : {
2023 0 : __m128i m = unpack_32_1x128 (mask);
2024 0 : __m128i s = unpack_32_1x128 (src);
2025 0 : __m128i d = unpack_32_1x128 (dst);
2026 0 : __m128i sa = expand_alpha_1x128 (s);
2027 0 : __m128i da = expand_alpha_1x128 (d);
2028 :
2029 0 : s = pix_multiply_1x128 (s, m);
2030 0 : m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031 :
2032 0 : return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033 : }
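/* ATOP (component alpha):
 * dst = (src * mask) * alpha_dst + dst * (1 - mask * alpha_src)
 */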
2034 :
2035 : static void
2036 0 : sse2_combine_atop_ca (pixman_implementation_t *imp,
2037 : pixman_op_t op,
2038 : uint32_t * pd,
2039 : const uint32_t * ps,
2040 : const uint32_t * pm,
2041 : int w)
2042 : {
2043 : uint32_t s, m, d;
2044 :
2045 : __m128i xmm_src_lo, xmm_src_hi;
2046 : __m128i xmm_dst_lo, xmm_dst_hi;
2047 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049 : __m128i xmm_mask_lo, xmm_mask_hi;
2050 :
2051 0 : while (w && (uintptr_t)pd & 15)
2052 : {
2053 0 : s = *ps++;
2054 0 : m = *pm++;
2055 0 : d = *pd;
2056 :
2057 0 : *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2058 0 : w--;
2059 : }
2060 :
2061 0 : while (w >= 4)
2062 : {
2063 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2066 :
2067 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070 :
2071 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2075 :
2076 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077 : &xmm_mask_lo, &xmm_mask_hi,
2078 : &xmm_src_lo, &xmm_src_hi);
2079 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080 : &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081 : &xmm_mask_lo, &xmm_mask_hi);
2082 :
2083 0 : negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084 :
2085 : pix_add_multiply_2x128 (
2086 : &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088 : &xmm_dst_lo, &xmm_dst_hi);
2089 :
2090 0 : save_128_aligned (
2091 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092 :
2093 0 : ps += 4;
2094 0 : pd += 4;
2095 0 : pm += 4;
2096 0 : w -= 4;
2097 : }
2098 :
2099 0 : while (w)
2100 : {
2101 0 : s = *ps++;
2102 0 : m = *pm++;
2103 0 : d = *pd;
2104 :
2105 0 : *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2106 0 : w--;
2107 : }
2108 0 : }
2109 :
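     : /* Component-alpha ATOP-reverse, per channel:
     :  *
     :  *     dest = dest * mask * src_alpha + src * mask * (1 - dest_alpha)
     :  */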
2110 : static force_inline uint32_t
2111 : core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112 : uint32_t mask,
2113 : uint32_t dst)
2114 : {
2115 0 : __m128i m = unpack_32_1x128 (mask);
2116 0 : __m128i s = unpack_32_1x128 (src);
2117 0 : __m128i d = unpack_32_1x128 (dst);
2118 :
2119 0 : __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120 0 : __m128i sa = expand_alpha_1x128 (s);
2121 :
2122 0 : s = pix_multiply_1x128 (s, m);
2123 0 : m = pix_multiply_1x128 (m, sa);
2124 :
2125 0 : return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126 : }
2127 :
2128 : static void
2129 0 : sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2130 : pixman_op_t op,
2131 : uint32_t * pd,
2132 : const uint32_t * ps,
2133 : const uint32_t * pm,
2134 : int w)
2135 : {
2136 : uint32_t s, m, d;
2137 :
2138 : __m128i xmm_src_lo, xmm_src_hi;
2139 : __m128i xmm_dst_lo, xmm_dst_hi;
2140 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142 : __m128i xmm_mask_lo, xmm_mask_hi;
2143 :
2144 0 : while (w && (uintptr_t)pd & 15)
2145 : {
2146 0 : s = *ps++;
2147 0 : m = *pm++;
2148 0 : d = *pd;
2149 :
2150 0 : *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2151 0 : w--;
2152 : }
2153 :
2154 0 : while (w >= 4)
2155 : {
2156 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2159 :
2160 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2163 :
2164 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2168 :
2169 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170 : &xmm_mask_lo, &xmm_mask_hi,
2171 : &xmm_src_lo, &xmm_src_hi);
2172 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173 : &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174 : &xmm_mask_lo, &xmm_mask_hi);
2175 :
2176 0 : negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2178 :
2179 : pix_add_multiply_2x128 (
2180 : &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182 : &xmm_dst_lo, &xmm_dst_hi);
2183 :
2184 0 : save_128_aligned (
2185 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2186 :
2187 0 : ps += 4;
2188 0 : pd += 4;
2189 0 : pm += 4;
2190 0 : w -= 4;
2191 : }
2192 :
2193 0 : while (w)
2194 : {
2195 0 : s = *ps++;
2196 0 : m = *pm++;
2197 0 : d = *pd;
2198 :
2199 0 : *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2200 0 : w--;
2201 : }
2202 0 : }
2203 :
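     : /* Component-alpha XOR, per channel:
     :  *
     :  *     dest = src * mask * (1 - dest_alpha) + dest * (1 - mask * src_alpha)
     :  */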
2204 : static force_inline uint32_t
2205 : core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206 : uint32_t mask,
2207 : uint32_t dst)
2208 : {
2209 0 : __m128i a = unpack_32_1x128 (mask);
2210 0 : __m128i s = unpack_32_1x128 (src);
2211 0 : __m128i d = unpack_32_1x128 (dst);
2212 :
2213 0 : __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214 : a, expand_alpha_1x128 (s)));
2215 0 : __m128i dest = pix_multiply_1x128 (s, a);
2216 0 : __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217 :
2218 0 : return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219 : &alpha_dst,
2220 : &dest,
2221 : &alpha_src));
2222 : }
2223 :
2224 : static void
2225 0 : sse2_combine_xor_ca (pixman_implementation_t *imp,
2226 : pixman_op_t op,
2227 : uint32_t * pd,
2228 : const uint32_t * ps,
2229 : const uint32_t * pm,
2230 : int w)
2231 : {
2232 : uint32_t s, m, d;
2233 :
2234 : __m128i xmm_src_lo, xmm_src_hi;
2235 : __m128i xmm_dst_lo, xmm_dst_hi;
2236 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238 : __m128i xmm_mask_lo, xmm_mask_hi;
2239 :
2240 0 : while (w && (uintptr_t)pd & 15)
2241 : {
2242 0 : s = *ps++;
2243 0 : m = *pm++;
2244 0 : d = *pd;
2245 :
2246 0 : *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247 0 : w--;
2248 : }
2249 :
2250 0 : while (w >= 4)
2251 : {
2252 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255 :
2256 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259 :
2260 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264 :
2265 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266 : &xmm_mask_lo, &xmm_mask_hi,
2267 : &xmm_src_lo, &xmm_src_hi);
2268 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269 : &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270 : &xmm_mask_lo, &xmm_mask_hi);
2271 :
2272 0 : negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274 0 : negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275 : &xmm_mask_lo, &xmm_mask_hi);
2276 :
2277 : pix_add_multiply_2x128 (
2278 : &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280 : &xmm_dst_lo, &xmm_dst_hi);
2281 :
2282 0 : save_128_aligned (
2283 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284 :
2285 0 : ps += 4;
2286 0 : pd += 4;
2287 0 : pm += 4;
2288 0 : w -= 4;
2289 : }
2290 :
2291 0 : while (w)
2292 : {
2293 0 : s = *ps++;
2294 0 : m = *pm++;
2295 0 : d = *pd;
2296 :
2297 0 : *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298 0 : w--;
2299 : }
2300 0 : }
2301 :
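     : /* Component-alpha ADD: dest = clamp (src * mask + dest), using the
     :  * saturating byte add _mm_adds_epu8 so channels cannot wrap around.
     :  */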
2302 : static void
2303 0 : sse2_combine_add_ca (pixman_implementation_t *imp,
2304 : pixman_op_t op,
2305 : uint32_t * pd,
2306 : const uint32_t * ps,
2307 : const uint32_t * pm,
2308 : int w)
2309 : {
2310 : uint32_t s, m, d;
2311 :
2312 : __m128i xmm_src_lo, xmm_src_hi;
2313 : __m128i xmm_dst_lo, xmm_dst_hi;
2314 : __m128i xmm_mask_lo, xmm_mask_hi;
2315 :
2316 0 : while (w && (uintptr_t)pd & 15)
2317 : {
2318 0 : s = *ps++;
2319 0 : m = *pm++;
2320 0 : d = *pd;
2321 :
2322 0 : *pd++ = pack_1x128_32 (
2323 : _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324 : unpack_32_1x128 (m)),
2325 : unpack_32_1x128 (d)));
2326 0 : w--;
2327 : }
2328 :
2329 0 : while (w >= 4)
2330 : {
2331 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334 :
2335 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338 :
2339 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340 : &xmm_mask_lo, &xmm_mask_hi,
2341 : &xmm_src_lo, &xmm_src_hi);
2342 :
2343 0 : save_128_aligned (
2344 : (__m128i*)pd, pack_2x128_128 (
2345 : _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346 : _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347 :
2348 0 : ps += 4;
2349 0 : pd += 4;
2350 0 : pm += 4;
2351 0 : w -= 4;
2352 : }
2353 :
2354 0 : while (w)
2355 : {
2356 0 : s = *ps++;
2357 0 : m = *pm++;
2358 0 : d = *pd;
2359 :
2360 0 : *pd++ = pack_1x128_32 (
2361 : _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362 : unpack_32_1x128 (m)),
2363 : unpack_32_1x128 (d)));
2364 0 : w--;
2365 : }
2366 0 : }
2367 :
2368 : static force_inline __m128i
2369 : create_mask_16_128 (uint16_t mask)
2370 : {
2371 8 : return _mm_set1_epi16 (mask);
2372 : }
2373 :
2374 : /* Work around a code generation bug in Sun Studio 12. */
2375 : #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2376 : # define create_mask_2x32_128(mask0, mask1) \
2377 : (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2378 : #else
2379 : static force_inline __m128i
2380 : create_mask_2x32_128 (uint32_t mask0,
2381 : uint32_t mask1)
2382 : {
2383 26 : return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2384 : }
2385 : #endif
2386 :
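     : /* Solid-color OVER: dest = src + dest * (1 - src_alpha), with the
     :  * source premultiplied. The composite routines below share one
     :  * skeleton: a scalar prologue runs until dst is 16-byte aligned,
     :  * the main loop performs one aligned 128-bit store per iteration
     :  * (four 32-bit or eight 16-bit pixels), and a scalar tail finishes
     :  * the row.
     :  */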
2387 : static void
2388 0 : sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2389 : pixman_composite_info_t *info)
2390 : {
2391 0 : PIXMAN_COMPOSITE_ARGS (info);
2392 : uint32_t src;
2393 : uint32_t *dst_line, *dst, d;
2394 : int32_t w;
2395 : int dst_stride;
2396 : __m128i xmm_src, xmm_alpha;
2397 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2398 :
2399 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2400 :
2401 0 : if (src == 0)
2402 0 : return;
2403 :
2404 0 : PIXMAN_IMAGE_GET_LINE (
2405 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2406 :
2407 0 : xmm_src = expand_pixel_32_1x128 (src);
2408 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2409 :
2410 0 : while (height--)
2411 : {
2412 0 : dst = dst_line;
2413 :
2414 0 : dst_line += dst_stride;
2415 0 : w = width;
2416 :
2417 0 : while (w && (uintptr_t)dst & 15)
2418 : {
2419 0 : d = *dst;
2420 0 : *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2421 : xmm_alpha,
2422 : unpack_32_1x128 (d)));
2423 0 : w--;
2424 : }
2425 :
2426 0 : while (w >= 4)
2427 : {
2428 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
2429 :
2430 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2431 :
2432 : over_2x128 (&xmm_src, &xmm_src,
2433 : &xmm_alpha, &xmm_alpha,
2434 : &xmm_dst_lo, &xmm_dst_hi);
2435 :
2436 :             /* rebuild the 4 pixel data and save */
2437 0 : save_128_aligned (
2438 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2439 :
2440 0 : w -= 4;
2441 0 : dst += 4;
2442 : }
2443 :
2444 0 : while (w)
2445 : {
2446 0 : d = *dst;
2447 0 : *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2448 : xmm_alpha,
2449 : unpack_32_1x128 (d)));
2450 0 : w--;
2451 : }
2452 :
2453 : }
2454 : }
2455 :
2456 : static void
2457 0 : sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2458 : pixman_composite_info_t *info)
2459 : {
2460 0 : PIXMAN_COMPOSITE_ARGS (info);
2461 : uint32_t src;
2462 : uint16_t *dst_line, *dst, d;
2463 : int32_t w;
2464 : int dst_stride;
2465 : __m128i xmm_src, xmm_alpha;
2466 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2467 :
2468 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2469 :
2470 0 : if (src == 0)
2471 0 : return;
2472 :
2473 0 : PIXMAN_IMAGE_GET_LINE (
2474 : dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2475 :
2476 0 : xmm_src = expand_pixel_32_1x128 (src);
2477 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2478 :
2479 0 : while (height--)
2480 : {
2481 0 : dst = dst_line;
2482 :
2483 0 : dst_line += dst_stride;
2484 0 : w = width;
2485 :
2486 0 : while (w && (uintptr_t)dst & 15)
2487 : {
2488 0 : d = *dst;
2489 :
2490 0 : *dst++ = pack_565_32_16 (
2491 : pack_1x128_32 (over_1x128 (xmm_src,
2492 : xmm_alpha,
2493 : expand565_16_1x128 (d))));
2494 0 : w--;
2495 : }
2496 :
2497 0 : while (w >= 8)
2498 : {
2499 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
2500 :
2501 : unpack_565_128_4x128 (xmm_dst,
2502 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2503 :
2504 : over_2x128 (&xmm_src, &xmm_src,
2505 : &xmm_alpha, &xmm_alpha,
2506 : &xmm_dst0, &xmm_dst1);
2507 : over_2x128 (&xmm_src, &xmm_src,
2508 : &xmm_alpha, &xmm_alpha,
2509 : &xmm_dst2, &xmm_dst3);
2510 :
2511 0 : xmm_dst = pack_565_4x128_128 (
2512 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2513 :
2514 : save_128_aligned ((__m128i*)dst, xmm_dst);
2515 :
2516 0 : dst += 8;
2517 0 : w -= 8;
2518 : }
2519 :
2520 0 : while (w--)
2521 : {
2522 0 : d = *dst;
2523 0 : *dst++ = pack_565_32_16 (
2524 : pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2525 : expand565_16_1x128 (d))));
2526 : }
2527 : }
2528 :
2529 : }
2530 :
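     : /* The mask is tested four pixels at a time: _mm_cmpeq_epi32 against
     :  * zero followed by _mm_movemask_epi8 yields 0xffff exactly when all
     :  * four mask pixels are zero, letting fully transparent blocks skip
     :  * the load/blend/store entirely.
     :  */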
2531 : static void
2532 0 : sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2533 : pixman_composite_info_t *info)
2534 : {
2535 0 : PIXMAN_COMPOSITE_ARGS (info);
2536 : uint32_t src;
2537 : uint32_t *dst_line, d;
2538 : uint32_t *mask_line, m;
2539 : uint32_t pack_cmp;
2540 : int dst_stride, mask_stride;
2541 :
2542 : __m128i xmm_src;
2543 : __m128i xmm_dst;
2544 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2545 :
2546 : __m128i mmx_src, mmx_mask, mmx_dest;
2547 :
2548 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2549 :
2550 0 : if (src == 0)
2551 0 : return;
2552 :
2553 0 : PIXMAN_IMAGE_GET_LINE (
2554 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2555 0 : PIXMAN_IMAGE_GET_LINE (
2556 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2557 :
2558 0 : xmm_src = _mm_unpacklo_epi8 (
2559 : create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2560 0 : mmx_src = xmm_src;
2561 :
2562 0 : while (height--)
2563 : {
2564 0 : int w = width;
2565 0 : const uint32_t *pm = (uint32_t *)mask_line;
2566 0 : uint32_t *pd = (uint32_t *)dst_line;
2567 :
2568 0 : dst_line += dst_stride;
2569 0 : mask_line += mask_stride;
2570 :
2571 0 : while (w && (uintptr_t)pd & 15)
2572 : {
2573 0 : m = *pm++;
2574 :
2575 0 : if (m)
2576 : {
2577 0 : d = *pd;
2578 :
2579 0 : mmx_mask = unpack_32_1x128 (m);
2580 0 : mmx_dest = unpack_32_1x128 (d);
2581 :
2582 0 : *pd = pack_1x128_32 (
2583 : _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2584 : mmx_dest));
2585 : }
2586 :
2587 0 : pd++;
2588 0 : w--;
2589 : }
2590 :
2591 0 : while (w >= 4)
2592 : {
2593 0 : xmm_mask = load_128_unaligned ((__m128i*)pm);
2594 :
2595 0 : pack_cmp =
2596 0 : _mm_movemask_epi8 (
2597 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2598 :
2599 :             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2600 0 : if (pack_cmp != 0xffff)
2601 : {
2602 0 : xmm_dst = load_128_aligned ((__m128i*)pd);
2603 :
2604 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2605 :
2606 : pix_multiply_2x128 (&xmm_src, &xmm_src,
2607 : &xmm_mask_lo, &xmm_mask_hi,
2608 : &xmm_mask_lo, &xmm_mask_hi);
2609 0 : xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2610 :
2611 0 : save_128_aligned (
2612 : (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2613 : }
2614 :
2615 0 : pd += 4;
2616 0 : pm += 4;
2617 0 : w -= 4;
2618 : }
2619 :
2620 0 : while (w)
2621 : {
2622 0 : m = *pm++;
2623 :
2624 0 : if (m)
2625 : {
2626 0 : d = *pd;
2627 :
2628 0 : mmx_mask = unpack_32_1x128 (m);
2629 0 : mmx_dest = unpack_32_1x128 (d);
2630 :
2631 0 : *pd = pack_1x128_32 (
2632 : _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2633 : mmx_dest));
2634 : }
2635 :
2636 0 : pd++;
2637 0 : w--;
2638 : }
2639 : }
2640 :
2641 : }
2642 :
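     : /* Component-alpha OVER: each pixel is (src IN mask) OVER dest,
     :  * fused by the in_over_* helpers defined earlier in this file; the
     :  * same all-zero-mask skip as in the ADD case above applies.
     :  */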
2643 : static void
2644 0 : sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2645 : pixman_composite_info_t *info)
2646 : {
2647 0 : PIXMAN_COMPOSITE_ARGS (info);
2648 : uint32_t src;
2649 : uint32_t *dst_line, d;
2650 : uint32_t *mask_line, m;
2651 : uint32_t pack_cmp;
2652 : int dst_stride, mask_stride;
2653 :
2654 : __m128i xmm_src, xmm_alpha;
2655 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2656 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2657 :
2658 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2659 :
2660 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2661 :
2662 0 : if (src == 0)
2663 0 : return;
2664 :
2665 0 : PIXMAN_IMAGE_GET_LINE (
2666 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2667 0 : PIXMAN_IMAGE_GET_LINE (
2668 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2669 :
2670 0 : xmm_src = _mm_unpacklo_epi8 (
2671 : create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2672 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2673 0 : mmx_src = xmm_src;
2674 0 : mmx_alpha = xmm_alpha;
2675 :
2676 0 : while (height--)
2677 : {
2678 0 : int w = width;
2679 0 : const uint32_t *pm = (uint32_t *)mask_line;
2680 0 : uint32_t *pd = (uint32_t *)dst_line;
2681 :
2682 0 : dst_line += dst_stride;
2683 0 : mask_line += mask_stride;
2684 :
2685 0 : while (w && (uintptr_t)pd & 15)
2686 : {
2687 0 : m = *pm++;
2688 :
2689 0 : if (m)
2690 : {
2691 0 : d = *pd;
2692 0 : mmx_mask = unpack_32_1x128 (m);
2693 0 : mmx_dest = unpack_32_1x128 (d);
2694 :
2695 0 : *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2696 : &mmx_alpha,
2697 : &mmx_mask,
2698 : &mmx_dest));
2699 : }
2700 :
2701 0 : pd++;
2702 0 : w--;
2703 : }
2704 :
2705 0 : while (w >= 4)
2706 : {
2707 0 : xmm_mask = load_128_unaligned ((__m128i*)pm);
2708 :
2709 0 : pack_cmp =
2710 0 : _mm_movemask_epi8 (
2711 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2712 :
2713 :             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2714 0 : if (pack_cmp != 0xffff)
2715 : {
2716 0 : xmm_dst = load_128_aligned ((__m128i*)pd);
2717 :
2718 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2719 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2720 :
2721 : in_over_2x128 (&xmm_src, &xmm_src,
2722 : &xmm_alpha, &xmm_alpha,
2723 : &xmm_mask_lo, &xmm_mask_hi,
2724 : &xmm_dst_lo, &xmm_dst_hi);
2725 :
2726 0 : save_128_aligned (
2727 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2728 : }
2729 :
2730 0 : pd += 4;
2731 0 : pm += 4;
2732 0 : w -= 4;
2733 : }
2734 :
2735 0 : while (w)
2736 : {
2737 0 : m = *pm++;
2738 :
2739 0 : if (m)
2740 : {
2741 0 : d = *pd;
2742 0 : mmx_mask = unpack_32_1x128 (m);
2743 0 : mmx_dest = unpack_32_1x128 (d);
2744 :
2745 0 : *pd = pack_1x128_32 (
2746 : in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2747 : }
2748 :
2749 0 : pd++;
2750 0 : w--;
2751 : }
2752 : }
2753 :
2754 : }
2755 :
2756 : static void
2757 0 : sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2758 : pixman_composite_info_t *info)
2759 : {
2760 0 : PIXMAN_COMPOSITE_ARGS (info);
2761 : uint32_t *dst_line, *dst;
2762 : uint32_t *src_line, *src;
2763 : uint32_t mask;
2764 : int32_t w;
2765 : int dst_stride, src_stride;
2766 :
2767 : __m128i xmm_mask;
2768 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2769 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2770 : __m128i xmm_alpha_lo, xmm_alpha_hi;
2771 :
2772 0 : PIXMAN_IMAGE_GET_LINE (
2773 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2774 0 : PIXMAN_IMAGE_GET_LINE (
2775 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2776 :
2777 0 : mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2778 :
2779 0 : xmm_mask = create_mask_16_128 (mask >> 24);
2780 :
2781 0 : while (height--)
2782 : {
2783 0 : dst = dst_line;
2784 0 : dst_line += dst_stride;
2785 0 : src = src_line;
2786 0 : src_line += src_stride;
2787 0 : w = width;
2788 :
2789 0 : while (w && (uintptr_t)dst & 15)
2790 : {
2791 0 : uint32_t s = *src++;
2792 :
2793 0 : if (s)
2794 : {
2795 0 : uint32_t d = *dst;
2796 :
2797 0 : __m128i ms = unpack_32_1x128 (s);
2798 0 : __m128i alpha = expand_alpha_1x128 (ms);
2799 0 : __m128i dest = xmm_mask;
2800 0 : __m128i alpha_dst = unpack_32_1x128 (d);
2801 :
2802 0 : *dst = pack_1x128_32 (
2803 : in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2804 : }
2805 0 : dst++;
2806 0 : w--;
2807 : }
2808 :
2809 0 : while (w >= 4)
2810 : {
2811 0 : xmm_src = load_128_unaligned ((__m128i*)src);
2812 :
2813 0 : if (!is_zero (xmm_src))
2814 : {
2815 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
2816 :
2817 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2818 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2819 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2820 : &xmm_alpha_lo, &xmm_alpha_hi);
2821 :
2822 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2823 : &xmm_alpha_lo, &xmm_alpha_hi,
2824 : &xmm_mask, &xmm_mask,
2825 : &xmm_dst_lo, &xmm_dst_hi);
2826 :
2827 0 : save_128_aligned (
2828 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2829 : }
2830 :
2831 0 : dst += 4;
2832 0 : src += 4;
2833 0 : w -= 4;
2834 : }
2835 :
2836 0 : while (w)
2837 : {
2838 0 : uint32_t s = *src++;
2839 :
2840 0 : if (s)
2841 : {
2842 0 : uint32_t d = *dst;
2843 :
2844 0 : __m128i ms = unpack_32_1x128 (s);
2845 0 : __m128i alpha = expand_alpha_1x128 (ms);
2846 0 : __m128i mask = xmm_mask;
2847 0 : __m128i dest = unpack_32_1x128 (d);
2848 :
2849 0 : *dst = pack_1x128_32 (
2850 : in_over_1x128 (&ms, &alpha, &mask, &dest));
2851 : }
2852 :
2853 0 : dst++;
2854 0 : w--;
2855 : }
2856 : }
2857 :
2858 0 : }
2859 :
2860 : static void
2861 0 : sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2862 : pixman_composite_info_t *info)
2863 : {
2864 0 : PIXMAN_COMPOSITE_ARGS (info);
2865 : uint16_t *dst_line, *dst;
2866 : uint32_t *src_line, *src, s;
2867 : int dst_stride, src_stride;
2868 : int32_t w;
2869 :
2870 0 : PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2871 0 : PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2872 :
2873 0 : while (height--)
2874 : {
2875 0 : dst = dst_line;
2876 0 : dst_line += dst_stride;
2877 0 : src = src_line;
2878 0 : src_line += src_stride;
2879 0 : w = width;
2880 :
2881 0 : while (w && (uintptr_t)dst & 15)
2882 : {
2883 0 : s = *src++;
2884 0 : *dst = convert_8888_to_0565 (s);
2885 0 : dst++;
2886 0 : w--;
2887 : }
2888 :
2889 0 : while (w >= 8)
2890 : {
2891 0 : __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2892 0 : __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2893 :
2894 0 : save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2895 :
2896 0 : w -= 8;
2897 0 : src += 8;
2898 0 : dst += 8;
2899 : }
2900 :
2901 0 : while (w)
2902 : {
2903 0 : s = *src++;
2904 0 : *dst = convert_8888_to_0565 (s);
2905 0 : dst++;
2906 0 : w--;
2907 : }
2908 : }
2909 0 : }
2910 :
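     : /* SRC from x8r8g8b8 to a8r8g8b8: copy pixels while forcing the
     :  * alpha byte to 0xff by ORing with mask_ff000000; the unrolled main
     :  * loop moves 16 pixels (64 bytes) per iteration.
     :  */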
2911 : static void
2912 0 : sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2913 : pixman_composite_info_t *info)
2914 : {
2915 0 : PIXMAN_COMPOSITE_ARGS (info);
2916 : uint32_t *dst_line, *dst;
2917 : uint32_t *src_line, *src;
2918 : int32_t w;
2919 : int dst_stride, src_stride;
2920 :
2921 :
2922 0 : PIXMAN_IMAGE_GET_LINE (
2923 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2924 0 : PIXMAN_IMAGE_GET_LINE (
2925 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2926 :
2927 0 : while (height--)
2928 : {
2929 0 : dst = dst_line;
2930 0 : dst_line += dst_stride;
2931 0 : src = src_line;
2932 0 : src_line += src_stride;
2933 0 : w = width;
2934 :
2935 0 : while (w && (uintptr_t)dst & 15)
2936 : {
2937 0 : *dst++ = *src++ | 0xff000000;
2938 0 : w--;
2939 : }
2940 :
2941 0 : while (w >= 16)
2942 : {
2943 : __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2944 :
2945 0 : xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2946 0 : xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2947 0 : xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2948 0 : xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2949 :
2950 0 : save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2951 0 : save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2952 0 : save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2953 0 : save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2954 :
2955 0 : dst += 16;
2956 0 : src += 16;
2957 0 : w -= 16;
2958 : }
2959 :
2960 0 : while (w)
2961 : {
2962 0 : *dst++ = *src++ | 0xff000000;
2963 0 : w--;
2964 : }
2965 : }
2966 :
2967 0 : }
2968 :
2969 : static void
2970 0 : sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2971 : pixman_composite_info_t *info)
2972 : {
2973 0 : PIXMAN_COMPOSITE_ARGS (info);
2974 : uint32_t *dst_line, *dst;
2975 : uint32_t *src_line, *src;
2976 : uint32_t mask;
2977 : int dst_stride, src_stride;
2978 : int32_t w;
2979 :
2980 : __m128i xmm_mask, xmm_alpha;
2981 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2982 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2983 :
2984 0 : PIXMAN_IMAGE_GET_LINE (
2985 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2986 0 : PIXMAN_IMAGE_GET_LINE (
2987 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2988 :
2989 0 : mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2990 :
2991 0 : xmm_mask = create_mask_16_128 (mask >> 24);
2992 0 : xmm_alpha = mask_00ff;
2993 :
2994 0 : while (height--)
2995 : {
2996 0 : dst = dst_line;
2997 0 : dst_line += dst_stride;
2998 0 : src = src_line;
2999 0 : src_line += src_stride;
3000 0 : w = width;
3001 :
3002 0 : while (w && (uintptr_t)dst & 15)
3003 : {
3004 0 : uint32_t s = (*src++) | 0xff000000;
3005 0 : uint32_t d = *dst;
3006 :
3007 0 : __m128i src = unpack_32_1x128 (s);
3008 0 : __m128i alpha = xmm_alpha;
3009 0 : __m128i mask = xmm_mask;
3010 0 : __m128i dest = unpack_32_1x128 (d);
3011 :
3012 0 : *dst++ = pack_1x128_32 (
3013 : in_over_1x128 (&src, &alpha, &mask, &dest));
3014 :
3015 0 : w--;
3016 : }
3017 :
3018 0 : while (w >= 4)
3019 : {
3020 0 : xmm_src = _mm_or_si128 (
3021 : load_128_unaligned ((__m128i*)src), mask_ff000000);
3022 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
3023 :
3024 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3025 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3026 :
3027 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3028 : &xmm_alpha, &xmm_alpha,
3029 : &xmm_mask, &xmm_mask,
3030 : &xmm_dst_lo, &xmm_dst_hi);
3031 :
3032 0 : save_128_aligned (
3033 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3034 :
3035 0 : dst += 4;
3036 0 : src += 4;
3037 0 : w -= 4;
3038 :
3039 : }
3040 :
3041 0 : while (w)
3042 : {
3043 0 : uint32_t s = (*src++) | 0xff000000;
3044 0 : uint32_t d = *dst;
3045 :
3046 0 : __m128i src = unpack_32_1x128 (s);
3047 0 : __m128i alpha = xmm_alpha;
3048 0 : __m128i mask = xmm_mask;
3049 0 : __m128i dest = unpack_32_1x128 (d);
3050 :
3051 0 : *dst++ = pack_1x128_32 (
3052 : in_over_1x128 (&src, &alpha, &mask, &dest));
3053 :
3054 0 : w--;
3055 : }
3056 : }
3057 :
3058 0 : }
3059 :
3060 : static void
3061 0 : sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3062 : pixman_composite_info_t *info)
3063 : {
3064 0 : PIXMAN_COMPOSITE_ARGS (info);
3065 : int dst_stride, src_stride;
3066 : uint32_t *dst_line, *dst;
3067 : uint32_t *src_line, *src;
3068 :
3069 0 : PIXMAN_IMAGE_GET_LINE (
3070 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3071 0 : PIXMAN_IMAGE_GET_LINE (
3072 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3073 :
3074 0 : dst = dst_line;
3075 0 : src = src_line;
3076 :
3077 0 : while (height--)
3078 : {
3079 : sse2_combine_over_u (imp, op, dst, src, NULL, width);
3080 :
3081 0 : dst += dst_stride;
3082 0 : src += src_stride;
3083 : }
3084 0 : }
3085 :
3086 : static force_inline uint16_t
3087 : composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3088 : {
3089 : __m128i ms;
3090 :
3091 0 : ms = unpack_32_1x128 (src);
3092 0 : return pack_565_32_16 (
3093 : pack_1x128_32 (
3094 : over_1x128 (
3095 : ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3096 : }
3097 :
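     : /* OVER onto r5g6b5: the destination is widened to 8 bits per
     :  * channel with unpack_565_128_4x128, blended, and repacked with
     :  * pack_565_4x128_128, so one aligned 128-bit destination access
     :  * covers eight 16-bit pixels.
     :  */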
3098 : static void
3099 0 : sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3100 : pixman_composite_info_t *info)
3101 : {
3102 0 : PIXMAN_COMPOSITE_ARGS (info);
3103 : uint16_t *dst_line, *dst, d;
3104 : uint32_t *src_line, *src, s;
3105 : int dst_stride, src_stride;
3106 : int32_t w;
3107 :
3108 : __m128i xmm_alpha_lo, xmm_alpha_hi;
3109 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3110 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3111 :
3112 0 : PIXMAN_IMAGE_GET_LINE (
3113 : dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3114 0 : PIXMAN_IMAGE_GET_LINE (
3115 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3116 :
3117 0 : while (height--)
3118 : {
3119 0 : dst = dst_line;
3120 0 : src = src_line;
3121 :
3122 0 : dst_line += dst_stride;
3123 0 : src_line += src_stride;
3124 0 : w = width;
3125 :
3126 : /* Align dst on a 16-byte boundary */
3127 0 : while (w &&
3128 0 : ((uintptr_t)dst & 15))
3129 : {
3130 0 : s = *src++;
3131 0 : d = *dst;
3132 :
3133 0 : *dst++ = composite_over_8888_0565pixel (s, d);
3134 0 : w--;
3135 : }
3136 :
3137 :         /* It's an 8-pixel loop */
3138 0 : while (w >= 8)
3139 : {
3140 :             /* Load the source unaligned, since its address may not be
3141 :              * 16-byte aligned.
3142 :              */
3143 0 : xmm_src = load_128_unaligned ((__m128i*) src);
3144 0 : xmm_dst = load_128_aligned ((__m128i*) dst);
3145 :
3146 : /* Unpacking */
3147 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3148 : unpack_565_128_4x128 (xmm_dst,
3149 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3150 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3151 : &xmm_alpha_lo, &xmm_alpha_hi);
3152 :
3153 :             /* Load the next 4 pixels from memory ahead of time to
3154 :              * overlap the read with the computation below.
3155 :              */
3156 0 : xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3157 :
3158 : over_2x128 (&xmm_src_lo, &xmm_src_hi,
3159 : &xmm_alpha_lo, &xmm_alpha_hi,
3160 : &xmm_dst0, &xmm_dst1);
3161 :
3162 : /* Unpacking */
3163 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3164 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3165 : &xmm_alpha_lo, &xmm_alpha_hi);
3166 :
3167 : over_2x128 (&xmm_src_lo, &xmm_src_hi,
3168 : &xmm_alpha_lo, &xmm_alpha_hi,
3169 : &xmm_dst2, &xmm_dst3);
3170 :
3171 0 : save_128_aligned (
3172 : (__m128i*)dst, pack_565_4x128_128 (
3173 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3174 :
3175 0 : w -= 8;
3176 0 : dst += 8;
3177 0 : src += 8;
3178 : }
3179 :
3180 0 : while (w--)
3181 : {
3182 0 : s = *src++;
3183 0 : d = *dst;
3184 :
3185 0 : *dst++ = composite_over_8888_0565pixel (s, d);
3186 : }
3187 : }
3188 :
3189 0 : }
3190 :
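     : /* Solid color under an a8 mask. Four mask bytes are fetched as one
     :  * uint32_t: an opaque source under a fully set mask (srca == 0xff
     :  * && m == 0xffffffff) stores the precomputed xmm_def directly, and
     :  * a zero mask leaves the destination untouched.
     :  */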
3191 : static void
3192 0 : sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3193 : pixman_composite_info_t *info)
3194 : {
3195 0 : PIXMAN_COMPOSITE_ARGS (info);
3196 : uint32_t src, srca;
3197 : uint32_t *dst_line, *dst;
3198 : uint8_t *mask_line, *mask;
3199 : int dst_stride, mask_stride;
3200 : int32_t w;
3201 : uint32_t m, d;
3202 :
3203 : __m128i xmm_src, xmm_alpha, xmm_def;
3204 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3205 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3206 :
3207 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3208 :
3209 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3210 :
3211 0 : srca = src >> 24;
3212 0 : if (src == 0)
3213 0 : return;
3214 :
3215 0 : PIXMAN_IMAGE_GET_LINE (
3216 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3217 0 : PIXMAN_IMAGE_GET_LINE (
3218 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3219 :
3220 0 : xmm_def = create_mask_2x32_128 (src, src);
3221 0 : xmm_src = expand_pixel_32_1x128 (src);
3222 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
3223 0 : mmx_src = xmm_src;
3224 0 : mmx_alpha = xmm_alpha;
3225 :
3226 0 : while (height--)
3227 : {
3228 0 : dst = dst_line;
3229 0 : dst_line += dst_stride;
3230 0 : mask = mask_line;
3231 0 : mask_line += mask_stride;
3232 0 : w = width;
3233 :
3234 0 : while (w && (uintptr_t)dst & 15)
3235 : {
3236 0 : uint8_t m = *mask++;
3237 :
3238 0 : if (m)
3239 : {
3240 0 : d = *dst;
3241 0 : mmx_mask = expand_pixel_8_1x128 (m);
3242 0 : mmx_dest = unpack_32_1x128 (d);
3243 :
3244 0 : *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3245 : &mmx_alpha,
3246 : &mmx_mask,
3247 : &mmx_dest));
3248 : }
3249 :
3250 0 : w--;
3251 0 : dst++;
3252 : }
3253 :
3254 0 : while (w >= 4)
3255 : {
3256 0 : m = *((uint32_t*)mask);
3257 :
3258 0 : if (srca == 0xff && m == 0xffffffff)
3259 : {
3260 0 : save_128_aligned ((__m128i*)dst, xmm_def);
3261 : }
3262 0 : else if (m)
3263 : {
3264 0 : xmm_dst = load_128_aligned ((__m128i*) dst);
3265 0 : xmm_mask = unpack_32_1x128 (m);
3266 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3267 :
3268 : /* Unpacking */
3269 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3270 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3271 :
3272 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3273 : &xmm_mask_lo, &xmm_mask_hi);
3274 :
3275 : in_over_2x128 (&xmm_src, &xmm_src,
3276 : &xmm_alpha, &xmm_alpha,
3277 : &xmm_mask_lo, &xmm_mask_hi,
3278 : &xmm_dst_lo, &xmm_dst_hi);
3279 :
3280 0 : save_128_aligned (
3281 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3282 : }
3283 :
3284 0 : w -= 4;
3285 0 : dst += 4;
3286 0 : mask += 4;
3287 : }
3288 :
3289 0 : while (w)
3290 : {
3291 0 : uint8_t m = *mask++;
3292 :
3293 0 : if (m)
3294 : {
3295 0 : d = *dst;
3296 0 : mmx_mask = expand_pixel_8_1x128 (m);
3297 0 : mmx_dest = unpack_32_1x128 (d);
3298 :
3299 0 : *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3300 : &mmx_alpha,
3301 : &mmx_mask,
3302 : &mmx_dest));
3303 : }
3304 :
3305 0 : w--;
3306 0 : dst++;
3307 : }
3308 : }
3309 :
3310 : }
3311 :
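     : /* sse2_fill first replicates the filler to a full 32-bit pattern
     :  * (for bpp == 8, e.g. 0xab becomes 0xabababab; the "/ 1" and
     :  * "*= 1" are kept for symmetry with the 16- and 32-bpp branches),
     :  * aligns the destination with 1-, 2- and 4-byte stores, then
     :  * writes 128/64/32/16 bytes per step with aligned SSE2 stores
     :  * before a scalar tail.
     :  */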
3312 : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3313 : __attribute__((__force_align_arg_pointer__))
3314 : #endif
3315 : static pixman_bool_t
3316 0 : sse2_fill (pixman_implementation_t *imp,
3317 : uint32_t * bits,
3318 : int stride,
3319 : int bpp,
3320 : int x,
3321 : int y,
3322 : int width,
3323 : int height,
3324 : uint32_t filler)
3325 : {
3326 : uint32_t byte_width;
3327 : uint8_t *byte_line;
3328 :
3329 : __m128i xmm_def;
3330 :
3331 0 : if (bpp == 8)
3332 : {
3333 : uint8_t b;
3334 : uint16_t w;
3335 :
3336 0 : stride = stride * (int) sizeof (uint32_t) / 1;
3337 0 : byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3338 0 : byte_width = width;
3339 0 : stride *= 1;
3340 :
3341 0 : b = filler & 0xff;
3342 0 : w = (b << 8) | b;
3343 0 : filler = (w << 16) | w;
3344 : }
3345 0 : else if (bpp == 16)
3346 : {
3347 0 : stride = stride * (int) sizeof (uint32_t) / 2;
3348 0 : byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3349 0 : byte_width = 2 * width;
3350 0 : stride *= 2;
3351 :
3352 0 : filler = (filler & 0xffff) * 0x00010001;
3353 : }
3354 0 : else if (bpp == 32)
3355 : {
3356 0 : stride = stride * (int) sizeof (uint32_t) / 4;
3357 0 : byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3358 0 : byte_width = 4 * width;
3359 0 : stride *= 4;
3360 : }
3361 : else
3362 : {
3363 0 : return FALSE;
3364 : }
3365 :
3366 0 : xmm_def = create_mask_2x32_128 (filler, filler);
3367 :
3368 0 : while (height--)
3369 : {
3370 : int w;
3371 0 : uint8_t *d = byte_line;
3372 0 : byte_line += stride;
3373 0 : w = byte_width;
3374 :
3375 0 : if (w >= 1 && ((uintptr_t)d & 1))
3376 : {
3377 0 : *(uint8_t *)d = filler;
3378 0 : w -= 1;
3379 0 : d += 1;
3380 : }
3381 :
3382 0 : while (w >= 2 && ((uintptr_t)d & 3))
3383 : {
3384 0 : *(uint16_t *)d = filler;
3385 0 : w -= 2;
3386 0 : d += 2;
3387 : }
3388 :
3389 0 : while (w >= 4 && ((uintptr_t)d & 15))
3390 : {
3391 0 : *(uint32_t *)d = filler;
3392 :
3393 0 : w -= 4;
3394 0 : d += 4;
3395 : }
3396 :
3397 0 : while (w >= 128)
3398 : {
3399 : save_128_aligned ((__m128i*)(d), xmm_def);
3400 0 : save_128_aligned ((__m128i*)(d + 16), xmm_def);
3401 0 : save_128_aligned ((__m128i*)(d + 32), xmm_def);
3402 0 : save_128_aligned ((__m128i*)(d + 48), xmm_def);
3403 0 : save_128_aligned ((__m128i*)(d + 64), xmm_def);
3404 0 : save_128_aligned ((__m128i*)(d + 80), xmm_def);
3405 0 : save_128_aligned ((__m128i*)(d + 96), xmm_def);
3406 0 : save_128_aligned ((__m128i*)(d + 112), xmm_def);
3407 :
3408 0 : d += 128;
3409 0 : w -= 128;
3410 : }
3411 :
3412 0 : if (w >= 64)
3413 : {
3414 : save_128_aligned ((__m128i*)(d), xmm_def);
3415 0 : save_128_aligned ((__m128i*)(d + 16), xmm_def);
3416 0 : save_128_aligned ((__m128i*)(d + 32), xmm_def);
3417 0 : save_128_aligned ((__m128i*)(d + 48), xmm_def);
3418 :
3419 0 : d += 64;
3420 0 : w -= 64;
3421 : }
3422 :
3423 0 : if (w >= 32)
3424 : {
3425 : save_128_aligned ((__m128i*)(d), xmm_def);
3426 0 : save_128_aligned ((__m128i*)(d + 16), xmm_def);
3427 :
3428 0 : d += 32;
3429 0 : w -= 32;
3430 : }
3431 :
3432 0 : if (w >= 16)
3433 : {
3434 : save_128_aligned ((__m128i*)(d), xmm_def);
3435 :
3436 0 : d += 16;
3437 0 : w -= 16;
3438 : }
3439 :
3440 0 : while (w >= 4)
3441 : {
3442 0 : *(uint32_t *)d = filler;
3443 :
3444 0 : w -= 4;
3445 0 : d += 4;
3446 : }
3447 :
3448 0 : if (w >= 2)
3449 : {
3450 0 : *(uint16_t *)d = filler;
3451 0 : w -= 2;
3452 0 : d += 2;
3453 : }
3454 :
3455 0 : if (w >= 1)
3456 : {
3457 0 : *(uint8_t *)d = filler;
3458 0 : w -= 1;
3459 0 : d += 1;
3460 : }
3461 : }
3462 :
3463 0 : return TRUE;
3464 : }
3465 :
3466 : static void
3467 0 : sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3468 : pixman_composite_info_t *info)
3469 : {
3470 0 : PIXMAN_COMPOSITE_ARGS (info);
3471 : uint32_t src, srca;
3472 : uint32_t *dst_line, *dst;
3473 : uint8_t *mask_line, *mask;
3474 : int dst_stride, mask_stride;
3475 : int32_t w;
3476 : uint32_t m;
3477 :
3478 : __m128i xmm_src, xmm_def;
3479 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3480 :
3481 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3482 :
3483 0 : srca = src >> 24;
3484 0 : if (src == 0)
3485 : {
3486 0 : sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3487 0 : PIXMAN_FORMAT_BPP (dest_image->bits.format),
3488 : dest_x, dest_y, width, height, 0);
3489 0 : return;
3490 : }
3491 :
3492 0 : PIXMAN_IMAGE_GET_LINE (
3493 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3494 0 : PIXMAN_IMAGE_GET_LINE (
3495 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3496 :
3497 0 : xmm_def = create_mask_2x32_128 (src, src);
3498 0 : xmm_src = expand_pixel_32_1x128 (src);
3499 :
3500 0 : while (height--)
3501 : {
3502 0 : dst = dst_line;
3503 0 : dst_line += dst_stride;
3504 0 : mask = mask_line;
3505 0 : mask_line += mask_stride;
3506 0 : w = width;
3507 :
3508 0 : while (w && (uintptr_t)dst & 15)
3509 : {
3510 0 : uint8_t m = *mask++;
3511 :
3512 0 : if (m)
3513 : {
3514 0 : *dst = pack_1x128_32 (
3515 : pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3516 : }
3517 : else
3518 : {
3519 0 : *dst = 0;
3520 : }
3521 :
3522 0 : w--;
3523 0 : dst++;
3524 : }
3525 :
3526 0 : while (w >= 4)
3527 : {
3528 0 : m = *((uint32_t*)mask);
3529 :
3530 0 : if (srca == 0xff && m == 0xffffffff)
3531 : {
3532 0 : save_128_aligned ((__m128i*)dst, xmm_def);
3533 : }
3534 0 : else if (m)
3535 : {
3536 0 : xmm_mask = unpack_32_1x128 (m);
3537 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3538 :
3539 : /* Unpacking */
3540 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3541 :
3542 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3543 : &xmm_mask_lo, &xmm_mask_hi);
3544 :
3545 : pix_multiply_2x128 (&xmm_src, &xmm_src,
3546 : &xmm_mask_lo, &xmm_mask_hi,
3547 : &xmm_mask_lo, &xmm_mask_hi);
3548 :
3549 0 : save_128_aligned (
3550 : (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3551 : }
3552 : else
3553 : {
3554 0 : save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3555 : }
3556 :
3557 0 : w -= 4;
3558 0 : dst += 4;
3559 0 : mask += 4;
3560 : }
3561 :
3562 0 : while (w)
3563 : {
3564 0 : uint8_t m = *mask++;
3565 :
3566 0 : if (m)
3567 : {
3568 0 : *dst = pack_1x128_32 (
3569 : pix_multiply_1x128 (
3570 : xmm_src, expand_pixel_8_1x128 (m)));
3571 : }
3572 : else
3573 : {
3574 0 : *dst = 0;
3575 : }
3576 :
3577 0 : w--;
3578 0 : dst++;
3579 : }
3580 : }
3581 :
3582 : }
3583 :
3584 : static void
3585 0 : sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3586 : pixman_composite_info_t *info)
3587 : {
3588 0 : PIXMAN_COMPOSITE_ARGS (info);
3589 : uint32_t src;
3590 : uint16_t *dst_line, *dst, d;
3591 : uint8_t *mask_line, *mask;
3592 : int dst_stride, mask_stride;
3593 : int32_t w;
3594 : uint32_t m;
3595 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3596 :
3597 : __m128i xmm_src, xmm_alpha;
3598 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3599 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3600 :
3601 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3602 :
3603 0 : if (src == 0)
3604 0 : return;
3605 :
3606 0 : PIXMAN_IMAGE_GET_LINE (
3607 : dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3608 0 : PIXMAN_IMAGE_GET_LINE (
3609 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3610 :
3611 0 : xmm_src = expand_pixel_32_1x128 (src);
3612 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
3613 0 : mmx_src = xmm_src;
3614 0 : mmx_alpha = xmm_alpha;
3615 :
3616 0 : while (height--)
3617 : {
3618 0 : dst = dst_line;
3619 0 : dst_line += dst_stride;
3620 0 : mask = mask_line;
3621 0 : mask_line += mask_stride;
3622 0 : w = width;
3623 :
3624 0 : while (w && (uintptr_t)dst & 15)
3625 : {
3626 0 : m = *mask++;
3627 :
3628 0 : if (m)
3629 : {
3630 0 : d = *dst;
3631 0 : mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3632 0 : mmx_dest = expand565_16_1x128 (d);
3633 :
3634 0 : *dst = pack_565_32_16 (
3635 : pack_1x128_32 (
3636 : in_over_1x128 (
3637 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3638 : }
3639 :
3640 0 : w--;
3641 0 : dst++;
3642 : }
3643 :
3644 0 : while (w >= 8)
3645 : {
3646 0 : xmm_dst = load_128_aligned ((__m128i*) dst);
3647 : unpack_565_128_4x128 (xmm_dst,
3648 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3649 :
3650 0 : m = *((uint32_t*)mask);
3651 0 : mask += 4;
3652 :
3653 0 : if (m)
3654 : {
3655 0 : xmm_mask = unpack_32_1x128 (m);
3656 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3657 :
3658 : /* Unpacking */
3659 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3660 :
3661 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3662 : &xmm_mask_lo, &xmm_mask_hi);
3663 :
3664 : in_over_2x128 (&xmm_src, &xmm_src,
3665 : &xmm_alpha, &xmm_alpha,
3666 : &xmm_mask_lo, &xmm_mask_hi,
3667 : &xmm_dst0, &xmm_dst1);
3668 : }
3669 :
3670 0 : m = *((uint32_t*)mask);
3671 0 : mask += 4;
3672 :
3673 0 : if (m)
3674 : {
3675 0 : xmm_mask = unpack_32_1x128 (m);
3676 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3677 :
3678 : /* Unpacking */
3679 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3680 :
3681 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3682 : &xmm_mask_lo, &xmm_mask_hi);
3683 : in_over_2x128 (&xmm_src, &xmm_src,
3684 : &xmm_alpha, &xmm_alpha,
3685 : &xmm_mask_lo, &xmm_mask_hi,
3686 : &xmm_dst2, &xmm_dst3);
3687 : }
3688 :
3689 0 : save_128_aligned (
3690 : (__m128i*)dst, pack_565_4x128_128 (
3691 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3692 :
3693 0 : w -= 8;
3694 0 : dst += 8;
3695 : }
3696 :
3697 0 : while (w)
3698 : {
3699 0 : m = *mask++;
3700 :
3701 0 : if (m)
3702 : {
3703 0 : d = *dst;
3704 0 : mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3705 0 : mmx_dest = expand565_16_1x128 (d);
3706 :
3707 0 : *dst = pack_565_32_16 (
3708 : pack_1x128_32 (
3709 : in_over_1x128 (
3710 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3711 : }
3712 :
3713 0 : w--;
3714 0 : dst++;
3715 : }
3716 : }
3717 :
3718 : }
3719 :
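     : /* "pixbuf" sources are non-premultiplied with reversed channel
     :  * order, handled by the over_rev_non_pre_* helpers. Each 4-pixel
     :  * group takes a fast path when possible: fully opaque blocks only
     :  * need their colors reordered (invert_colors_2x128), and fully
     :  * transparent blocks are skipped.
     :  */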
3720 : static void
3721 0 : sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3722 : pixman_composite_info_t *info)
3723 : {
3724 0 : PIXMAN_COMPOSITE_ARGS (info);
3725 : uint16_t *dst_line, *dst, d;
3726 : uint32_t *src_line, *src, s;
3727 : int dst_stride, src_stride;
3728 : int32_t w;
3729 : uint32_t opaque, zero;
3730 :
3731 : __m128i ms;
3732 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3733 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3734 :
3735 0 : PIXMAN_IMAGE_GET_LINE (
3736 : dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3737 0 : PIXMAN_IMAGE_GET_LINE (
3738 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3739 :
3740 0 : while (height--)
3741 : {
3742 0 : dst = dst_line;
3743 0 : dst_line += dst_stride;
3744 0 : src = src_line;
3745 0 : src_line += src_stride;
3746 0 : w = width;
3747 :
3748 0 : while (w && (uintptr_t)dst & 15)
3749 : {
3750 0 : s = *src++;
3751 0 : d = *dst;
3752 :
3753 0 : ms = unpack_32_1x128 (s);
3754 :
3755 0 : *dst++ = pack_565_32_16 (
3756 : pack_1x128_32 (
3757 : over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3758 0 : w--;
3759 : }
3760 :
3761 0 : while (w >= 8)
3762 : {
3763 : /* First round */
3764 0 : xmm_src = load_128_unaligned ((__m128i*)src);
3765 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
3766 :
3767 0 : opaque = is_opaque (xmm_src);
3768 0 : zero = is_zero (xmm_src);
3769 :
3770 : unpack_565_128_4x128 (xmm_dst,
3771 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3772 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773 :
3774 :             /* preload next round */
3775 0 : xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3776 :
3777 0 : if (opaque)
3778 : {
3779 0 : invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3780 : &xmm_dst0, &xmm_dst1);
3781 : }
3782 0 : else if (!zero)
3783 : {
3784 0 : over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3785 : &xmm_dst0, &xmm_dst1);
3786 : }
3787 :
3788 : /* Second round */
3789 0 : opaque = is_opaque (xmm_src);
3790 0 : zero = is_zero (xmm_src);
3791 :
3792 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3793 :
3794 0 : if (opaque)
3795 : {
3796 0 : invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3797 : &xmm_dst2, &xmm_dst3);
3798 : }
3799 0 : else if (!zero)
3800 : {
3801 0 : over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3802 : &xmm_dst2, &xmm_dst3);
3803 : }
3804 :
3805 0 : save_128_aligned (
3806 : (__m128i*)dst, pack_565_4x128_128 (
3807 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3808 :
3809 0 : w -= 8;
3810 0 : src += 8;
3811 0 : dst += 8;
3812 : }
3813 :
3814 0 : while (w)
3815 : {
3816 0 : s = *src++;
3817 0 : d = *dst;
3818 :
3819 0 : ms = unpack_32_1x128 (s);
3820 :
3821 0 : *dst++ = pack_565_32_16 (
3822 : pack_1x128_32 (
3823 : over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3824 0 : w--;
3825 : }
3826 : }
3827 :
3828 0 : }
3829 :
3830 : static void
3831 0 : sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3832 : pixman_composite_info_t *info)
3833 : {
3834 0 : PIXMAN_COMPOSITE_ARGS (info);
3835 : uint32_t *dst_line, *dst, d;
3836 : uint32_t *src_line, *src, s;
3837 : int dst_stride, src_stride;
3838 : int32_t w;
3839 : uint32_t opaque, zero;
3840 :
3841 : __m128i xmm_src_lo, xmm_src_hi;
3842 : __m128i xmm_dst_lo, xmm_dst_hi;
3843 :
3844 0 : PIXMAN_IMAGE_GET_LINE (
3845 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3846 0 : PIXMAN_IMAGE_GET_LINE (
3847 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3848 :
3849 0 : while (height--)
3850 : {
3851 0 : dst = dst_line;
3852 0 : dst_line += dst_stride;
3853 0 : src = src_line;
3854 0 : src_line += src_stride;
3855 0 : w = width;
3856 :
3857 0 : while (w && (uintptr_t)dst & 15)
3858 : {
3859 0 : s = *src++;
3860 0 : d = *dst;
3861 :
3862 0 : *dst++ = pack_1x128_32 (
3863 : over_rev_non_pre_1x128 (
3864 : unpack_32_1x128 (s), unpack_32_1x128 (d)));
3865 :
3866 0 : w--;
3867 : }
3868 :
3869 0 : while (w >= 4)
3870 : {
3871 0 : xmm_src_hi = load_128_unaligned ((__m128i*)src);
3872 :
3873 0 : opaque = is_opaque (xmm_src_hi);
3874 0 : zero = is_zero (xmm_src_hi);
3875 :
3876 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3877 :
3878 0 : if (opaque)
3879 : {
3880 0 : invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3881 : &xmm_dst_lo, &xmm_dst_hi);
3882 :
3883 0 : save_128_aligned (
3884 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3885 : }
3886 0 : else if (!zero)
3887 : {
3888 0 : xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3889 :
3890 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3891 :
3892 0 : over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3893 : &xmm_dst_lo, &xmm_dst_hi);
3894 :
3895 0 : save_128_aligned (
3896 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3897 : }
3898 :
3899 0 : w -= 4;
3900 0 : dst += 4;
3901 0 : src += 4;
3902 : }
3903 :
3904 0 : while (w)
3905 : {
3906 0 : s = *src++;
3907 0 : d = *dst;
3908 :
3909 0 : *dst++ = pack_1x128_32 (
3910 : over_rev_non_pre_1x128 (
3911 : unpack_32_1x128 (s), unpack_32_1x128 (d)));
3912 :
3913 0 : w--;
3914 : }
3915 : }
3916 :
3917 0 : }
3918 :
3919 : static void
3920 0 : sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3921 : pixman_composite_info_t *info)
3922 : {
3923 0 : PIXMAN_COMPOSITE_ARGS (info);
3924 : uint32_t src;
3925 : uint16_t *dst_line, *dst, d;
3926 : uint32_t *mask_line, *mask, m;
3927 : int dst_stride, mask_stride;
3928 : int w;
3929 : uint32_t pack_cmp;
3930 :
3931 : __m128i xmm_src, xmm_alpha;
3932 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3933 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3934 :
3935 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3936 :
3937 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3938 :
3939 0 : if (src == 0)
3940 0 : return;
3941 :
3942 0 : PIXMAN_IMAGE_GET_LINE (
3943 : dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3944 0 : PIXMAN_IMAGE_GET_LINE (
3945 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3946 :
3947 0 : xmm_src = expand_pixel_32_1x128 (src);
3948 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
3949 0 : mmx_src = xmm_src;
3950 0 : mmx_alpha = xmm_alpha;
3951 :
3952 0 : while (height--)
3953 : {
3954 0 : w = width;
3955 0 : mask = mask_line;
3956 0 : dst = dst_line;
3957 0 : mask_line += mask_stride;
3958 0 : dst_line += dst_stride;
3959 :
3960 0 : while (w && ((uintptr_t)dst & 15))
3961 : {
3962 0 : m = *(uint32_t *) mask;
3963 :
3964 0 : if (m)
3965 : {
3966 0 : d = *dst;
3967 0 : mmx_mask = unpack_32_1x128 (m);
3968 0 : mmx_dest = expand565_16_1x128 (d);
3969 :
3970 0 : *dst = pack_565_32_16 (
3971 : pack_1x128_32 (
3972 : in_over_1x128 (
3973 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3974 : }
3975 :
3976 0 : w--;
3977 0 : dst++;
3978 0 : mask++;
3979 : }
3980 :
3981 0 : while (w >= 8)
3982 : {
3983 : /* First round */
3984 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
3985 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
3986 :
3987 0 : pack_cmp = _mm_movemask_epi8 (
3988 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3989 :
3990 : unpack_565_128_4x128 (xmm_dst,
3991 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3992 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3993 :
3994 : /* preload next round */
3995 0 : xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3996 :
3997 :
3998 0 : if (pack_cmp != 0xffff)
3999 : {
4000 : in_over_2x128 (&xmm_src, &xmm_src,
4001 : &xmm_alpha, &xmm_alpha,
4002 : &xmm_mask_lo, &xmm_mask_hi,
4003 : &xmm_dst0, &xmm_dst1);
4004 : }
4005 :
4006 : /* Second round */
4007 0 : pack_cmp = _mm_movemask_epi8 (
4008 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4009 :
4010 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4011 :
4012 0 : if (pack_cmp != 0xffff)
4013 : {
4014 : in_over_2x128 (&xmm_src, &xmm_src,
4015 : &xmm_alpha, &xmm_alpha,
4016 : &xmm_mask_lo, &xmm_mask_hi,
4017 : &xmm_dst2, &xmm_dst3);
4018 : }
4019 :
4020 0 : save_128_aligned (
4021 : (__m128i*)dst, pack_565_4x128_128 (
4022 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4023 :
4024 0 : w -= 8;
4025 0 : dst += 8;
4026 0 : mask += 8;
4027 : }
4028 :
4029 0 : while (w)
4030 : {
4031 0 : m = *(uint32_t *) mask;
4032 :
4033 0 : if (m)
4034 : {
4035 0 : d = *dst;
4036 0 : mmx_mask = unpack_32_1x128 (m);
4037 0 : mmx_dest = expand565_16_1x128 (d);
4038 :
4039 0 : *dst = pack_565_32_16 (
4040 : pack_1x128_32 (
4041 : in_over_1x128 (
4042 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4043 : }
4044 :
4045 0 : w--;
4046 0 : dst++;
4047 0 : mask++;
4048 : }
4049 : }
4050 :
4051 : }
4052 :
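     : /* The a8 routines below process 16 one-byte pixels per 128-bit
     :  * iteration; per pixel, sse2_composite_in_n_8_8 computes
     :  * dest = src_alpha * mask * dest.
     :  */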
4053 : static void
4054 0 : sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4055 : pixman_composite_info_t *info)
4056 : {
4057 0 : PIXMAN_COMPOSITE_ARGS (info);
4058 : uint8_t *dst_line, *dst;
4059 : uint8_t *mask_line, *mask;
4060 : int dst_stride, mask_stride;
4061 : uint32_t d, m;
4062 : uint32_t src;
4063 : int32_t w;
4064 :
4065 : __m128i xmm_alpha;
4066 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4067 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4068 :
4069 0 : PIXMAN_IMAGE_GET_LINE (
4070 : dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4071 0 : PIXMAN_IMAGE_GET_LINE (
4072 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4073 :
4074 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4075 :
4076 0 : xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4077 :
4078 0 : while (height--)
4079 : {
4080 0 : dst = dst_line;
4081 0 : dst_line += dst_stride;
4082 0 : mask = mask_line;
4083 0 : mask_line += mask_stride;
4084 0 : w = width;
4085 :
4086 0 : while (w && ((uintptr_t)dst & 15))
4087 : {
4088 0 : m = (uint32_t) *mask++;
4089 0 : d = (uint32_t) *dst;
4090 :
4091 0 : *dst++ = (uint8_t) pack_1x128_32 (
4092 : pix_multiply_1x128 (
4093 : pix_multiply_1x128 (xmm_alpha,
4094 : unpack_32_1x128 (m)),
4095 : unpack_32_1x128 (d)));
4096 0 : w--;
4097 : }
4098 :
4099 0 : while (w >= 16)
4100 : {
4101 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
4102 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4103 :
4104 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4105 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4106 :
4107 : pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4108 : &xmm_mask_lo, &xmm_mask_hi,
4109 : &xmm_mask_lo, &xmm_mask_hi);
4110 :
4111 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4112 : &xmm_dst_lo, &xmm_dst_hi,
4113 : &xmm_dst_lo, &xmm_dst_hi);
4114 :
4115 0 : save_128_aligned (
4116 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4117 :
4118 0 : mask += 16;
4119 0 : dst += 16;
4120 0 : w -= 16;
4121 : }
4122 :
4123 0 : while (w)
4124 : {
4125 0 : m = (uint32_t) *mask++;
4126 0 : d = (uint32_t) *dst;
4127 :
4128 0 : *dst++ = (uint8_t) pack_1x128_32 (
4129 : pix_multiply_1x128 (
4130 : pix_multiply_1x128 (
4131 : xmm_alpha, unpack_32_1x128 (m)),
4132 : unpack_32_1x128 (d)));
4133 0 : w--;
4134 : }
4135 : }
4136 :
4137 0 : }
4138 :
4139 : static void
4140 0 : sse2_composite_in_n_8 (pixman_implementation_t *imp,
4141 : pixman_composite_info_t *info)
4142 : {
4143 0 : PIXMAN_COMPOSITE_ARGS (info);
4144 : uint8_t *dst_line, *dst;
4145 : int dst_stride;
4146 : uint32_t d;
4147 : uint32_t src;
4148 : int32_t w;
4149 :
4150 : __m128i xmm_alpha;
4151 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152 :
4153 0 : PIXMAN_IMAGE_GET_LINE (
4154 : dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155 :
4156 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4157 :
4158 0 : xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4159 :
4160 0 : src = src >> 24;
4161 :
4162 0 : if (src == 0xff)
4163 0 : return;
4164 :
4165 0 : if (src == 0x00)
4166 : {
4167 0 : pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4168 : 8, dest_x, dest_y, width, height, src);
4169 :
4170 0 : return;
4171 : }
4172 :
4173 0 : while (height--)
4174 : {
4175 0 : dst = dst_line;
4176 0 : dst_line += dst_stride;
4177 0 : w = width;
4178 :
4179 0 : while (w && ((uintptr_t)dst & 15))
4180 : {
4181 0 : d = (uint32_t) *dst;
4182 :
4183 0 : *dst++ = (uint8_t) pack_1x128_32 (
4184 : pix_multiply_1x128 (
4185 : xmm_alpha,
4186 : unpack_32_1x128 (d)));
4187 0 : w--;
4188 : }
4189 :
4190 0 : while (w >= 16)
4191 : {
4192 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4193 :
4194 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4195 :
4196 : pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4197 : &xmm_dst_lo, &xmm_dst_hi,
4198 : &xmm_dst_lo, &xmm_dst_hi);
4199 :
4200 0 : save_128_aligned (
4201 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4202 :
4203 0 : dst += 16;
4204 0 : w -= 16;
4205 : }
4206 :
4207 0 : while (w)
4208 : {
4209 0 : d = (uint32_t) *dst;
4210 :
4211 0 : *dst++ = (uint8_t) pack_1x128_32 (
4212 : pix_multiply_1x128 (
4213 : xmm_alpha,
4214 : unpack_32_1x128 (d)));
4215 0 : w--;
4216 : }
4217 : }
4218 :
4219 : }
4220 :
4221 : static void
4222 0 : sse2_composite_in_8_8 (pixman_implementation_t *imp,
4223 : pixman_composite_info_t *info)
4224 : {
4225 0 : PIXMAN_COMPOSITE_ARGS (info);
4226 : uint8_t *dst_line, *dst;
4227 : uint8_t *src_line, *src;
4228 : int src_stride, dst_stride;
4229 : int32_t w;
4230 : uint32_t s, d;
4231 :
4232 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4233 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4234 :
4235 0 : PIXMAN_IMAGE_GET_LINE (
4236 : dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4237 0 : PIXMAN_IMAGE_GET_LINE (
4238 : src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4239 :
4240 0 : while (height--)
4241 : {
4242 0 : dst = dst_line;
4243 0 : dst_line += dst_stride;
4244 0 : src = src_line;
4245 0 : src_line += src_stride;
4246 0 : w = width;
4247 :
4248 0 : while (w && ((uintptr_t)dst & 15))
4249 : {
4250 0 : s = (uint32_t) *src++;
4251 0 : d = (uint32_t) *dst;
4252 :
4253 0 : *dst++ = (uint8_t) pack_1x128_32 (
4254 : pix_multiply_1x128 (
4255 : unpack_32_1x128 (s), unpack_32_1x128 (d)));
4256 0 : w--;
4257 : }
4258 :
4259 0 : while (w >= 16)
4260 : {
4261 0 : xmm_src = load_128_unaligned ((__m128i*)src);
4262 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4263 :
4264 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4265 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4266 :
4267 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4268 : &xmm_dst_lo, &xmm_dst_hi,
4269 : &xmm_dst_lo, &xmm_dst_hi);
4270 :
4271 0 : save_128_aligned (
4272 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4273 :
4274 0 : src += 16;
4275 0 : dst += 16;
4276 0 : w -= 16;
4277 : }
4278 :
4279 0 : while (w)
4280 : {
4281 0 : s = (uint32_t) *src++;
4282 0 : d = (uint32_t) *dst;
4283 :
4284 0 : *dst++ = (uint8_t) pack_1x128_32 (
4285 : pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4286 0 : w--;
4287 : }
4288 : }
4289 :
4290 0 : }
4291 :
4292 : static void
4293 0 : sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4294 : pixman_composite_info_t *info)
4295 : {
4296 0 : PIXMAN_COMPOSITE_ARGS (info);
4297 : uint8_t *dst_line, *dst;
4298 : uint8_t *mask_line, *mask;
4299 : int dst_stride, mask_stride;
4300 : int32_t w;
4301 : uint32_t src;
4302 : uint32_t m, d;
4303 :
4304 : __m128i xmm_alpha;
4305 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4306 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4307 :
4308 0 : PIXMAN_IMAGE_GET_LINE (
4309 : dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4310 0 : PIXMAN_IMAGE_GET_LINE (
4311 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4312 :
4313 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4314 :
4315 0 : xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4316 :
4317 0 : while (height--)
4318 : {
4319 0 : dst = dst_line;
4320 0 : dst_line += dst_stride;
4321 0 : mask = mask_line;
4322 0 : mask_line += mask_stride;
4323 0 : w = width;
4324 :
4325 0 : while (w && ((uintptr_t)dst & 15))
4326 : {
4327 0 : m = (uint32_t) *mask++;
4328 0 : d = (uint32_t) *dst;
4329 :
4330 0 : *dst++ = (uint8_t) pack_1x128_32 (
4331 : _mm_adds_epu16 (
4332 : pix_multiply_1x128 (
4333 : xmm_alpha, unpack_32_1x128 (m)),
4334 : unpack_32_1x128 (d)));
4335 0 : w--;
4336 : }
4337 :
4338 0 : while (w >= 16)
4339 : {
4340 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
4341 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4342 :
4343 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4344 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4345 :
4346 : pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4347 : &xmm_mask_lo, &xmm_mask_hi,
4348 : &xmm_mask_lo, &xmm_mask_hi);
4349 :
4350 0 : xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4351 0 : xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4352 :
4353 0 : save_128_aligned (
4354 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4355 :
4356 0 : mask += 16;
4357 0 : dst += 16;
4358 0 : w -= 16;
4359 : }
4360 :
4361 0 : while (w)
4362 : {
4363 0 : m = (uint32_t) *mask++;
4364 0 : d = (uint32_t) *dst;
4365 :
4366 0 : *dst++ = (uint8_t) pack_1x128_32 (
4367 : _mm_adds_epu16 (
4368 : pix_multiply_1x128 (
4369 : xmm_alpha, unpack_32_1x128 (m)),
4370 : unpack_32_1x128 (d)));
4371 :
4372 0 : w--;
4373 : }
4374 : }
4375 :
4376 0 : }
4377 :
4378 : static void
4379 0 : sse2_composite_add_n_8 (pixman_implementation_t *imp,
4380 : pixman_composite_info_t *info)
4381 : {
4382 0 : PIXMAN_COMPOSITE_ARGS (info);
4383 : uint8_t *dst_line, *dst;
4384 : int dst_stride;
4385 : int32_t w;
4386 : uint32_t src;
4387 :
4388 : __m128i xmm_src;
4389 :
4390 0 : PIXMAN_IMAGE_GET_LINE (
4391 : dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4392 :
4393 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4394 :
4395 0 : src >>= 24;
4396 :
4397 0 : if (src == 0x00)
4398 0 : return;
4399 :
4400 0 : if (src == 0xff)
4401 : {
4402 0 : pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4403 : 8, dest_x, dest_y, width, height, 0xff);
4404 :
4405 0 : return;
4406 : }
4407 :
4408 0 : src = (src << 24) | (src << 16) | (src << 8) | src;
4409 0 : xmm_src = _mm_set_epi32 (src, src, src, src);
4410 :
4411 0 : while (height--)
4412 : {
4413 0 : dst = dst_line;
4414 0 : dst_line += dst_stride;
4415 0 : w = width;
4416 :
4417 0 : while (w && ((uintptr_t)dst & 15))
4418 : {
4419 0 : *dst = (uint8_t)_mm_cvtsi128_si32 (
4420 : _mm_adds_epu8 (
4421 : xmm_src,
4422 0 : _mm_cvtsi32_si128 (*dst)));
4423 :
4424 0 : w--;
4425 0 : dst++;
4426 : }
4427 :
4428 0 : while (w >= 16)
4429 : {
4430 0 : save_128_aligned (
4431 : (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4432 :
4433 0 : dst += 16;
4434 0 : w -= 16;
4435 : }
4436 :
4437 0 : while (w)
4438 : {
4439 0 : *dst = (uint8_t)_mm_cvtsi128_si32 (
4440 : _mm_adds_epu8 (
4441 : xmm_src,
4442 0 : _mm_cvtsi32_si128 (*dst)));
4443 :
4444 0 : w--;
4445 0 : dst++;
4446 : }
4447 : }
4448 :
4449 : }
4450 :
4451 : static void
4452 0 : sse2_composite_add_8_8 (pixman_implementation_t *imp,
4453 : pixman_composite_info_t *info)
4454 : {
4455 0 : PIXMAN_COMPOSITE_ARGS (info);
4456 : uint8_t *dst_line, *dst;
4457 : uint8_t *src_line, *src;
4458 : int dst_stride, src_stride;
4459 : int32_t w;
4460 : uint16_t t;
4461 :
4462 0 : PIXMAN_IMAGE_GET_LINE (
4463 : src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4464 0 : PIXMAN_IMAGE_GET_LINE (
4465 : dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4466 :
4467 0 : while (height--)
4468 : {
4469 0 : dst = dst_line;
4470 0 : src = src_line;
4471 :
4472 0 : dst_line += dst_stride;
4473 0 : src_line += src_stride;
4474 0 : w = width;
4475 :
4476 : /* Small head */
4477 0 : while (w && (uintptr_t)dst & 3)
4478 : {
4479 0 : t = (*dst) + (*src++);
4480 0 : *dst++ = t | (0 - (t >> 8));
4481 0 : w--;
4482 : }
4483 :
4484 0 : sse2_combine_add_u (imp, op,
4485 : (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4486 :
4487 : /* Small tail */
4488 0 : dst += w & 0xfffc;
4489 0 : src += w & 0xfffc;
4490 :
4491 0 : w &= 3;
4492 :
4493 0 : while (w)
4494 : {
4495 0 : t = (*dst) + (*src++);
4496 0 : *dst++ = t | (0 - (t >> 8));
4497 0 : w--;
4498 : }
4499 : }
4500 :
4501 0 : }
4502 :
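     : /* The scalar head and tail above clamp without a branch: t is at most
     :  * 510, so (t >> 8) is 1 exactly when the sum overflowed a byte, and
     :  * (0 - (t >> 8)) is then all ones, forcing the OR result to 0xff.  A
     :  * standalone sketch of the trick (the helper name is illustrative):
     :  */
     : static force_inline uint8_t
     : add_saturate_u8_sketch (uint8_t a, uint8_t b)
     : {
     :     uint16_t t = (uint16_t)a + b;          /* 0 .. 510     */
     :     return (uint8_t)(t | (0 - (t >> 8)));  /* min (t, 255) */
     : }
     :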
4503 : static void
4504 0 : sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4505 : pixman_composite_info_t *info)
4506 : {
4507 0 : PIXMAN_COMPOSITE_ARGS (info);
4508 : uint32_t *dst_line, *dst;
4509 : uint32_t *src_line, *src;
4510 : int dst_stride, src_stride;
4511 :
4512 0 : PIXMAN_IMAGE_GET_LINE (
4513 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4514 0 : PIXMAN_IMAGE_GET_LINE (
4515 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4516 :
4517 0 : while (height--)
4518 : {
4519 0 : dst = dst_line;
4520 0 : dst_line += dst_stride;
4521 0 : src = src_line;
4522 0 : src_line += src_stride;
4523 :
4524 : sse2_combine_add_u (imp, op, dst, src, NULL, width);
4525 : }
4526 0 : }
4527 :
4528 : static void
4529 0 : sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4530 : pixman_composite_info_t *info)
4531 : {
4532 0 : PIXMAN_COMPOSITE_ARGS (info);
4533 : uint32_t *dst_line, *dst, src;
4534 : int dst_stride;
4535 :
4536 : __m128i xmm_src;
4537 :
4538 0 : PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4539 :
4540 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4541 0 : if (src == 0)
4542 0 : return;
4543 :
4544 0 : if (src == ~0)
4545 : {
4546 0 : pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4547 : dest_x, dest_y, width, height, ~0);
4548 :
4549 0 : return;
4550 : }
4551 :
4552 0 : xmm_src = _mm_set_epi32 (src, src, src, src);
4553 0 : while (height--)
4554 : {
4555 0 : int w = width;
4556 : uint32_t d;
4557 :
4558 0 : dst = dst_line;
4559 0 : dst_line += dst_stride;
4560 :
4561 0 : while (w && ((uintptr_t)dst & 15))
4562 : {
4563 0 : d = *dst;
4564 0 : *dst++ =
4565 0 : _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4566 0 : w--;
4567 : }
4568 :
4569 0 : while (w >= 4)
4570 : {
4571 0 : save_128_aligned
4572 : ((__m128i*)dst,
4573 : _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4574 :
4575 0 : dst += 4;
4576 0 : w -= 4;
4577 : }
4578 :
4579 0 : while (w--)
4580 : {
4581 0 : d = *dst;
4582 0 : *dst++ =
4583 0 : _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4584 : _mm_cvtsi32_si128 (d)));
4585 : }
4586 : }
4587 : }
4588 :
4589 : static void
4590 0 : sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4591 : pixman_composite_info_t *info)
4592 : {
4593 0 : PIXMAN_COMPOSITE_ARGS (info);
4594 : uint32_t *dst_line, *dst;
4595 : uint8_t *mask_line, *mask;
4596 : int dst_stride, mask_stride;
4597 : int32_t w;
4598 : uint32_t src;
4599 :
4600 : __m128i xmm_src;
4601 :
4602 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4603 0 : if (src == 0)
4604 0 : return;
4605 0 : xmm_src = expand_pixel_32_1x128 (src);
4606 :
4607 0 : PIXMAN_IMAGE_GET_LINE (
4608 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4609 0 : PIXMAN_IMAGE_GET_LINE (
4610 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4611 :
4612 0 : while (height--)
4613 : {
4614 0 : dst = dst_line;
4615 0 : dst_line += dst_stride;
4616 0 : mask = mask_line;
4617 0 : mask_line += mask_stride;
4618 0 : w = width;
4619 :
4620 0 : while (w && ((uintptr_t)dst & 15))
4621 : {
4622 0 : uint8_t m = *mask++;
4623 0 : if (m)
4624 : {
4625 0 : *dst = pack_1x128_32
4626 : (_mm_adds_epu16
4627 : (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4628 : unpack_32_1x128 (*dst)));
4629 : }
4630 0 : dst++;
4631 0 : w--;
4632 : }
4633 :
4634 0 : while (w >= 4)
4635 : {
4636 0 : uint32_t m = *(uint32_t*)mask;
4637 0 : if (m)
4638 : {
4639 : __m128i xmm_mask_lo, xmm_mask_hi;
4640 : __m128i xmm_dst_lo, xmm_dst_hi;
4641 :
4642 0 : __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4643 0 : __m128i xmm_mask =
4644 0 : _mm_unpacklo_epi8 (unpack_32_1x128(m),
4645 : _mm_setzero_si128 ());
4646 :
4647 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4648 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4649 :
4650 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4651 : &xmm_mask_lo, &xmm_mask_hi);
4652 :
4653 : pix_multiply_2x128 (&xmm_src, &xmm_src,
4654 : &xmm_mask_lo, &xmm_mask_hi,
4655 : &xmm_mask_lo, &xmm_mask_hi);
4656 :
4657 0 : xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4658 0 : xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4659 :
4660 0 : save_128_aligned (
4661 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4662 : }
4663 :
4664 0 : w -= 4;
4665 0 : dst += 4;
4666 0 : mask += 4;
4667 : }
4668 :
4669 0 : while (w)
4670 : {
4671 0 : uint8_t m = *mask++;
4672 0 : if (m)
4673 : {
4674 0 : *dst = pack_1x128_32
4675 : (_mm_adds_epu16
4676 : (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4677 : unpack_32_1x128 (*dst)));
4678 : }
4679 0 : dst++;
4680 0 : w--;
4681 : }
4682 : }
4683 : }
4684 :
4685 : static pixman_bool_t
4686 22 : sse2_blt (pixman_implementation_t *imp,
4687 : uint32_t * src_bits,
4688 : uint32_t * dst_bits,
4689 : int src_stride,
4690 : int dst_stride,
4691 : int src_bpp,
4692 : int dst_bpp,
4693 : int src_x,
4694 : int src_y,
4695 : int dest_x,
4696 : int dest_y,
4697 : int width,
4698 : int height)
4699 : {
4700 : uint8_t * src_bytes;
4701 : uint8_t * dst_bytes;
4702 : int byte_width;
4703 :
4704 22 : if (src_bpp != dst_bpp)
4705 0 : return FALSE;
4706 :
4707 22 : if (src_bpp == 16)
4708 : {
4709 0 : src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4710 0 : dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4711 0 : src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4712 0 : dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4713 0 : byte_width = 2 * width;
4714 0 : src_stride *= 2;
4715 0 : dst_stride *= 2;
4716 : }
4717 22 : else if (src_bpp == 32)
4718 : {
4719 22 : src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4720 22 : dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4721 22 : src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4722 22 : dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4723 22 : byte_width = 4 * width;
4724 22 : src_stride *= 4;
4725 22 : dst_stride *= 4;
4726 : }
4727 : else
4728 : {
4729 0 : return FALSE;
4730 : }
4731 :
4732 748 : while (height--)
4733 : {
4734 : int w;
4735 704 : uint8_t *s = src_bytes;
4736 704 : uint8_t *d = dst_bytes;
4737 704 : src_bytes += src_stride;
4738 704 : dst_bytes += dst_stride;
4739 704 : w = byte_width;
4740 :
4741 1408 : while (w >= 2 && ((uintptr_t)d & 3))
4742 : {
4743 0 : *(uint16_t *)d = *(uint16_t *)s;
4744 0 : w -= 2;
4745 0 : s += 2;
4746 0 : d += 2;
4747 : }
4748 :
4749 1408 : while (w >= 4 && ((uintptr_t)d & 15))
4750 : {
4751 0 : *(uint32_t *)d = *(uint32_t *)s;
4752 :
4753 0 : w -= 4;
4754 0 : s += 4;
4755 0 : d += 4;
4756 : }
4757 :
4758 2816 : while (w >= 64)
4759 : {
4760 : __m128i xmm0, xmm1, xmm2, xmm3;
4761 :
4762 1408 : xmm0 = load_128_unaligned ((__m128i*)(s));
4763 2816 : xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4764 2816 : xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4765 2816 : xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4766 :
4767 : save_128_aligned ((__m128i*)(d), xmm0);
4768 1408 : save_128_aligned ((__m128i*)(d + 16), xmm1);
4769 1408 : save_128_aligned ((__m128i*)(d + 32), xmm2);
4770 1408 : save_128_aligned ((__m128i*)(d + 48), xmm3);
4771 :
4772 1408 : s += 64;
4773 1408 : d += 64;
4774 1408 : w -= 64;
4775 : }
4776 :
4777 1408 : while (w >= 16)
4778 : {
4779 0 : save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4780 :
4781 0 : w -= 16;
4782 0 : d += 16;
4783 0 : s += 16;
4784 : }
4785 :
4786 1408 : while (w >= 4)
4787 : {
4788 0 : *(uint32_t *)d = *(uint32_t *)s;
4789 :
4790 0 : w -= 4;
4791 0 : s += 4;
4792 0 : d += 4;
4793 : }
4794 :
4795 704 : if (w >= 2)
4796 : {
4797 0 : *(uint16_t *)d = *(uint16_t *)s;
4798 0 : w -= 2;
4799 0 : s += 2;
4800 0 : d += 2;
4801 : }
4802 : }
4803 :
4804 22 : return TRUE;
4805 : }
4806 :
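     : /* sse2_blt above copies rows with unaligned loads and aligned stores:
     :  * the 2- and 4-byte head loops advance d to a 16-byte boundary, after
     :  * which the 64-byte loop moves a whole cache line per iteration
     :  * regardless of the source's alignment. */
     :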
4807 : static void
4808 22 : sse2_composite_copy_area (pixman_implementation_t *imp,
4809 : pixman_composite_info_t *info)
4810 : {
4811 22 : PIXMAN_COMPOSITE_ARGS (info);
4812 44 : sse2_blt (imp, src_image->bits.bits,
4813 : dest_image->bits.bits,
4814 : src_image->bits.rowstride,
4815 : dest_image->bits.rowstride,
4816 22 : PIXMAN_FORMAT_BPP (src_image->bits.format),
4817 22 : PIXMAN_FORMAT_BPP (dest_image->bits.format),
4818 : src_x, src_y, dest_x, dest_y, width, height);
4819 22 : }
4820 :
4821 : static void
4822 0 : sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4823 : pixman_composite_info_t *info)
4824 : {
4825 0 : PIXMAN_COMPOSITE_ARGS (info);
4826 : uint32_t *src, *src_line, s;
4827 : uint32_t *dst, *dst_line, d;
4828 : uint8_t *mask, *mask_line;
4829 : uint32_t m;
4830 : int src_stride, mask_stride, dst_stride;
4831 : int32_t w;
4832 : __m128i ms;
4833 :
4834 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4835 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4836 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4837 :
4838 0 : PIXMAN_IMAGE_GET_LINE (
4839 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4840 0 : PIXMAN_IMAGE_GET_LINE (
4841 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4842 0 : PIXMAN_IMAGE_GET_LINE (
4843 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4844 :
4845 0 : while (height--)
4846 : {
4847 0 : src = src_line;
4848 0 : src_line += src_stride;
4849 0 : dst = dst_line;
4850 0 : dst_line += dst_stride;
4851 0 : mask = mask_line;
4852 0 : mask_line += mask_stride;
4853 :
4854 0 : w = width;
4855 :
4856 0 : while (w && (uintptr_t)dst & 15)
4857 : {
4858 0 : s = 0xff000000 | *src++;
4859 0 : m = (uint32_t) *mask++;
4860 0 : d = *dst;
4861 0 : ms = unpack_32_1x128 (s);
4862 :
4863 0 : if (m != 0xff)
4864 : {
4865 0 : __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4866 0 : __m128i md = unpack_32_1x128 (d);
4867 :
4868 0 : ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4869 : }
4870 :
4871 0 : *dst++ = pack_1x128_32 (ms);
4872 0 : w--;
4873 : }
4874 :
4875 0 : while (w >= 4)
4876 : {
4877 0 : m = *(uint32_t*) mask;
4878 0 : xmm_src = _mm_or_si128 (
4879 : load_128_unaligned ((__m128i*)src), mask_ff000000);
4880 :
4881 0 : if (m == 0xffffffff)
4882 : {
4883 : save_128_aligned ((__m128i*)dst, xmm_src);
4884 : }
4885 : else
4886 : {
4887 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4888 :
4889 0 : xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4890 :
4891 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4892 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4893 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4894 :
4895 0 : expand_alpha_rev_2x128 (
4896 : xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4897 :
4898 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4899 : &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4900 : &xmm_dst_lo, &xmm_dst_hi);
4901 :
4902 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4903 : }
4904 :
4905 0 : src += 4;
4906 0 : dst += 4;
4907 0 : mask += 4;
4908 0 : w -= 4;
4909 : }
4910 :
4911 0 : while (w)
4912 : {
4913 0 : m = (uint32_t) *mask++;
4914 :
4915 0 : if (m)
4916 : {
4917 0 : s = 0xff000000 | *src;
4918 :
4919 0 : if (m == 0xff)
4920 : {
4921 0 : *dst = s;
4922 : }
4923 : else
4924 : {
4925 : __m128i ma, md, ms;
4926 :
4927 0 : d = *dst;
4928 :
4929 0 : ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4930 0 : md = unpack_32_1x128 (d);
4931 0 : ms = unpack_32_1x128 (s);
4932 :
4933 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4934 : }
4935 :
4936 : }
4937 :
4938 0 : src++;
4939 0 : dst++;
4940 0 : w--;
4941 : }
4942 : }
4943 :
4944 0 : }
4945 :
4946 : static void
4947 0 : sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4948 : pixman_composite_info_t *info)
4949 : {
4950 0 : PIXMAN_COMPOSITE_ARGS (info);
4951 : uint32_t *src, *src_line, s;
4952 : uint32_t *dst, *dst_line, d;
4953 : uint8_t *mask, *mask_line;
4954 : uint32_t m;
4955 : int src_stride, mask_stride, dst_stride;
4956 : int32_t w;
4957 :
4958 : __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4959 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4960 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4961 :
4962 0 : PIXMAN_IMAGE_GET_LINE (
4963 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4964 0 : PIXMAN_IMAGE_GET_LINE (
4965 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4966 0 : PIXMAN_IMAGE_GET_LINE (
4967 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4968 :
4969 0 : while (height--)
4970 : {
4971 0 : src = src_line;
4972 0 : src_line += src_stride;
4973 0 : dst = dst_line;
4974 0 : dst_line += dst_stride;
4975 0 : mask = mask_line;
4976 0 : mask_line += mask_stride;
4977 :
4978 0 : w = width;
4979 :
4980 0 : while (w && (uintptr_t)dst & 15)
4981 : {
4982 : uint32_t sa;
4983 :
4984 0 : s = *src++;
4985 0 : m = (uint32_t) *mask++;
4986 0 : d = *dst;
4987 :
4988 0 : sa = s >> 24;
4989 :
4990 0 : if (m)
4991 : {
4992 0 : if (sa == 0xff && m == 0xff)
4993 : {
4994 0 : *dst = s;
4995 : }
4996 : else
4997 : {
4998 : __m128i ms, md, ma, msa;
4999 :
5000 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5001 0 : ms = unpack_32_1x128 (s);
5002 0 : md = unpack_32_1x128 (d);
5003 :
5004 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5005 :
5006 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5007 : }
5008 : }
5009 :
5010 0 : dst++;
5011 0 : w--;
5012 : }
5013 :
5014 0 : while (w >= 4)
5015 : {
5016 0 : m = *(uint32_t *) mask;
5017 :
5018 0 : if (m)
5019 : {
5020 0 : xmm_src = load_128_unaligned ((__m128i*)src);
5021 :
5022 0 : if (m == 0xffffffff && is_opaque (xmm_src))
5023 : {
5024 0 : save_128_aligned ((__m128i *)dst, xmm_src);
5025 : }
5026 : else
5027 : {
5028 0 : xmm_dst = load_128_aligned ((__m128i *)dst);
5029 :
5030 0 : xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5031 :
5032 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5033 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5034 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5035 :
5036 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5037 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5038 :
5039 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5040 : &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5041 :
5042 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5043 : }
5044 : }
5045 :
5046 0 : src += 4;
5047 0 : dst += 4;
5048 0 : mask += 4;
5049 0 : w -= 4;
5050 : }
5051 :
5052 0 : while (w)
5053 : {
5054 : uint32_t sa;
5055 :
5056 0 : s = *src++;
5057 0 : m = (uint32_t) *mask++;
5058 0 : d = *dst;
5059 :
5060 0 : sa = s >> 24;
5061 :
5062 0 : if (m)
5063 : {
5064 0 : if (sa == 0xff && m == 0xff)
5065 : {
5066 0 : *dst = s;
5067 : }
5068 : else
5069 : {
5070 : __m128i ms, md, ma, msa;
5071 :
5072 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5073 0 : ms = unpack_32_1x128 (s);
5074 0 : md = unpack_32_1x128 (d);
5075 :
5076 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5077 :
5078 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5079 : }
5080 : }
5081 :
5082 0 : dst++;
5083 0 : w--;
5084 : }
5085 : }
5086 :
5087 0 : }
5088 :
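     : /* The in_over_1x128 / in_over_2x128 helpers used above fuse
     :  * (src IN mask) OVER dest.  Per channel, in scalar terms (a sketch
     :  * built on the illustrative mul_un8_sketch helper from earlier, not
     :  * pixman API; premultiplied components, s <= sa, keep the final sum
     :  * from overflowing):
     :  */
     : static force_inline uint8_t
     : in_over_un8_sketch (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
     : {
     :     uint8_t src_in_mask   = mul_un8_sketch (s, m);
     :     uint8_t alpha_in_mask = mul_un8_sketch (sa, m);
     :
     :     return (uint8_t)(src_in_mask +
     :                      mul_un8_sketch (d, (uint8_t)(255 - alpha_in_mask)));
     : }
     :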
5089 : static void
5090 0 : sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5091 : pixman_composite_info_t *info)
5092 : {
5093 0 : PIXMAN_COMPOSITE_ARGS (info);
5094 : uint32_t src;
5095 : uint32_t *dst_line, *dst;
5096 : __m128i xmm_src;
5097 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5098 : __m128i xmm_dsta_hi, xmm_dsta_lo;
5099 : int dst_stride;
5100 : int32_t w;
5101 :
5102 0 : src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5103 :
5104 0 : if (src == 0)
5105 0 : return;
5106 :
5107 0 : PIXMAN_IMAGE_GET_LINE (
5108 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5109 :
5110 0 : xmm_src = expand_pixel_32_1x128 (src);
5111 :
5112 0 : while (height--)
5113 : {
5114 0 : dst = dst_line;
5115 :
5116 0 : dst_line += dst_stride;
5117 0 : w = width;
5118 :
5119 0 : while (w && (uintptr_t)dst & 15)
5120 : {
5121 : __m128i vd;
5122 :
5123 0 : vd = unpack_32_1x128 (*dst);
5124 :
5125 0 : *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5126 : xmm_src));
5127 0 : w--;
5128 0 : dst++;
5129 : }
5130 :
5131 0 : while (w >= 4)
5132 : {
5133 : __m128i tmp_lo, tmp_hi;
5134 :
5135 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
5136 :
5137 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5138 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5139 :
5140 0 : tmp_lo = xmm_src;
5141 0 : tmp_hi = xmm_src;
5142 :
5143 : over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5144 : &xmm_dsta_lo, &xmm_dsta_hi,
5145 : &tmp_lo, &tmp_hi);
5146 :
5147 0 : save_128_aligned (
5148 : (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5149 :
5150 0 : w -= 4;
5151 0 : dst += 4;
5152 : }
5153 :
5154 0 : while (w)
5155 : {
5156 : __m128i vd;
5157 :
5158 0 : vd = unpack_32_1x128 (*dst);
5159 :
5160 0 : *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5161 : xmm_src));
5162 0 : w--;
5163 0 : dst++;
5164 : }
5165 :
5166 : }
5167 :
5168 : }
5169 :
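     : /* OVER_REVERSE keeps the destination on top: the loops above compute
     :  * dest OVER source, i.e. d + s * (255 - dest_alpha) / 255 per channel,
     :  * which is why the alpha being expanded is the destination's rather
     :  * than the source's. */
     :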
5170 : static void
5171 0 : sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5172 : pixman_composite_info_t *info)
5173 : {
5174 0 : PIXMAN_COMPOSITE_ARGS (info);
5175 : uint32_t *src, *src_line, s;
5176 : uint32_t *dst, *dst_line, d;
5177 : uint32_t *mask, *mask_line;
5178 : uint32_t m;
5179 : int src_stride, mask_stride, dst_stride;
5180 : int32_t w;
5181 :
5182 : __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5183 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5184 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5185 :
5186 0 : PIXMAN_IMAGE_GET_LINE (
5187 : dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5188 0 : PIXMAN_IMAGE_GET_LINE (
5189 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5190 0 : PIXMAN_IMAGE_GET_LINE (
5191 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5192 :
5193 0 : while (height--)
5194 : {
5195 0 : src = src_line;
5196 0 : src_line += src_stride;
5197 0 : dst = dst_line;
5198 0 : dst_line += dst_stride;
5199 0 : mask = mask_line;
5200 0 : mask_line += mask_stride;
5201 :
5202 0 : w = width;
5203 :
5204 0 : while (w && (uintptr_t)dst & 15)
5205 : {
5206 : uint32_t sa;
5207 :
5208 0 : s = *src++;
5209 0 : m = (*mask++) >> 24;
5210 0 : d = *dst;
5211 :
5212 0 : sa = s >> 24;
5213 :
5214 0 : if (m)
5215 : {
5216 0 : if (sa == 0xff && m == 0xff)
5217 : {
5218 0 : *dst = s;
5219 : }
5220 : else
5221 : {
5222 : __m128i ms, md, ma, msa;
5223 :
5224 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5225 0 : ms = unpack_32_1x128 (s);
5226 0 : md = unpack_32_1x128 (d);
5227 :
5228 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5229 :
5230 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5231 : }
5232 : }
5233 :
5234 0 : dst++;
5235 0 : w--;
5236 : }
5237 :
5238 0 : while (w >= 4)
5239 : {
5240 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
5241 :
5242 0 : if (!is_transparent (xmm_mask))
5243 : {
5244 0 : xmm_src = load_128_unaligned ((__m128i*)src);
5245 :
5246 0 : if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5247 : {
5248 0 : save_128_aligned ((__m128i *)dst, xmm_src);
5249 : }
5250 : else
5251 : {
5252 0 : xmm_dst = load_128_aligned ((__m128i *)dst);
5253 :
5254 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5255 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5256 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5257 :
5258 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5259 0 : expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5260 :
5261 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5262 : &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5263 :
5264 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5265 : }
5266 : }
5267 :
5268 0 : src += 4;
5269 0 : dst += 4;
5270 0 : mask += 4;
5271 0 : w -= 4;
5272 : }
5273 :
5274 0 : while (w)
5275 : {
5276 : uint32_t sa;
5277 :
5278 0 : s = *src++;
5279 0 : m = (*mask++) >> 24;
5280 0 : d = *dst;
5281 :
5282 0 : sa = s >> 24;
5283 :
5284 0 : if (m)
5285 : {
5286 0 : if (sa == 0xff && m == 0xff)
5287 : {
5288 0 : *dst = s;
5289 : }
5290 : else
5291 : {
5292 : __m128i ms, md, ma, msa;
5293 :
5294 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5295 0 : ms = unpack_32_1x128 (s);
5296 0 : md = unpack_32_1x128 (d);
5297 :
5298 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5299 :
5300 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5301 : }
5302 : }
5303 :
5304 0 : dst++;
5305 0 : w--;
5306 : }
5307 : }
5308 :
5309 0 : }
5310 :
5311 : /* A variant of 'sse2_combine_over_u' with minor tweaks */
5312 : static force_inline void
5313 : scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5314 : const uint32_t* ps,
5315 : int32_t w,
5316 : pixman_fixed_t vx,
5317 : pixman_fixed_t unit_x,
5318 : pixman_fixed_t src_width_fixed,
5319 : pixman_bool_t fully_transparent_src)
5320 : {
5321 : uint32_t s, d;
5322 0 : const uint32_t* pm = NULL;
5323 :
5324 : __m128i xmm_dst_lo, xmm_dst_hi;
5325 : __m128i xmm_src_lo, xmm_src_hi;
5326 : __m128i xmm_alpha_lo, xmm_alpha_hi;
5327 :
5328 0 : if (fully_transparent_src)
5329 0 : return;
5330 :
5331 : /* Align dst on a 16-byte boundary */
5332 0 : while (w && ((uintptr_t)pd & 15))
5333 : {
5334 0 : d = *pd;
5335 0 : s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5336 0 : vx += unit_x;
5337 0 : while (vx >= 0)
5338 0 : vx -= src_width_fixed;
5339 :
5340 0 : *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5341 0 : if (pm)
5342 0 : pm++;
5343 0 : w--;
5344 : }
5345 :
5346 0 : while (w >= 4)
5347 : {
5348 : __m128i tmp;
5349 : uint32_t tmp1, tmp2, tmp3, tmp4;
5350 :
5351 0 : tmp1 = *(ps + pixman_fixed_to_int (vx));
5352 0 : vx += unit_x;
5353 0 : while (vx >= 0)
5354 0 : vx -= src_width_fixed;
5355 0 : tmp2 = *(ps + pixman_fixed_to_int (vx));
5356 0 : vx += unit_x;
5357 0 : while (vx >= 0)
5358 0 : vx -= src_width_fixed;
5359 0 : tmp3 = *(ps + pixman_fixed_to_int (vx));
5360 0 : vx += unit_x;
5361 0 : while (vx >= 0)
5362 0 : vx -= src_width_fixed;
5363 0 : tmp4 = *(ps + pixman_fixed_to_int (vx));
5364 0 : vx += unit_x;
5365 0 : while (vx >= 0)
5366 0 : vx -= src_width_fixed;
5367 :
5368 0 : tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5369 :
5370 0 : xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5371 :
5372 0 : if (is_opaque (xmm_src_hi))
5373 : {
5374 0 : save_128_aligned ((__m128i*)pd, xmm_src_hi);
5375 : }
5376 0 : else if (!is_zero (xmm_src_hi))
5377 : {
5378 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5379 :
5380 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5381 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5382 :
5383 0 : expand_alpha_2x128 (
5384 : xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5385 :
5386 : over_2x128 (&xmm_src_lo, &xmm_src_hi,
5387 : &xmm_alpha_lo, &xmm_alpha_hi,
5388 : &xmm_dst_lo, &xmm_dst_hi);
5389 :
5390 : /* rebuild the 4 pixel data and save */
5391 0 : save_128_aligned ((__m128i*)pd,
5392 : pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5393 : }
5394 :
5395 0 : w -= 4;
5396 0 : pd += 4;
5397 0 : if (pm)
5398 0 : pm += 4;
5399 : }
5400 :
5401 0 : while (w)
5402 : {
5403 0 : d = *pd;
5404 0 : s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5405 0 : vx += unit_x;
5406 0 : while (vx >= 0)
5407 0 : vx -= src_width_fixed;
5408 :
5409 0 : *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5410 0 : if (pm)
5411 0 : pm++;
5412 :
5413 0 : w--;
5414 : }
5415 : }
5416 :
5417 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5418 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5419 : uint32_t, uint32_t, COVER)
5420 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5421 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5422 : uint32_t, uint32_t, NONE)
5423 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5424 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 : uint32_t, uint32_t, PAD)
5426 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5427 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 : uint32_t, uint32_t, NORMAL)
5429 :
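     : /* The nearest scanlines above walk the source in 16.16 fixed point:
     :  * vx is the source x coordinate, unit_x the per-destination-pixel
     :  * step, and the subtraction loop wraps vx for NORMAL repeat (the
     :  * mainloop in pixman-inlines.h biases ps and vx so the wrapped,
     :  * negative coordinate still indexes a valid pixel).  A scalar sketch
     :  * of the addressing (illustrative only):
     :  */
     : static force_inline uint32_t
     : fetch_nearest_sketch (const uint32_t *ps,
     :                       pixman_fixed_t *vx,
     :                       pixman_fixed_t  unit_x,
     :                       pixman_fixed_t  src_width_fixed)
     : {
     :     uint32_t s = *(ps + pixman_fixed_to_int (*vx));
     :
     :     *vx += unit_x;
     :     while (*vx >= 0)          /* wrap for NORMAL repeat */
     :         *vx -= src_width_fixed;
     :
     :     return s;
     : }
     :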
5430 : static force_inline void
5431 : scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5432 : uint32_t * dst,
5433 : const uint32_t * src,
5434 : int32_t w,
5435 : pixman_fixed_t vx,
5436 : pixman_fixed_t unit_x,
5437 : pixman_fixed_t src_width_fixed,
5438 : pixman_bool_t zero_src)
5439 : {
5440 : __m128i xmm_mask;
5441 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5442 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5443 : __m128i xmm_alpha_lo, xmm_alpha_hi;
5444 :
5445 0 : if (zero_src || (*mask >> 24) == 0)
5446 0 : return;
5447 :
5448 0 : xmm_mask = create_mask_16_128 (*mask >> 24);
5449 :
5450 0 : while (w && (uintptr_t)dst & 15)
5451 : {
5452 0 : uint32_t s = *(src + pixman_fixed_to_int (vx));
5453 0 : vx += unit_x;
5454 0 : while (vx >= 0)
5455 0 : vx -= src_width_fixed;
5456 :
5457 0 : if (s)
5458 : {
5459 0 : uint32_t d = *dst;
5460 :
5461 0 : __m128i ms = unpack_32_1x128 (s);
5462 0 : __m128i alpha = expand_alpha_1x128 (ms);
5463 0 : __m128i dest = xmm_mask;
5464 0 : __m128i alpha_dst = unpack_32_1x128 (d);
5465 :
5466 0 : *dst = pack_1x128_32 (
5467 : in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5468 : }
5469 0 : dst++;
5470 0 : w--;
5471 : }
5472 :
5473 0 : while (w >= 4)
5474 : {
5475 : uint32_t tmp1, tmp2, tmp3, tmp4;
5476 :
5477 0 : tmp1 = *(src + pixman_fixed_to_int (vx));
5478 0 : vx += unit_x;
5479 0 : while (vx >= 0)
5480 0 : vx -= src_width_fixed;
5481 0 : tmp2 = *(src + pixman_fixed_to_int (vx));
5482 0 : vx += unit_x;
5483 0 : while (vx >= 0)
5484 0 : vx -= src_width_fixed;
5485 0 : tmp3 = *(src + pixman_fixed_to_int (vx));
5486 0 : vx += unit_x;
5487 0 : while (vx >= 0)
5488 0 : vx -= src_width_fixed;
5489 0 : tmp4 = *(src + pixman_fixed_to_int (vx));
5490 0 : vx += unit_x;
5491 0 : while (vx >= 0)
5492 0 : vx -= src_width_fixed;
5493 :
5494 0 : xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5495 :
5496 0 : if (!is_zero (xmm_src))
5497 : {
5498 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
5499 :
5500 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5501 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5502 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5503 : &xmm_alpha_lo, &xmm_alpha_hi);
5504 :
5505 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5506 : &xmm_alpha_lo, &xmm_alpha_hi,
5507 : &xmm_mask, &xmm_mask,
5508 : &xmm_dst_lo, &xmm_dst_hi);
5509 :
5510 0 : save_128_aligned (
5511 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5512 : }
5513 :
5514 0 : dst += 4;
5515 0 : w -= 4;
5516 : }
5517 :
5518 0 : while (w)
5519 : {
5520 0 : uint32_t s = *(src + pixman_fixed_to_int (vx));
5521 0 : vx += unit_x;
5522 0 : while (vx >= 0)
5523 0 : vx -= src_width_fixed;
5524 :
5525 0 : if (s)
5526 : {
5527 0 : uint32_t d = *dst;
5528 :
5529 0 : __m128i ms = unpack_32_1x128 (s);
5530 0 : __m128i alpha = expand_alpha_1x128 (ms);
5531 0 : __m128i mask = xmm_mask;
5532 0 : __m128i dest = unpack_32_1x128 (d);
5533 :
5534 0 : *dst = pack_1x128_32 (
5535 : in_over_1x128 (&ms, &alpha, &mask, &dest));
5536 : }
5537 :
5538 0 : dst++;
5539 0 : w--;
5540 : }
5541 :
5542 : }
5543 :
5544 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5545 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5546 : uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5547 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5548 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549 : uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5550 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5551 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 : uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5553 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5554 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 : uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5556 :
5557 : #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
5558 :
5559 : #define BILINEAR_DECLARE_VARIABLES \
5560 : const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5561 : const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5562 : const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
5563 : const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
5564 : const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
5565 : const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5566 : const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
5567 : unit_x, unit_x, unit_x, unit_x); \
5568 : const __m128i xmm_zero = _mm_setzero_si128 (); \
5569 : __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
5570 :
5571 : #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5572 : do { \
5573 : __m128i xmm_wh, xmm_lo, xmm_hi, a; \
5574 : /* fetch 2x2 pixel block into SSE2 registers */ \
5575 : __m128i tltr = _mm_loadl_epi64 ( \
5576 : (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
5577 : __m128i blbr = _mm_loadl_epi64 ( \
5578 : (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \
5579 : vx += unit_x; \
5580 : /* vertical interpolation */ \
5581 : a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \
5582 : xmm_wt), \
5583 : _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
5584 : xmm_wb)); \
5585 : if (BILINEAR_INTERPOLATION_BITS < 8) \
5586 : { \
5587 : /* calculate horizontal weights */ \
5588 : xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \
5589 : _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
5590 : xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5591 : /* horizontal interpolation */ \
5592 : a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5593 : a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
5594 : } \
5595 : else \
5596 : { \
5597 : /* calculate horizontal weights */ \
5598 : xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \
5599 : _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
5600 : xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5601 : /* horizontal interpolation */ \
5602 : xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
5603 : xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
5604 : a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
5605 : _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
5606 : } \
5607 : /* shift and pack the result */ \
5608 : a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
5609 : a = _mm_packs_epi32 (a, a); \
5610 : a = _mm_packus_epi16 (a, a); \
5611 : pix = _mm_cvtsi128_si32 (a); \
5612 : } while (0)
5613 :
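     : /* BILINEAR_INTERPOLATE_ONE_PIXEL mixes the 2x2 block vertically with
     :  * wt/wb first and then horizontally with weights derived from the
     :  * fractional bits of vx; both weight pairs sum to
     :  * 1 << BILINEAR_INTERPOLATION_BITS.  The equivalent separable math
     :  * for a single channel, as a scalar sketch (illustrative only):
     :  */
     : static force_inline uint32_t
     : bilinear_channel_sketch (uint32_t tl, uint32_t tr,
     :                          uint32_t bl, uint32_t br,
     :                          uint32_t wt, uint32_t wb,
     :                          uint32_t wl, uint32_t wr)
     : {
     :     uint32_t top    = tl * wl + tr * wr;   /* horizontal, top row    */
     :     uint32_t bottom = bl * wl + br * wr;   /* horizontal, bottom row */
     :
     :     return (top * wt + bottom * wb) >> (2 * BILINEAR_INTERPOLATION_BITS);
     : }
     :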
5614 : #define BILINEAR_SKIP_ONE_PIXEL() \
5615 : do { \
5616 : vx += unit_x; \
5617 : xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5618 : } while(0)
5619 :
5620 : static force_inline void
5621 : scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5622 : const uint32_t * mask,
5623 : const uint32_t * src_top,
5624 : const uint32_t * src_bottom,
5625 : int32_t w,
5626 : int wt,
5627 : int wb,
5628 : pixman_fixed_t vx,
5629 : pixman_fixed_t unit_x,
5630 : pixman_fixed_t max_vx,
5631 : pixman_bool_t zero_src)
5632 : {
5633 0 : BILINEAR_DECLARE_VARIABLES;
5634 : uint32_t pix1, pix2, pix3, pix4;
5635 :
5636 0 : while ((w -= 4) >= 0)
5637 : {
5638 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5639 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5640 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5641 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5642 0 : *dst++ = pix1;
5643 0 : *dst++ = pix2;
5644 0 : *dst++ = pix3;
5645 0 : *dst++ = pix4;
5646 : }
5647 :
5648 0 : if (w & 2)
5649 : {
5650 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5651 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5652 0 : *dst++ = pix1;
5653 0 : *dst++ = pix2;
5654 : }
5655 :
5656 0 : if (w & 1)
5657 : {
5658 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5659 0 : *dst = pix1;
5660 : }
5661 :
5662 : }
5663 :
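     : /* Note the loop structure above: once `while ((w -= 4) >= 0)` exits,
     :  * w is in [-4, -1] and w + 4 pixels remain; since w and w + 4 agree
     :  * modulo 4, the `w & 2` and `w & 1` tests still pick out exactly the
     :  * 0-3 leftover pixels without restoring w. */
     :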
5664 : /* Pass an extra NULL argument to the existing bilinear fast paths to indicate
5665 : * that they don't need two-pass (separate fetch and combine) processing */
5666 :
5667 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5668 : scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5669 : uint32_t, uint32_t, uint32_t,
5670 : COVER, FLAG_NONE)
5671 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5672 : scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5673 : uint32_t, uint32_t, uint32_t,
5674 : PAD, FLAG_NONE)
5675 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5676 : scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5677 : uint32_t, uint32_t, uint32_t,
5678 : NONE, FLAG_NONE)
5679 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5680 : scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5681 : uint32_t, uint32_t, uint32_t,
5682 : NORMAL, FLAG_NONE)
5683 :
5684 : static force_inline void
5685 : scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst,
5686 : const uint32_t * mask,
5687 : const uint32_t * src_top,
5688 : const uint32_t * src_bottom,
5689 : int32_t w,
5690 : int wt,
5691 : int wb,
5692 : pixman_fixed_t vx_,
5693 : pixman_fixed_t unit_x_,
5694 : pixman_fixed_t max_vx,
5695 : pixman_bool_t zero_src)
5696 : {
5697 0 : intptr_t vx = vx_;
5698 0 : intptr_t unit_x = unit_x_;
5699 0 : BILINEAR_DECLARE_VARIABLES;
5700 : uint32_t pix1, pix2, pix3, pix4;
5701 :
5702 0 : while (w && ((uintptr_t)dst & 15))
5703 : {
5704 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5705 0 : *dst++ = pix1 | 0xFF000000;
5706 0 : w--;
5707 : }
5708 :
5709 0 : while ((w -= 4) >= 0) {
5710 : __m128i xmm_src;
5711 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5712 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5713 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5714 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5715 :
5716 0 : xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5717 0 : _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5718 0 : dst += 4;
5719 : }
5720 :
5721 0 : if (w & 2)
5722 : {
5723 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5724 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5725 0 : *dst++ = pix1 | 0xFF000000;
5726 0 : *dst++ = pix2 | 0xFF000000;
5727 : }
5728 :
5729 0 : if (w & 1)
5730 : {
5731 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5732 0 : *dst = pix1 | 0xFF000000;
5733 : }
5734 : }
5735 :
5736 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5737 : scaled_bilinear_scanline_sse2_x888_8888_SRC, NULL,
5738 : uint32_t, uint32_t, uint32_t,
5739 : COVER, FLAG_NONE)
5740 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5741 : scaled_bilinear_scanline_sse2_x888_8888_SRC, NULL,
5742 : uint32_t, uint32_t, uint32_t,
5743 : PAD, FLAG_NONE)
5744 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5745 : scaled_bilinear_scanline_sse2_x888_8888_SRC, NULL,
5746 : uint32_t, uint32_t, uint32_t,
5747 : NORMAL, FLAG_NONE)
5748 : #if 0
5749 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5750 : scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5751 : uint32_t, uint32_t, uint32_t,
5752 : PAD, FLAG_NONE)
5753 : #endif
5754 : static force_inline void
5755 : scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5756 : const uint32_t * mask,
5757 : const uint32_t * src_top,
5758 : const uint32_t * src_bottom,
5759 : int32_t w,
5760 : int wt,
5761 : int wb,
5762 : pixman_fixed_t vx,
5763 : pixman_fixed_t unit_x,
5764 : pixman_fixed_t max_vx,
5765 : pixman_bool_t zero_src)
5766 : {
5767 0 : BILINEAR_DECLARE_VARIABLES;
5768 : uint32_t pix1, pix2, pix3, pix4;
5769 :
5770 0 : while (w && ((uintptr_t)dst & 15))
5771 : {
5772 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5773 :
5774 0 : if (pix1)
5775 : {
5776 0 : pix2 = *dst;
5777 0 : *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5778 : }
5779 :
5780 0 : w--;
5781 0 : dst++;
5782 : }
5783 :
5784 0 : while (w >= 4)
5785 : {
5786 : __m128i xmm_src;
5787 : __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5788 : __m128i xmm_alpha_hi, xmm_alpha_lo;
5789 :
5790 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5791 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5792 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5793 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5794 :
5795 0 : xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5796 :
5797 0 : if (!is_zero (xmm_src))
5798 : {
5799 0 : if (is_opaque (xmm_src))
5800 : {
5801 : save_128_aligned ((__m128i *)dst, xmm_src);
5802 : }
5803 : else
5804 : {
5805 0 : __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5806 :
5807 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5808 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5809 :
5810 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5811 : over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5812 : &xmm_dst_lo, &xmm_dst_hi);
5813 :
5814 0 : save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5815 : }
5816 : }
5817 :
5818 0 : w -= 4;
5819 0 : dst += 4;
5820 : }
5821 :
5822 0 : while (w)
5823 : {
5824 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5825 :
5826 0 : if (pix1)
5827 : {
5828 0 : pix2 = *dst;
5829 0 : *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5830 : }
5831 :
5832 0 : w--;
5833 0 : dst++;
5834 : }
5835 : }
5836 :
5837 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5838 : scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5839 : uint32_t, uint32_t, uint32_t,
5840 : COVER, FLAG_NONE)
5841 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5842 : scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5843 : uint32_t, uint32_t, uint32_t,
5844 : PAD, FLAG_NONE)
5845 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5846 : scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5847 : uint32_t, uint32_t, uint32_t,
5848 : NONE, FLAG_NONE)
5849 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5850 : scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5851 : uint32_t, uint32_t, uint32_t,
5852 : NORMAL, FLAG_NONE)
5853 :
5854 :
5855 : /* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, implemented
5856 : * as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
5857 :
5858 0 : static void op_bilinear_over_8888_0565 (uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
5859 : {
5860 : /* Note: this is not really fast and should reuse the 8-pixel loop from sse2_composite_over_8888_0565 */
5861 0 : while (--width >= 0)
5862 : {
5863 0 : *dst = composite_over_8888_0565pixel (*src, *dst);
5864 0 : src++;
5865 0 : dst++;
5866 : }
5867 0 : }
5868 :
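     : /* In the two-stage paths below, FAST_BILINEAR_MAINLOOP_COMMON first
     :  * runs scaled_bilinear_scanline_sse2_8888_8888_SRC to scale a
     :  * scanline into a temporary a8r8g8b8 buffer, then hands that buffer
     :  * to op_bilinear_over_8888_0565 to composite onto the r5g6b5
     :  * destination. */
     :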
5869 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
5870 : scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5871 : uint32_t, uint32_t, uint16_t,
5872 : COVER, FLAG_NONE)
5873 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
5874 : scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5875 : uint32_t, uint32_t, uint16_t,
5876 : PAD, FLAG_NONE)
5877 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
5878 : scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5879 : uint32_t, uint32_t, uint16_t,
5880 : NONE, FLAG_NONE)
5881 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
5882 : scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5883 : uint32_t, uint32_t, uint16_t,
5884 : NORMAL, FLAG_NONE)
5885 :
5886 : /*****************************/
5887 :
5888 : static force_inline void
5889 : scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5890 : const uint8_t * mask,
5891 : const uint32_t * src_top,
5892 : const uint32_t * src_bottom,
5893 : int32_t w,
5894 : int wt,
5895 : int wb,
5896 : pixman_fixed_t vx,
5897 : pixman_fixed_t unit_x,
5898 : pixman_fixed_t max_vx,
5899 : pixman_bool_t zero_src)
5900 : {
5901 0 : BILINEAR_DECLARE_VARIABLES;
5902 : uint32_t pix1, pix2, pix3, pix4;
5903 : uint32_t m;
5904 :
5905 0 : while (w && ((uintptr_t)dst & 15))
5906 : {
5907 : uint32_t sa;
5908 :
5909 0 : m = (uint32_t) *mask++;
5910 :
5911 0 : if (m)
5912 : {
5913 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5914 0 : sa = pix1 >> 24;
5915 :
5916 0 : if (sa == 0xff && m == 0xff)
5917 : {
5918 0 : *dst = pix1;
5919 : }
5920 : else
5921 : {
5922 : __m128i ms, md, ma, msa;
5923 :
5924 0 : pix2 = *dst;
5925 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5926 0 : ms = unpack_32_1x128 (pix1);
5927 0 : md = unpack_32_1x128 (pix2);
5928 :
5929 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5930 :
5931 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5932 : }
5933 : }
5934 : else
5935 : {
5936 0 : BILINEAR_SKIP_ONE_PIXEL ();
5937 : }
5938 :
5939 0 : w--;
5940 0 : dst++;
5941 : }
5942 :
5943 0 : while (w >= 4)
5944 : {
5945 : __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5946 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5947 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5948 :
5949 0 : m = *(uint32_t*)mask;
5950 :
5951 0 : if (m)
5952 : {
5953 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5954 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5955 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5956 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5957 :
5958 0 : xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5959 :
5960 0 : if (m == 0xffffffff && is_opaque (xmm_src))
5961 : {
5962 : save_128_aligned ((__m128i *)dst, xmm_src);
5963 : }
5964 : else
5965 : {
5966 0 : xmm_dst = load_128_aligned ((__m128i *)dst);
5967 :
5968 0 : xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5969 :
5970 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5971 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5972 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5973 :
5974 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5975 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5976 :
5977 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5978 : &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5979 :
5980 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5981 : }
5982 : }
5983 : else
5984 : {
5985 0 : BILINEAR_SKIP_ONE_PIXEL ();
5986 0 : BILINEAR_SKIP_ONE_PIXEL ();
5987 0 : BILINEAR_SKIP_ONE_PIXEL ();
5988 0 : BILINEAR_SKIP_ONE_PIXEL ();
5989 : }
5990 :
5991 0 : w -= 4;
5992 0 : dst += 4;
5993 0 : mask += 4;
5994 : }
5995 :
5996 0 : while (w)
5997 : {
5998 : uint32_t sa;
5999 :
6000 0 : m = (uint32_t) *mask++;
6001 :
6002 0 : if (m)
6003 : {
6004 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6005 0 : sa = pix1 >> 24;
6006 :
6007 0 : if (sa == 0xff && m == 0xff)
6008 : {
6009 0 : *dst = pix1;
6010 : }
6011 : else
6012 : {
6013 : __m128i ms, md, ma, msa;
6014 :
6015 0 : pix2 = *dst;
6016 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6017 0 : ms = unpack_32_1x128 (pix1);
6018 0 : md = unpack_32_1x128 (pix2);
6019 :
6020 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6021 :
6022 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6023 : }
6024 : }
6025 : else
6026 : {
6027 0 : BILINEAR_SKIP_ONE_PIXEL ();
6028 : }
6029 :
6030 0 : w--;
6031 0 : dst++;
6032 : }
6033 : }
6034 :
6035 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6036 : scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
6037 : uint32_t, uint8_t, uint32_t,
6038 : COVER, FLAG_HAVE_NON_SOLID_MASK)
6039 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6040 : scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
6041 : uint32_t, uint8_t, uint32_t,
6042 : PAD, FLAG_HAVE_NON_SOLID_MASK)
6043 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6044 : scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
6045 : uint32_t, uint8_t, uint32_t,
6046 : NONE, FLAG_HAVE_NON_SOLID_MASK)
6047 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6048 : scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
6049 : uint32_t, uint8_t, uint32_t,
6050 : NORMAL, FLAG_HAVE_NON_SOLID_MASK)
6051 :
6052 : static force_inline void
6053 : scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
6054 : const uint32_t * mask,
6055 : const uint32_t * src_top,
6056 : const uint32_t * src_bottom,
6057 : int32_t w,
6058 : int wt,
6059 : int wb,
6060 : pixman_fixed_t vx,
6061 : pixman_fixed_t unit_x,
6062 : pixman_fixed_t max_vx,
6063 : pixman_bool_t zero_src)
6064 : {
6065 0 : BILINEAR_DECLARE_VARIABLES;
6066 : uint32_t pix1, pix2, pix3, pix4;
6067 : __m128i xmm_mask;
6068 :
6069 0 : if (zero_src || (*mask >> 24) == 0)
6070 0 : return;
6071 :
6072 0 : xmm_mask = create_mask_16_128 (*mask >> 24);
6073 :
6074 0 : while (w && ((uintptr_t)dst & 15))
6075 : {
6076 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6077 0 : if (pix1)
6078 : {
6079 0 : uint32_t d = *dst;
6080 :
6081 0 : __m128i ms = unpack_32_1x128 (pix1);
6082 0 : __m128i alpha = expand_alpha_1x128 (ms);
6083 0 : __m128i dest = xmm_mask;
6084 0 : __m128i alpha_dst = unpack_32_1x128 (d);
6085 :
6086 0 : *dst = pack_1x128_32
6087 : (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6088 : }
6089 :
6090 0 : dst++;
6091 0 : w--;
6092 : }
6093 :
6094 0 : while (w >= 4)
6095 : {
6096 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6097 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
6098 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
6099 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
6100 :
6101 0 : if (pix1 | pix2 | pix3 | pix4)
6102 : {
6103 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
6104 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6105 : __m128i xmm_alpha_lo, xmm_alpha_hi;
6106 :
6107 0 : xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
6108 :
6109 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
6110 :
6111 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6112 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6113 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6114 : &xmm_alpha_lo, &xmm_alpha_hi);
6115 :
6116 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6117 : &xmm_alpha_lo, &xmm_alpha_hi,
6118 : &xmm_mask, &xmm_mask,
6119 : &xmm_dst_lo, &xmm_dst_hi);
6120 :
6121 0 : save_128_aligned
6122 : ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6123 : }
6124 :
6125 0 : dst += 4;
6126 0 : w -= 4;
6127 : }
6128 :
6129 0 : while (w)
6130 : {
6131 0 : BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6132 0 : if (pix1)
6133 : {
6134 0 : uint32_t d = *dst;
6135 :
6136 0 : __m128i ms = unpack_32_1x128 (pix1);
6137 0 : __m128i alpha = expand_alpha_1x128 (ms);
6138 0 : __m128i dest = xmm_mask;
6139 0 : __m128i alpha_dst = unpack_32_1x128 (d);
6140 :
6141 0 : *dst = pack_1x128_32
6142 : (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6143 : }
6144 :
6145 0 : dst++;
6146 0 : w--;
6147 : }
6148 : }
6149 :
6150 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6151 : scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6152 : uint32_t, uint32_t, uint32_t,
6153 : COVER, FLAG_HAVE_SOLID_MASK)
6154 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6155 : scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6156 : uint32_t, uint32_t, uint32_t,
6157 : PAD, FLAG_HAVE_SOLID_MASK)
6158 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6159 : scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6160 : uint32_t, uint32_t, uint32_t,
6161 : NONE, FLAG_HAVE_SOLID_MASK)
6162 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6163 : scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6164 : uint32_t, uint32_t, uint32_t,
6165 : NORMAL, FLAG_HAVE_SOLID_MASK)
6166 :
6167 : static const pixman_fast_path_t sse2_fast_paths[] =
6168 : {
6169 : /* PIXMAN_OP_OVER */
6170 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6171 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6172 : PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6173 : PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6174 : PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6175 : PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6176 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6177 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6178 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6179 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6180 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6181 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6182 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6183 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6184 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6185 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6186 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6187 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6188 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6189 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6190 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6191 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6192 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6193 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6194 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6195 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6196 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6197 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6198 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6199 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6200 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6201 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6202 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6203 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6204 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6205 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6206 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6207 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6208 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6209 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6210 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6211 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6212 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6213 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6214 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6215 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6216 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6217 :
6218 : /* PIXMAN_OP_OVER_REVERSE */
6219 : PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6220 : PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6221 :
6222 : /* PIXMAN_OP_ADD */
6223 : PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6224 : PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6225 : PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6226 : PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6227 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6228 : PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6229 : PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6230 : PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6231 : PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6232 : PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6233 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6234 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6235 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6236 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6237 :
6238 : /* PIXMAN_OP_SRC */
6239 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6240 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6241 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6242 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6243 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6244 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6245 : PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6246 : PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6247 : PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6248 : PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6249 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6250 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6251 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6252 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6253 : PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6254 : PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6255 : PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6256 : PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6257 :
6258 : /* PIXMAN_OP_IN */
6259 : PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6260 : PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6261 : PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6262 :
6263 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6264 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6265 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6266 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6267 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6268 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6269 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6270 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6271 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6272 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6273 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6274 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6275 : SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6276 : SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6277 : SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6278 : SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6279 :
6280 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6281 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6282 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6283 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6284 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6285 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6286 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6287 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6288 :
6289 : SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6290 : SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6291 : SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6292 : SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6293 : SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6294 : SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6295 :
6296 : SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6297 : SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6298 : SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6299 : SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6300 : SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6301 : SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6302 :
6303 : SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6304 : SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6305 : SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6306 : SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6307 :
6308 : SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6309 : SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6310 : SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6311 : SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6312 :
6313 : SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6314 : SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6315 : SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6316 : SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6317 :
6318 :     /* bilinear OVER fast paths for 16-bit r5g6b5/b5g6r5 destinations */
6319 :
6320 : SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
6321 : SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
6322 :
6323 : { PIXMAN_OP_NONE },
6324 : };
6325 :
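Each PIXMAN_STD_FAST_PATH entry above binds an operator plus source, mask and destination formats to a specialized composite routine, and the { PIXMAN_OP_NONE } entry terminates the array. The compositing core takes the first matching entry, so more specialized variants must precede more general ones. A hedged sketch of that first-match scan (field names illustrative; the real table also carries precomputed flag requirements, omitted here):

    #include <stddef.h>

    typedef struct
    {
        int op;                           /* 0 plays the PIXMAN_OP_NONE role */
        int src_format, mask_format, dest_format;
        void (*composite) (void);
    } path_entry_t;

    static const path_entry_t *
    lookup_path (const path_entry_t *table,
                 int op, int src, int mask, int dest)
    {
        const path_entry_t *p;

        for (p = table; p->op != 0; p++)  /* sentinel-terminated scan */
        {
            if (p->op == op &&
                p->src_format == src &&
                p->mask_format == mask &&
                p->dest_format == dest)
                return p;                 /* first match wins */
        }
        return NULL;                      /* caller falls back to a general path */
    }
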
6326 : static uint32_t *
6327 0 : sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6328 : {
6329 0 : int w = iter->width;
6330 0 : __m128i ff000000 = mask_ff000000;
6331 0 : uint32_t *dst = iter->buffer;
6332 0 : uint32_t *src = (uint32_t *)iter->bits;
6333 :
6334 0 : iter->bits += iter->stride;
6335 :
6336 0 : while (w && ((uintptr_t)dst) & 0x0f)
6337 : {
6338 0 : *dst++ = (*src++) | 0xff000000;
6339 0 : w--;
6340 : }
6341 :
6342 0 : while (w >= 4)
6343 : {
6344 0 : save_128_aligned (
6345 : (__m128i *)dst, _mm_or_si128 (
6346 : load_128_unaligned ((__m128i *)src), ff000000));
6347 :
6348 0 : dst += 4;
6349 0 : src += 4;
6350 0 : w -= 4;
6351 : }
6352 :
6353 0 : while (w)
6354 : {
6355 0 : *dst++ = (*src++) | 0xff000000;
6356 0 : w--;
6357 : }
6358 :
6359 0 : return iter->buffer;
6360 : }
6361 :
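sse2_fetch_x8r8g8b8 shows the scanline shape shared by all three fetchers here: a scalar head loop runs until dst reaches a 16-byte boundary, a SIMD body then issues aligned stores several pixels at a time, and a scalar tail drains the remainder. Only the destination gets aligned this way, which is why the source is always loaded unaligned. The same shape applied to a solid fill, as a self-contained sketch (not pixman code):

    #include <stdint.h>
    #include <emmintrin.h>

    static void
    fill32 (uint32_t *dst, uint32_t value, int w)
    {
        __m128i v = _mm_set1_epi32 ((int)value);

        while (w && ((uintptr_t)dst & 15)) /* head: reach 16-byte alignment */
        {
            *dst++ = value;
            w--;
        }

        while (w >= 4)                     /* body: one aligned store per 4 px */
        {
            _mm_store_si128 ((__m128i *)dst, v);
            dst += 4;
            w -= 4;
        }

        while (w)                          /* tail: remaining pixels */
        {
            *dst++ = value;
            w--;
        }
    }
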
6362 : static uint32_t *
6363 0 : sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6364 : {
6365 0 : int w = iter->width;
6366 0 : uint32_t *dst = iter->buffer;
6367 0 : uint16_t *src = (uint16_t *)iter->bits;
6368 0 : __m128i ff000000 = mask_ff000000;
6369 :
6370 0 : iter->bits += iter->stride;
6371 :
6372 0 : while (w && ((uintptr_t)dst) & 0x0f)
6373 : {
6374 0 : uint16_t s = *src++;
6375 :
6376 0 : *dst++ = convert_0565_to_8888 (s);
6377 0 : w--;
6378 : }
6379 :
6380 0 : while (w >= 8)
6381 : {
6382 : __m128i lo, hi, s;
6383 :
6384 0 : s = _mm_loadu_si128 ((__m128i *)src);
6385 :
6386 0 : lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6387 0 : hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6388 :
6389 0 : save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6390 0 : save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6391 :
6392 0 : dst += 8;
6393 0 : src += 8;
6394 0 : w -= 8;
6395 : }
6396 :
6397 0 : while (w)
6398 : {
6399 0 : uint16_t s = *src++;
6400 :
6401 0 : *dst++ = convert_0565_to_8888 (s);
6402 0 : w--;
6403 : }
6404 :
6405 0 : return iter->buffer;
6406 : }
6407 :
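Per pixel, this fetcher widens each 5- or 6-bit channel to 8 bits by shifting left and replicating the channel's top bits into the vacated low bits, so full-scale 565 values map exactly to full-scale 8888 (0x1f becomes 0xff, 0x3f becomes 0xff); the eight-pixel SIMD body computes the same thing in parallel. A scalar sketch of the conversion, equivalent to what these loops produce:

    #include <stdint.h>

    static uint32_t
    expand_0565 (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1f;
        uint32_t g = (s >> 5)  & 0x3f;
        uint32_t b = s         & 0x1f;

        r = (r << 3) | (r >> 2);    /* 5 -> 8 bits by bit replication */
        g = (g << 2) | (g >> 4);    /* 6 -> 8 bits */
        b = (b << 3) | (b >> 2);

        return 0xff000000 | (r << 16) | (g << 8) | b;
    }
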
6408 : static uint32_t *
6409 0 : sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6410 : {
6411 0 : int w = iter->width;
6412 0 : uint32_t *dst = iter->buffer;
6413 0 : uint8_t *src = iter->bits;
6414 : __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6415 :
6416 0 : iter->bits += iter->stride;
6417 :
6418 0 : while (w && (((uintptr_t)dst) & 15))
6419 : {
6420 0 : *dst++ = *(src++) << 24;
6421 0 : w--;
6422 : }
6423 :
6424 0 : while (w >= 16)
6425 : {
6426 0 : xmm0 = _mm_loadu_si128((__m128i *)src);
6427 :
6428 0 : xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6429 0 : xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6430 0 : xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6431 0 : xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6432 0 : xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6433 0 : xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6434 :
6435 : _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6436 0 : _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6437 0 : _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6438 0 : _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6439 :
6440 0 : dst += 16;
6441 0 : src += 16;
6442 0 : w -= 16;
6443 : }
6444 :
6445 0 : while (w)
6446 : {
6447 0 : *dst++ = *(src++) << 24;
6448 0 : w--;
6449 : }
6450 :
6451 0 : return iter->buffer;
6452 : }
6453 :
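The double interleave in sse2_fetch_a8 is the vector form of *dst = a << 24: because the zero register is passed as the first operand of both unpacks, each alpha byte lands in the most significant byte of its 32-bit lane (SSE2 implies x86, so little-endian byte order holds). Following a single byte through both steps, as a standalone trace:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
        uint8_t  a  = 0x9c;
        /* unpacklo_epi8 (zero, x): byte pair (0, a) reads as 0x9c00 */
        uint16_t w  = (uint16_t)a << 8;
        /* unpacklo_epi16 (zero, w): word pair (0, w) reads as 0x9c000000 */
        uint32_t px = (uint32_t)w << 16;

        printf ("%08x\n", (unsigned) px);   /* 9c000000 == a << 24 */
        return px == ((uint32_t)a << 24) ? 0 : 1;
    }
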
6454 : typedef struct
6455 : {
6456 : pixman_format_code_t format;
6457 : pixman_iter_get_scanline_t get_scanline;
6458 : } fetcher_info_t;
6459 :
6460 : static const fetcher_info_t fetchers[] =
6461 : {
6462 : { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
6463 : { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
6464 : { PIXMAN_a8, sse2_fetch_a8 },
6465 : { PIXMAN_null }
6466 : };
6467 :
6468 : static pixman_bool_t
6469 0 : sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
6470 : {
6471 0 : pixman_image_t *image = iter->image;
6472 :
6473 : #define FLAGS \
6474 : (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
6475 : FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6476 :
6477 0 : if ((iter->iter_flags & ITER_NARROW) &&
6478 0 : (iter->image_flags & FLAGS) == FLAGS)
6479 : {
6480 : const fetcher_info_t *f;
6481 :
6482 0 : for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6483 : {
6484 0 : if (image->common.extended_format_code == f->format)
6485 : {
6486 0 : uint8_t *b = (uint8_t *)image->bits.bits;
6487 0 : int s = image->bits.rowstride * 4;
6488 :
6489 0 : iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
6490 0 : iter->stride = s;
6491 :
6492 0 : iter->get_scanline = f->get_scanline;
6493 0 : return TRUE;
6494 : }
6495 : }
6496 : }
6497 :
6498 0 : return FALSE;
6499 : }
6500 :
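sse2_src_iter_init only claims narrow (ITER_NARROW) iterators over plain, untransformed bits images whose samples cover the clip (the FLAGS test), which is what lets the fetchers walk raw scanlines. rowstride is stored in uint32_t units, hence the multiply by 4 to obtain a byte stride before computing the first scanline's address. That addressing, isolated as a sketch:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t *
    pixel_address (uint32_t *bits, int rowstride_words,
                   int x, int y, int bpp)
    {
        int stride_bytes = rowstride_words * 4;  /* words -> bytes */

        return (uint8_t *)bits
            + (size_t)stride_bytes * y           /* down y rows */
            + (size_t)x * bpp / 8;               /* across x pixels */
    }
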
6501 : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6502 : __attribute__((__force_align_arg_pointer__))
6503 : #endif
6504 : pixman_implementation_t *
6505 1 : _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6506 : {
6507 1 : pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6508 :
6509 : /* SSE2 constants */
6510 1 : mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6511 1 : mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6512 1 : mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6513 1 : mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6514 1 : mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6515 1 : mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6516 1 : mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6517 1 : mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6518 1 : mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6519 1 : mask_0080 = create_mask_16_128 (0x0080);
6520 1 : mask_00ff = create_mask_16_128 (0x00ff);
6521 1 : mask_0101 = create_mask_16_128 (0x0101);
6522 1 : mask_ffff = create_mask_16_128 (0xffff);
6523 1 : mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6524 1 : mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6525 1 : mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6526 1 : mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6527 :
6528 : /* Set up function pointers */
6529 1 : imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6530 1 : imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6531 1 : imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6532 1 : imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6533 1 : imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6534 1 : imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6535 1 : imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6536 1 : imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6537 1 : imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6538 1 : imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6539 :
6540 1 : imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6541 :
6542 1 : imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6543 1 : imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6544 1 : imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6545 1 : imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6546 1 : imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6547 1 : imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6548 1 : imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6549 1 : imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6550 1 : imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6551 1 : imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6552 1 : imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6553 :
6554 1 : imp->blt = sse2_blt;
6555 1 : imp->fill = sse2_fill;
6556 :
6557 1 : imp->src_iter_init = sse2_src_iter_init;
6558 :
6559 1 : return imp;
6560 : }
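The returned object sits in a delegation chain: operations that match an entry in sse2_fast_paths or one of the function pointers set above run here, and everything else falls through to the fallback implementation passed in. A hedged sketch of how such a chain is assembled, assuming pixman-private.h for the constructors and a hypothetical have_sse2() standing in for pixman's internal CPU detection:

    #include "pixman-private.h"

    /* hypothetical stand-in for the runtime CPU-feature check */
    static int
    have_sse2 (void)
    {
        return 1;
    }

    static pixman_implementation_t *
    choose_implementation (void)
    {
        pixman_implementation_t *imp;

        imp = _pixman_implementation_create_general ();
        imp = _pixman_implementation_create_fast_path (imp);

        if (have_sse2 ())
            imp = _pixman_implementation_create_sse2 (imp);

        return imp;   /* the outermost implementation is consulted first */
    }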