LCOV - code coverage report
Current view: top level - gfx/cairo/libpixman/src - pixman-sse2.c
Test: output.info
Date: 2017-07-14 16:53:18

                     Hit     Total    Coverage
Lines:                88      2490       3.5 %
Functions:             3        91       3.3 %

          Line data    Source code
       1             : /*
       2             :  * Copyright © 2008 Rodrigo Kumpera
       3             :  * Copyright © 2008 André Tupinambá
       4             :  *
       5             :  * Permission to use, copy, modify, distribute, and sell this software and its
       6             :  * documentation for any purpose is hereby granted without fee, provided that
       7             :  * the above copyright notice appear in all copies and that both that
       8             :  * copyright notice and this permission notice appear in supporting
       9             :  * documentation, and that the name of Red Hat not be used in advertising or
      10             :  * publicity pertaining to distribution of the software without specific,
      11             :  * written prior permission.  Red Hat makes no representations about the
      12             :  * suitability of this software for any purpose.  It is provided "as is"
      13             :  * without express or implied warranty.
      14             :  *
      15             :  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
      16             :  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      17             :  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
      18             :  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      19             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
      20             :  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
      21             :  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
      22             :  * SOFTWARE.
      23             :  *
      24             :  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
      25             :  *          André Tupinambá (andrelrt@gmail.com)
      26             :  *
      27             :  * Based on work by Owen Taylor and Søren Sandmann
      28             :  */
      29             : #ifdef HAVE_CONFIG_H
      30             : #include <config.h>
      31             : #endif
      32             : 
      33             : #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
      34             : #include <emmintrin.h> /* for SSE2 intrinsics */
      35             : #include "pixman-private.h"
      36             : #include "pixman-combine32.h"
      37             : #include "pixman-inlines.h"
      38             : 
      39             : static __m128i mask_0080;
      40             : static __m128i mask_00ff;
      41             : static __m128i mask_0101;
      42             : static __m128i mask_ffff;
      43             : static __m128i mask_ff000000;
      44             : static __m128i mask_alpha;
      45             : 
      46             : static __m128i mask_565_r;
      47             : static __m128i mask_565_g1, mask_565_g2;
      48             : static __m128i mask_565_b;
      49             : static __m128i mask_red;
      50             : static __m128i mask_green;
      51             : static __m128i mask_blue;
      52             : 
      53             : static __m128i mask_565_fix_rb;
      54             : static __m128i mask_565_fix_g;
      55             : 
      56             : static __m128i mask_565_rb;
      57             : static __m128i mask_565_pack_multiplier;
      58             : 
      59             : static force_inline __m128i
      60             : unpack_32_1x128 (uint32_t data)
      61             : {
      62           0 :     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
      63             : }
      64             : 
      65             : static force_inline void
      66             : unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
      67             : {
      68           0 :     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
      69           0 :     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
      70             : }
      71             : 
      72             : static force_inline __m128i
      73             : unpack_565_to_8888 (__m128i lo)
      74             : {
      75             :     __m128i r, g, b, rb, t;
      76             : 
      77           0 :     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
      78           0 :     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
      79           0 :     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
      80             : 
      81           0 :     rb = _mm_or_si128 (r, b);
      82           0 :     t  = _mm_and_si128 (rb, mask_565_fix_rb);
      83           0 :     t  = _mm_srli_epi32 (t, 5);
      84           0 :     rb = _mm_or_si128 (rb, t);
      85             : 
      86           0 :     t  = _mm_and_si128 (g, mask_565_fix_g);
      87           0 :     t  = _mm_srli_epi32 (t, 6);
      88           0 :     g  = _mm_or_si128 (g, t);
      89             : 
      90           0 :     return _mm_or_si128 (rb, g);
      91             : }
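
/* A rough scalar sketch of the 565 -> x888 expansion performed by
 * unpack_565_to_8888 above, added alongside the listing rather than taken
 * from pixman: each field is shifted into its 8-bit position and its top
 * bits are replicated into the freshly opened low bits, so that 0x1f
 * widens to 0xff and 0x3f widens to 0xff.
 */
static uint32_t
expand_565_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >>  5) & 0x3f;
    uint32_t b =  p        & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 bits -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 bits -> 8 bits */
    b = (b << 3) | (b >> 2);    /* 5 bits -> 8 bits */

    return (r << 16) | (g << 8) | b;    /* alpha byte left at zero */
}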
      92             : 
      93             : static force_inline void
      94             : unpack_565_128_4x128 (__m128i  data,
      95             :                       __m128i* data0,
      96             :                       __m128i* data1,
      97             :                       __m128i* data2,
      98             :                       __m128i* data3)
      99             : {
     100             :     __m128i lo, hi;
     101             : 
     102           0 :     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
     103           0 :     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
     104             : 
     105           0 :     lo = unpack_565_to_8888 (lo);
     106           0 :     hi = unpack_565_to_8888 (hi);
     107             : 
     108             :     unpack_128_2x128 (lo, data0, data1);
     109             :     unpack_128_2x128 (hi, data2, data3);
     110             : }
     111             : 
     112             : static force_inline uint16_t
     113             : pack_565_32_16 (uint32_t pixel)
     114             : {
     115           0 :     return (uint16_t) (((pixel >> 8) & 0xf800) |
     116           0 :                        ((pixel >> 5) & 0x07e0) |
     117           0 :                        ((pixel >> 3) & 0x001f));
     118             : }
     119             : 
     120             : static force_inline __m128i
     121             : pack_2x128_128 (__m128i lo, __m128i hi)
     122             : {
     123           0 :     return _mm_packus_epi16 (lo, hi);
     124             : }
     125             : 
     126             : static force_inline __m128i
     127             : pack_565_2packedx128_128 (__m128i lo, __m128i hi)
     128             : {
     129           0 :     __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
     130           0 :     __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
     131             : 
     132           0 :     __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
     133           0 :     __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
     134             : 
     135           0 :     __m128i g0 = _mm_and_si128 (lo, mask_green);
     136           0 :     __m128i g1 = _mm_and_si128 (hi, mask_green);
     137             : 
     138           0 :     t0 = _mm_or_si128 (t0, g0);
     139           0 :     t1 = _mm_or_si128 (t1, g1);
     140             : 
     141             :     /* Simulates _mm_packus_epi32 */
     142           0 :     t0 = _mm_slli_epi32 (t0, 16 - 5);
     143           0 :     t1 = _mm_slli_epi32 (t1, 16 - 5);
     144           0 :     t0 = _mm_srai_epi32 (t0, 16);
     145           0 :     t1 = _mm_srai_epi32 (t1, 16);
     146           0 :     return _mm_packs_epi32 (t0, t1);
     147             : }
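
/* A hedged SSE2-only sketch (not from the pixman source) of the generic
 * _mm_packus_epi32 substitute simulated above: when every 32-bit lane is
 * already known to fit in 16 bits, sign-extending the low halfword first
 * lets the signed pack (_mm_packs_epi32) yield the same bytes the SSE4.1
 * unsigned pack would.  pack_565_2packedx128_128 shifts by 16 - 5 instead
 * of 16, folding an extra right shift by 5 of the 565 fields into the same
 * two instructions.
 */
static __m128i
packus_epi32_sse2 (__m128i lo, __m128i hi)
{
    lo = _mm_srai_epi32 (_mm_slli_epi32 (lo, 16), 16);
    hi = _mm_srai_epi32 (_mm_slli_epi32 (hi, 16), 16);

    return _mm_packs_epi32 (lo, hi);
}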
     148             : 
     149             : static force_inline __m128i
     150             : pack_565_2x128_128 (__m128i lo, __m128i hi)
     151             : {
     152             :     __m128i data;
     153             :     __m128i r, g1, g2, b;
     154             : 
     155           0 :     data = pack_2x128_128 (lo, hi);
     156             : 
     157           0 :     r  = _mm_and_si128 (data, mask_565_r);
     158           0 :     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
     159           0 :     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
     160           0 :     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
     161             : 
     162           0 :     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
     163             : }
     164             : 
     165             : static force_inline __m128i
     166             : pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
     167             : {
     168           0 :     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
     169             :                              pack_565_2x128_128 (*xmm2, *xmm3));
     170             : }
     171             : 
     172             : static force_inline int
     173             : is_opaque (__m128i x)
     174             : {
     175           0 :     __m128i ffs = _mm_cmpeq_epi8 (x, x);
     176             : 
     177           0 :     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
     178             : }
     179             : 
     180             : static force_inline int
     181             : is_zero (__m128i x)
     182             : {
     183           0 :     return _mm_movemask_epi8 (
     184           0 :         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
     185             : }
     186             : 
     187             : static force_inline int
     188             : is_transparent (__m128i x)
     189             : {
     190           0 :     return (_mm_movemask_epi8 (
     191           0 :                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
     192             : }
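
/* Note added for clarity (the reasoning is implicit in the code above, not
 * spelled out in pixman): with four a8r8g8b8 pixels in a little-endian
 * __m128i, the alpha bytes sit at byte offsets 3, 7, 11 and 15.
 * _mm_movemask_epi8 gathers the top bit of each byte-compare result into a
 * 16-bit mask, so those alpha positions map to
 * (1 << 3) | (1 << 7) | (1 << 11) | (1 << 15) == 0x8888.  is_opaque asks
 * whether every alpha byte compared equal to 0xff, is_transparent whether
 * every alpha byte compared equal to zero, and is_zero checks all sixteen
 * bytes with the full mask 0xffff.
 */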
     193             : 
     194             : static force_inline __m128i
     195             : expand_pixel_32_1x128 (uint32_t data)
     196             : {
     197           0 :     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
     198             : }
     199             : 
     200             : static force_inline __m128i
     201             : expand_alpha_1x128 (__m128i data)
     202             : {
     203           0 :     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
     204             :                                                      _MM_SHUFFLE (3, 3, 3, 3)),
     205             :                                 _MM_SHUFFLE (3, 3, 3, 3));
     206             : }
     207             : 
     208             : static force_inline void
     209             : expand_alpha_2x128 (__m128i  data_lo,
     210             :                     __m128i  data_hi,
     211             :                     __m128i* alpha_lo,
     212             :                     __m128i* alpha_hi)
     213             : {
     214             :     __m128i lo, hi;
     215             : 
     216           0 :     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
     217           0 :     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
     218             : 
     219           0 :     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
     220           0 :     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
     221             : }
     222             : 
     223             : static force_inline void
     224             : expand_alpha_rev_2x128 (__m128i  data_lo,
     225             :                         __m128i  data_hi,
     226             :                         __m128i* alpha_lo,
     227             :                         __m128i* alpha_hi)
     228             : {
     229             :     __m128i lo, hi;
     230             : 
     231           0 :     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
     232           0 :     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
     233           0 :     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
     234           0 :     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
     235             : }
     236             : 
     237             : static force_inline void
     238             : pix_multiply_2x128 (__m128i* data_lo,
     239             :                     __m128i* data_hi,
     240             :                     __m128i* alpha_lo,
     241             :                     __m128i* alpha_hi,
     242             :                     __m128i* ret_lo,
     243             :                     __m128i* ret_hi)
     244             : {
     245             :     __m128i lo, hi;
     246             : 
     247           0 :     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
     248           0 :     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
     249           0 :     lo = _mm_adds_epu16 (lo, mask_0080);
     250           0 :     hi = _mm_adds_epu16 (hi, mask_0080);
     251           0 :     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
     252           0 :     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
     253             : }
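
/* A scalar sketch of the per-channel multiply above, added for
 * illustration: the adds/mulhi pair computes
 * ((x * a + 0x80) * 0x101) >> 16, which is the exactly rounded value of
 * x * a / 255 for x, a in [0, 255].
 */
static uint8_t
mul_un8_scalar (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t) x * a + 0x80;

    return (uint8_t) ((t * 0x101) >> 16);
}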
     254             : 
     255             : static force_inline void
     256             : pix_add_multiply_2x128 (__m128i* src_lo,
     257             :                         __m128i* src_hi,
     258             :                         __m128i* alpha_dst_lo,
     259             :                         __m128i* alpha_dst_hi,
     260             :                         __m128i* dst_lo,
     261             :                         __m128i* dst_hi,
     262             :                         __m128i* alpha_src_lo,
     263             :                         __m128i* alpha_src_hi,
     264             :                         __m128i* ret_lo,
     265             :                         __m128i* ret_hi)
     266             : {
     267             :     __m128i t1_lo, t1_hi;
     268             :     __m128i t2_lo, t2_hi;
     269             : 
     270             :     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
     271             :     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
     272             : 
     273           0 :     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
     274           0 :     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
     275             : }
     276             : 
     277             : static force_inline void
     278             : negate_2x128 (__m128i  data_lo,
     279             :               __m128i  data_hi,
     280             :               __m128i* neg_lo,
     281             :               __m128i* neg_hi)
     282             : {
     283           0 :     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
     284           0 :     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
     285             : }
     286             : 
     287             : static force_inline void
     288             : invert_colors_2x128 (__m128i  data_lo,
     289             :                      __m128i  data_hi,
     290             :                      __m128i* inv_lo,
     291             :                      __m128i* inv_hi)
     292             : {
     293             :     __m128i lo, hi;
     294             : 
     295           0 :     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
     296           0 :     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
     297           0 :     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
     298           0 :     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
     299             : }
     300             : 
     301             : static force_inline void
     302             : over_2x128 (__m128i* src_lo,
     303             :             __m128i* src_hi,
     304             :             __m128i* alpha_lo,
     305             :             __m128i* alpha_hi,
     306             :             __m128i* dst_lo,
     307             :             __m128i* dst_hi)
     308             : {
     309             :     __m128i t1, t2;
     310             : 
     311           0 :     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
     312             : 
     313             :     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
     314             : 
     315           0 :     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
     316           0 :     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
     317             : }
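
/* For reference (editorial note, not in the pixman source), the per-channel
 * form of the premultiplied OVER computed above is
 *
 *     dst = src + dst * (255 - alpha_src) / 255
 *
 * negate_2x128 supplies (255 - alpha), pix_multiply_2x128 does the rounded
 * multiply-and-divide by 255, and the final saturating add combines the
 * terms.
 */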
     318             : 
     319             : static force_inline void
     320             : over_rev_non_pre_2x128 (__m128i  src_lo,
     321             :                         __m128i  src_hi,
     322             :                         __m128i* dst_lo,
     323             :                         __m128i* dst_hi)
     324             : {
     325             :     __m128i lo, hi;
     326             :     __m128i alpha_lo, alpha_hi;
     327             : 
     328           0 :     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
     329             : 
     330           0 :     lo = _mm_or_si128 (alpha_lo, mask_alpha);
     331           0 :     hi = _mm_or_si128 (alpha_hi, mask_alpha);
     332             : 
     333           0 :     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
     334             : 
     335             :     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
     336             : 
     337             :     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
     338             : }
     339             : 
     340             : static force_inline void
     341             : in_over_2x128 (__m128i* src_lo,
     342             :                __m128i* src_hi,
     343             :                __m128i* alpha_lo,
     344             :                __m128i* alpha_hi,
     345             :                __m128i* mask_lo,
     346             :                __m128i* mask_hi,
     347             :                __m128i* dst_lo,
     348             :                __m128i* dst_hi)
     349             : {
     350             :     __m128i s_lo, s_hi;
     351             :     __m128i a_lo, a_hi;
     352             : 
     353             :     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
     354             :     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
     355             : 
     356             :     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
     357             : }
     358             : 
      359             : /* load 4 pixels from a 16-byte aligned address */
     360             : static force_inline __m128i
     361             : load_128_aligned (__m128i* src)
     362             : {
     363           0 :     return _mm_load_si128 (src);
     364             : }
     365             : 
      366             : /* load 4 pixels from an unaligned address */
     367             : static force_inline __m128i
     368             : load_128_unaligned (const __m128i* src)
     369             : {
     370        5632 :     return _mm_loadu_si128 (src);
     371             : }
     372             : 
      373             : /* save 4 pixels using a non-temporal (write-combining) store
      374             :  * to a 16-byte aligned address
     375             :  */
     376             : static force_inline void
     377             : save_128_write_combining (__m128i* dst,
     378             :                           __m128i  data)
     379             : {
     380             :     _mm_stream_si128 (dst, data);
     381             : }
     382             : 
      383             : /* save 4 pixels to a 16-byte aligned address */
     384             : static force_inline void
     385             : save_128_aligned (__m128i* dst,
     386             :                   __m128i  data)
     387             : {
     388             :     _mm_store_si128 (dst, data);
     389             : }
     390             : 
      391             : /* save 4 pixels to an unaligned address */
     392             : static force_inline void
     393             : save_128_unaligned (__m128i* dst,
     394             :                     __m128i  data)
     395             : {
     396             :     _mm_storeu_si128 (dst, data);
     397             : }
     398             : 
     399             : static force_inline __m128i
     400             : load_32_1x128 (uint32_t data)
     401             : {
     402           0 :     return _mm_cvtsi32_si128 (data);
     403             : }
     404             : 
     405             : static force_inline __m128i
     406             : expand_alpha_rev_1x128 (__m128i data)
     407             : {
     408           0 :     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
     409             : }
     410             : 
     411             : static force_inline __m128i
     412             : expand_pixel_8_1x128 (uint8_t data)
     413             : {
     414           0 :     return _mm_shufflelo_epi16 (
     415             :         unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
     416             : }
     417             : 
     418             : static force_inline __m128i
     419             : pix_multiply_1x128 (__m128i data,
     420             :                     __m128i alpha)
     421             : {
     422           0 :     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
     423             :                                             mask_0080),
     424             :                             mask_0101);
     425             : }
     426             : 
     427             : static force_inline __m128i
     428             : pix_add_multiply_1x128 (__m128i* src,
     429             :                         __m128i* alpha_dst,
     430             :                         __m128i* dst,
     431             :                         __m128i* alpha_src)
     432             : {
     433           0 :     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
     434           0 :     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
     435             : 
     436           0 :     return _mm_adds_epu8 (t1, t2);
     437             : }
     438             : 
     439             : static force_inline __m128i
     440             : negate_1x128 (__m128i data)
     441             : {
     442           0 :     return _mm_xor_si128 (data, mask_00ff);
     443             : }
     444             : 
     445             : static force_inline __m128i
     446             : invert_colors_1x128 (__m128i data)
     447             : {
     448           0 :     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
     449             : }
     450             : 
     451             : static force_inline __m128i
     452             : over_1x128 (__m128i src, __m128i alpha, __m128i dst)
     453             : {
     454           0 :     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
     455             : }
     456             : 
     457             : static force_inline __m128i
     458             : in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
     459             : {
     460           0 :     return over_1x128 (pix_multiply_1x128 (*src, *mask),
     461             :                        pix_multiply_1x128 (*alpha, *mask),
     462             :                        *dst);
     463             : }
     464             : 
     465             : static force_inline __m128i
     466             : over_rev_non_pre_1x128 (__m128i src, __m128i dst)
     467             : {
     468           0 :     __m128i alpha = expand_alpha_1x128 (src);
     469             : 
     470           0 :     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
     471             :                                            _mm_or_si128 (alpha, mask_alpha)),
     472             :                        alpha,
     473             :                        dst);
     474             : }
     475             : 
     476             : static force_inline uint32_t
     477             : pack_1x128_32 (__m128i data)
     478             : {
     479           0 :     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
     480             : }
     481             : 
     482             : static force_inline __m128i
     483             : expand565_16_1x128 (uint16_t pixel)
     484             : {
     485           0 :     __m128i m = _mm_cvtsi32_si128 (pixel);
     486             : 
     487           0 :     m = unpack_565_to_8888 (m);
     488             : 
     489           0 :     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
     490             : }
     491             : 
     492             : static force_inline uint32_t
     493             : core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
     494             : {
     495             :     uint8_t a;
     496             :     __m128i xmms;
     497             : 
     498           0 :     a = src >> 24;
     499             : 
     500           0 :     if (a == 0xff)
     501             :     {
     502           0 :         return src;
     503             :     }
     504           0 :     else if (src)
     505             :     {
     506           0 :         xmms = unpack_32_1x128 (src);
     507           0 :         return pack_1x128_32 (
     508             :             over_1x128 (xmms, expand_alpha_1x128 (xmms),
     509             :                         unpack_32_1x128 (dst)));
     510             :     }
     511             : 
     512           0 :     return dst;
     513             : }
     514             : 
     515             : static force_inline uint32_t
     516             : combine1 (const uint32_t *ps, const uint32_t *pm)
     517             : {
     518           0 :     uint32_t s = *ps;
     519             : 
     520           0 :     if (pm)
     521             :     {
     522             :         __m128i ms, mm;
     523             : 
     524           0 :         mm = unpack_32_1x128 (*pm);
     525           0 :         mm = expand_alpha_1x128 (mm);
     526             : 
     527           0 :         ms = unpack_32_1x128 (s);
     528           0 :         ms = pix_multiply_1x128 (ms, mm);
     529             : 
     530           0 :         s = pack_1x128_32 (ms);
     531             :     }
     532             : 
     533           0 :     return s;
     534             : }
     535             : 
     536             : static force_inline __m128i
     537             : combine4 (const __m128i *ps, const __m128i *pm)
     538             : {
     539             :     __m128i xmm_src_lo, xmm_src_hi;
     540             :     __m128i xmm_msk_lo, xmm_msk_hi;
     541             :     __m128i s;
     542             : 
     543           0 :     if (pm)
     544             :     {
     545           0 :         xmm_msk_lo = load_128_unaligned (pm);
     546             : 
     547           0 :         if (is_transparent (xmm_msk_lo))
     548           0 :             return _mm_setzero_si128 ();
     549             :     }
     550             : 
     551           0 :     s = load_128_unaligned (ps);
     552             : 
     553           0 :     if (pm)
     554             :     {
     555             :         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
     556           0 :         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
     557             : 
     558           0 :         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
     559             : 
     560             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
     561             :                             &xmm_msk_lo, &xmm_msk_hi,
     562             :                             &xmm_src_lo, &xmm_src_hi);
     563             : 
     564           0 :         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
     565             :     }
     566             : 
     567           0 :     return s;
     568             : }
     569             : 
     570             : static force_inline void
     571             : core_combine_over_u_sse2_mask (uint32_t *         pd,
     572             :                                const uint32_t*    ps,
     573             :                                const uint32_t*    pm,
     574             :                                int                w)
     575             : {
     576             :     uint32_t s, d;
     577             : 
     578             :     /* Align dst on a 16-byte boundary */
     579           0 :     while (w && ((uintptr_t)pd & 15))
     580             :     {
     581           0 :         d = *pd;
     582           0 :         s = combine1 (ps, pm);
     583             : 
     584           0 :         if (s)
     585           0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     586           0 :         pd++;
     587           0 :         ps++;
     588           0 :         pm++;
     589           0 :         w--;
     590             :     }
     591             : 
     592           0 :     while (w >= 4)
     593             :     {
     594           0 :         __m128i mask = load_128_unaligned ((__m128i *)pm);
     595             : 
     596           0 :         if (!is_zero (mask))
     597             :         {
     598             :             __m128i src;
     599             :             __m128i src_hi, src_lo;
     600             :             __m128i mask_hi, mask_lo;
     601             :             __m128i alpha_hi, alpha_lo;
     602             : 
     603           0 :             src = load_128_unaligned ((__m128i *)ps);
     604             : 
     605           0 :             if (is_opaque (_mm_and_si128 (src, mask)))
     606             :             {
     607             :                 save_128_aligned ((__m128i *)pd, src);
     608             :             }
     609             :             else
     610             :             {
     611           0 :                 __m128i dst = load_128_aligned ((__m128i *)pd);
     612             :                 __m128i dst_hi, dst_lo;
     613             : 
     614             :                 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
     615             :                 unpack_128_2x128 (src, &src_lo, &src_hi);
     616             : 
     617           0 :                 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
     618             :                 pix_multiply_2x128 (&src_lo, &src_hi,
     619             :                                     &mask_lo, &mask_hi,
     620             :                                     &src_lo, &src_hi);
     621             : 
     622             :                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
     623             : 
     624           0 :                 expand_alpha_2x128 (src_lo, src_hi,
     625             :                                     &alpha_lo, &alpha_hi);
     626             : 
     627             :                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
     628             :                             &dst_lo, &dst_hi);
     629             : 
     630           0 :                 save_128_aligned (
     631             :                     (__m128i *)pd,
     632             :                     pack_2x128_128 (dst_lo, dst_hi));
     633             :             }
     634             :         }
     635             : 
     636           0 :         pm += 4;
     637           0 :         ps += 4;
     638           0 :         pd += 4;
     639           0 :         w -= 4;
     640             :     }
     641           0 :     while (w)
     642             :     {
     643           0 :         d = *pd;
     644           0 :         s = combine1 (ps, pm);
     645             : 
     646           0 :         if (s)
     647           0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     648           0 :         pd++;
     649           0 :         ps++;
     650           0 :         pm++;
     651             : 
     652           0 :         w--;
     653             :     }
     654             : }
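
/* Structural note (editorial): the combiners in this file follow the
 * three-phase pattern visible above -- a scalar head loop that runs until
 * dst reaches a 16-byte boundary, a 4-pixel SSE2 body that may then use
 * aligned loads/stores on dst (sources and masks are still loaded
 * unaligned), and a scalar tail loop for the last 0-3 pixels.
 */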
     655             : 
     656             : static force_inline void
     657             : core_combine_over_u_sse2_no_mask (uint32_t *      pd,
     658             :                                   const uint32_t*    ps,
     659             :                                   int                w)
     660             : {
     661             :     uint32_t s, d;
     662             : 
     663             :     /* Align dst on a 16-byte boundary */
     664           0 :     while (w && ((uintptr_t)pd & 15))
     665             :     {
     666           0 :         d = *pd;
     667           0 :         s = *ps;
     668             : 
     669           0 :         if (s)
     670           0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     671           0 :         pd++;
     672           0 :         ps++;
     673           0 :         w--;
     674             :     }
     675             : 
     676           0 :     while (w >= 4)
     677             :     {
     678             :         __m128i src;
     679             :         __m128i src_hi, src_lo, dst_hi, dst_lo;
     680             :         __m128i alpha_hi, alpha_lo;
     681             : 
     682           0 :         src = load_128_unaligned ((__m128i *)ps);
     683             : 
     684           0 :         if (!is_zero (src))
     685             :         {
     686           0 :             if (is_opaque (src))
     687             :             {
     688             :                 save_128_aligned ((__m128i *)pd, src);
     689             :             }
     690             :             else
     691             :             {
     692           0 :                 __m128i dst = load_128_aligned ((__m128i *)pd);
     693             : 
     694             :                 unpack_128_2x128 (src, &src_lo, &src_hi);
     695             :                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
     696             : 
     697           0 :                 expand_alpha_2x128 (src_lo, src_hi,
     698             :                                     &alpha_lo, &alpha_hi);
     699             :                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
     700             :                             &dst_lo, &dst_hi);
     701             : 
     702           0 :                 save_128_aligned (
     703             :                     (__m128i *)pd,
     704             :                     pack_2x128_128 (dst_lo, dst_hi));
     705             :             }
     706             :         }
     707             : 
     708           0 :         ps += 4;
     709           0 :         pd += 4;
     710           0 :         w -= 4;
     711             :     }
     712           0 :     while (w)
     713             :     {
     714           0 :         d = *pd;
     715           0 :         s = *ps;
     716             : 
     717           0 :         if (s)
     718           0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     719           0 :         pd++;
     720           0 :         ps++;
     721             : 
     722           0 :         w--;
     723             :     }
     724             : }
     725             : 
     726             : static force_inline void
     727           0 : sse2_combine_over_u (pixman_implementation_t *imp,
     728             :                      pixman_op_t              op,
     729             :                      uint32_t *               pd,
     730             :                      const uint32_t *         ps,
     731             :                      const uint32_t *         pm,
     732             :                      int                      w)
     733             : {
     734           0 :     if (pm)
     735             :         core_combine_over_u_sse2_mask (pd, ps, pm, w);
     736             :     else
     737             :         core_combine_over_u_sse2_no_mask (pd, ps, w);
     738           0 : }
     739             : 
     740             : static void
     741           0 : sse2_combine_over_reverse_u (pixman_implementation_t *imp,
     742             :                              pixman_op_t              op,
     743             :                              uint32_t *               pd,
     744             :                              const uint32_t *         ps,
     745             :                              const uint32_t *         pm,
     746             :                              int                      w)
     747             : {
     748             :     uint32_t s, d;
     749             : 
     750             :     __m128i xmm_dst_lo, xmm_dst_hi;
     751             :     __m128i xmm_src_lo, xmm_src_hi;
     752             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
     753             : 
     754             :     /* Align dst on a 16-byte boundary */
     755           0 :     while (w &&
     756           0 :            ((uintptr_t)pd & 15))
     757             :     {
     758           0 :         d = *pd;
     759           0 :         s = combine1 (ps, pm);
     760             : 
     761           0 :         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
     762           0 :         w--;
     763           0 :         ps++;
     764           0 :         if (pm)
     765           0 :             pm++;
     766             :     }
     767             : 
     768           0 :     while (w >= 4)
     769             :     {
     770             :         /* I'm loading unaligned because I'm not sure
     771             :          * about the address alignment.
     772             :          */
     773           0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
     774           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     775             : 
     776           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     777           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     778             : 
     779           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
     780             :                             &xmm_alpha_lo, &xmm_alpha_hi);
     781             : 
     782             :         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
     783             :                     &xmm_alpha_lo, &xmm_alpha_hi,
     784             :                     &xmm_src_lo, &xmm_src_hi);
     785             : 
      786             :         /* rebuild the 4 pixel data and save */
     787           0 :         save_128_aligned ((__m128i*)pd,
     788             :                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
     789             : 
     790           0 :         w -= 4;
     791           0 :         ps += 4;
     792           0 :         pd += 4;
     793             : 
     794           0 :         if (pm)
     795           0 :             pm += 4;
     796             :     }
     797             : 
     798           0 :     while (w)
     799             :     {
     800           0 :         d = *pd;
     801           0 :         s = combine1 (ps, pm);
     802             : 
     803           0 :         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
     804           0 :         ps++;
     805           0 :         w--;
     806           0 :         if (pm)
     807           0 :             pm++;
     808             :     }
     809           0 : }
     810             : 
     811             : static force_inline uint32_t
     812             : core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     813             : {
     814           0 :     uint32_t maska = src >> 24;
     815             : 
     816           0 :     if (maska == 0)
     817             :     {
     818           0 :         return 0;
     819             :     }
     820           0 :     else if (maska != 0xff)
     821             :     {
     822           0 :         return pack_1x128_32 (
     823             :             pix_multiply_1x128 (unpack_32_1x128 (dst),
     824             :                                 expand_alpha_1x128 (unpack_32_1x128 (src))));
     825             :     }
     826             : 
     827           0 :     return dst;
     828             : }
     829             : 
     830             : static void
     831           0 : sse2_combine_in_u (pixman_implementation_t *imp,
     832             :                    pixman_op_t              op,
     833             :                    uint32_t *               pd,
     834             :                    const uint32_t *         ps,
     835             :                    const uint32_t *         pm,
     836             :                    int                      w)
     837             : {
     838             :     uint32_t s, d;
     839             : 
     840             :     __m128i xmm_src_lo, xmm_src_hi;
     841             :     __m128i xmm_dst_lo, xmm_dst_hi;
     842             : 
     843           0 :     while (w && ((uintptr_t)pd & 15))
     844             :     {
     845           0 :         s = combine1 (ps, pm);
     846           0 :         d = *pd;
     847             : 
     848           0 :         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
     849           0 :         w--;
     850           0 :         ps++;
     851           0 :         if (pm)
     852           0 :             pm++;
     853             :     }
     854             : 
     855           0 :     while (w >= 4)
     856             :     {
     857           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     858           0 :         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
     859             : 
     860           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     861           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     862             : 
     863           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     864             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
     865             :                             &xmm_dst_lo, &xmm_dst_hi,
     866             :                             &xmm_dst_lo, &xmm_dst_hi);
     867             : 
     868           0 :         save_128_aligned ((__m128i*)pd,
     869             :                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     870             : 
     871           0 :         ps += 4;
     872           0 :         pd += 4;
     873           0 :         w -= 4;
     874           0 :         if (pm)
     875           0 :             pm += 4;
     876             :     }
     877             : 
     878           0 :     while (w)
     879             :     {
     880           0 :         s = combine1 (ps, pm);
     881           0 :         d = *pd;
     882             : 
     883           0 :         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
     884           0 :         w--;
     885           0 :         ps++;
     886           0 :         if (pm)
     887           0 :             pm++;
     888             :     }
     889           0 : }
     890             : 
     891             : static void
     892           0 : sse2_combine_in_reverse_u (pixman_implementation_t *imp,
     893             :                            pixman_op_t              op,
     894             :                            uint32_t *               pd,
     895             :                            const uint32_t *         ps,
     896             :                            const uint32_t *         pm,
     897             :                            int                      w)
     898             : {
     899             :     uint32_t s, d;
     900             : 
     901             :     __m128i xmm_src_lo, xmm_src_hi;
     902             :     __m128i xmm_dst_lo, xmm_dst_hi;
     903             : 
     904           0 :     while (w && ((uintptr_t)pd & 15))
     905             :     {
     906           0 :         s = combine1 (ps, pm);
     907           0 :         d = *pd;
     908             : 
     909           0 :         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
     910           0 :         ps++;
     911           0 :         w--;
     912           0 :         if (pm)
     913           0 :             pm++;
     914             :     }
     915             : 
     916           0 :     while (w >= 4)
     917             :     {
     918           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     919           0 :         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
     920             : 
     921           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     922           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     923             : 
     924           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     925             :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
     926             :                             &xmm_src_lo, &xmm_src_hi,
     927             :                             &xmm_dst_lo, &xmm_dst_hi);
     928             : 
     929           0 :         save_128_aligned (
     930             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     931             : 
     932           0 :         ps += 4;
     933           0 :         pd += 4;
     934           0 :         w -= 4;
     935           0 :         if (pm)
     936           0 :             pm += 4;
     937             :     }
     938             : 
     939           0 :     while (w)
     940             :     {
     941           0 :         s = combine1 (ps, pm);
     942           0 :         d = *pd;
     943             : 
     944           0 :         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
     945           0 :         w--;
     946           0 :         ps++;
     947           0 :         if (pm)
     948           0 :             pm++;
     949             :     }
     950           0 : }
     951             : 
     952             : static void
     953           0 : sse2_combine_out_reverse_u (pixman_implementation_t *imp,
     954             :                             pixman_op_t              op,
     955             :                             uint32_t *               pd,
     956             :                             const uint32_t *         ps,
     957             :                             const uint32_t *         pm,
     958             :                             int                      w)
     959             : {
     960           0 :     while (w && ((uintptr_t)pd & 15))
     961             :     {
     962           0 :         uint32_t s = combine1 (ps, pm);
     963           0 :         uint32_t d = *pd;
     964             : 
     965           0 :         *pd++ = pack_1x128_32 (
     966             :             pix_multiply_1x128 (
     967             :                 unpack_32_1x128 (d), negate_1x128 (
     968             :                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
     969             : 
     970           0 :         if (pm)
     971           0 :             pm++;
     972           0 :         ps++;
     973           0 :         w--;
     974             :     }
     975             : 
     976           0 :     while (w >= 4)
     977             :     {
     978             :         __m128i xmm_src_lo, xmm_src_hi;
     979             :         __m128i xmm_dst_lo, xmm_dst_hi;
     980             : 
     981           0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
     982           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     983             : 
     984           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     985           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     986             : 
     987           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     988           0 :         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     989             : 
     990             :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
     991             :                             &xmm_src_lo, &xmm_src_hi,
     992             :                             &xmm_dst_lo, &xmm_dst_hi);
     993             : 
     994           0 :         save_128_aligned (
     995             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     996             : 
     997           0 :         ps += 4;
     998           0 :         pd += 4;
     999           0 :         if (pm)
    1000           0 :             pm += 4;
    1001             : 
    1002           0 :         w -= 4;
    1003             :     }
    1004             : 
    1005           0 :     while (w)
    1006             :     {
    1007           0 :         uint32_t s = combine1 (ps, pm);
    1008           0 :         uint32_t d = *pd;
    1009             : 
    1010           0 :         *pd++ = pack_1x128_32 (
    1011             :             pix_multiply_1x128 (
    1012             :                 unpack_32_1x128 (d), negate_1x128 (
    1013             :                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
    1014           0 :         ps++;
    1015           0 :         if (pm)
    1016           0 :             pm++;
    1017           0 :         w--;
    1018             :     }
    1019           0 : }
    1020             : 
    1021             : static void
    1022           0 : sse2_combine_out_u (pixman_implementation_t *imp,
    1023             :                     pixman_op_t              op,
    1024             :                     uint32_t *               pd,
    1025             :                     const uint32_t *         ps,
    1026             :                     const uint32_t *         pm,
    1027             :                     int                      w)
    1028             : {
    1029           0 :     while (w && ((uintptr_t)pd & 15))
    1030             :     {
    1031           0 :         uint32_t s = combine1 (ps, pm);
    1032           0 :         uint32_t d = *pd;
    1033             : 
    1034           0 :         *pd++ = pack_1x128_32 (
    1035             :             pix_multiply_1x128 (
    1036             :                 unpack_32_1x128 (s), negate_1x128 (
    1037             :                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1038           0 :         w--;
    1039           0 :         ps++;
    1040           0 :         if (pm)
    1041           0 :             pm++;
    1042             :     }
    1043             : 
    1044           0 :     while (w >= 4)
    1045             :     {
    1046             :         __m128i xmm_src_lo, xmm_src_hi;
    1047             :         __m128i xmm_dst_lo, xmm_dst_hi;
    1048             : 
    1049           0 :         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
    1050           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    1051             : 
    1052           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1053           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1054             : 
    1055           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1056           0 :         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1057             : 
    1058             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1059             :                             &xmm_dst_lo, &xmm_dst_hi,
    1060             :                             &xmm_dst_lo, &xmm_dst_hi);
    1061             : 
    1062           0 :         save_128_aligned (
    1063             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1064             : 
    1065           0 :         ps += 4;
    1066           0 :         pd += 4;
    1067           0 :         w -= 4;
    1068           0 :         if (pm)
    1069           0 :             pm += 4;
    1070             :     }
    1071             : 
    1072           0 :     while (w)
    1073             :     {
    1074           0 :         uint32_t s = combine1 (ps, pm);
    1075           0 :         uint32_t d = *pd;
    1076             : 
    1077           0 :         *pd++ = pack_1x128_32 (
    1078             :             pix_multiply_1x128 (
    1079             :                 unpack_32_1x128 (s), negate_1x128 (
    1080             :                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1081           0 :         w--;
    1082           0 :         ps++;
    1083           0 :         if (pm)
    1084           0 :             pm++;
    1085             :     }
    1086           0 : }
    1087             : 
    1088             : static force_inline uint32_t
    1089             : core_combine_atop_u_pixel_sse2 (uint32_t src,
    1090             :                                 uint32_t dst)
    1091             : {
    1092           0 :     __m128i s = unpack_32_1x128 (src);
    1093           0 :     __m128i d = unpack_32_1x128 (dst);
    1094             : 
    1095           0 :     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    1096           0 :     __m128i da = expand_alpha_1x128 (d);
    1097             : 
    1098           0 :     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
    1099             : }
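
/* For reference (editorial note), ATOP per channel is
 *
 *     result = src * alpha_dst / 255 + dst * (255 - alpha_src) / 255
 *
 * which is exactly the pix_add_multiply_1x128 call above with the source
 * alpha pre-negated.
 */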
    1100             : 
    1101             : static void
    1102           0 : sse2_combine_atop_u (pixman_implementation_t *imp,
    1103             :                      pixman_op_t              op,
    1104             :                      uint32_t *               pd,
    1105             :                      const uint32_t *         ps,
    1106             :                      const uint32_t *         pm,
    1107             :                      int                      w)
    1108             : {
    1109             :     uint32_t s, d;
    1110             : 
    1111             :     __m128i xmm_src_lo, xmm_src_hi;
    1112             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1113             :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    1114             :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    1115             : 
    1116           0 :     while (w && ((uintptr_t)pd & 15))
    1117             :     {
    1118           0 :         s = combine1 (ps, pm);
    1119           0 :         d = *pd;
    1120             : 
    1121           0 :         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
    1122           0 :         w--;
    1123           0 :         ps++;
    1124           0 :         if (pm)
    1125           0 :             pm++;
    1126             :     }
    1127             : 
    1128           0 :     while (w >= 4)
    1129             :     {
    1130           0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
    1131           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    1132             : 
    1133           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1134           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1135             : 
    1136           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1137             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1138           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1139             :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1140             : 
    1141           0 :         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
    1142             :                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1143             : 
    1144             :         pix_add_multiply_2x128 (
    1145             :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    1146             :             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    1147             :             &xmm_dst_lo, &xmm_dst_hi);
    1148             : 
    1149           0 :         save_128_aligned (
    1150             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1151             : 
    1152           0 :         ps += 4;
    1153           0 :         pd += 4;
    1154           0 :         w -= 4;
    1155           0 :         if (pm)
    1156           0 :             pm += 4;
    1157             :     }
    1158             : 
    1159           0 :     while (w)
    1160             :     {
    1161           0 :         s = combine1 (ps, pm);
    1162           0 :         d = *pd;
    1163             : 
    1164           0 :         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
    1165           0 :         w--;
    1166           0 :         ps++;
    1167           0 :         if (pm)
    1168           0 :             pm++;
    1169             :     }
    1170           0 : }
    1171             : 
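                      : /* Porter-Duff ATOP_REVERSE for one pixel:
                      :  * result = src * (1 - dst.a) + dst * src.a. */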
    1172             : static force_inline uint32_t
    1173             : core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
    1174             :                                         uint32_t dst)
    1175             : {
    1176           0 :     __m128i s = unpack_32_1x128 (src);
    1177           0 :     __m128i d = unpack_32_1x128 (dst);
    1178             : 
    1179           0 :     __m128i sa = expand_alpha_1x128 (s);
    1180           0 :     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    1181             : 
    1182           0 :     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
    1183             : }
    1184             : 
    1185             : static void
    1186           0 : sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
    1187             :                              pixman_op_t              op,
    1188             :                              uint32_t *               pd,
    1189             :                              const uint32_t *         ps,
    1190             :                              const uint32_t *         pm,
    1191             :                              int                      w)
    1192             : {
    1193             :     uint32_t s, d;
    1194             : 
    1195             :     __m128i xmm_src_lo, xmm_src_hi;
    1196             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1197             :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    1198             :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    1199             : 
    1200           0 :     while (w && ((uintptr_t)pd & 15))
    1201             :     {
    1202           0 :         s = combine1 (ps, pm);
    1203           0 :         d = *pd;
    1204             : 
    1205           0 :         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
    1206           0 :         ps++;
    1207           0 :         w--;
    1208           0 :         if (pm)
    1209           0 :             pm++;
    1210             :     }
    1211             : 
    1212           0 :     while (w >= 4)
    1213             :     {
    1214           0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
    1215           0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    1216             : 
    1217           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1218           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1219             : 
    1220           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1221             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1222           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1223             :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1224             : 
    1225           0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    1226             :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1227             : 
    1228             :         pix_add_multiply_2x128 (
    1229             :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    1230             :             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    1231             :             &xmm_dst_lo, &xmm_dst_hi);
    1232             : 
    1233           0 :         save_128_aligned (
    1234             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1235             : 
    1236           0 :         ps += 4;
    1237           0 :         pd += 4;
    1238           0 :         w -= 4;
    1239           0 :         if (pm)
    1240           0 :             pm += 4;
    1241             :     }
    1242             : 
    1243           0 :     while (w)
    1244             :     {
    1245           0 :         s = combine1 (ps, pm);
    1246           0 :         d = *pd;
    1247             : 
    1248           0 :         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
    1249           0 :         ps++;
    1250           0 :         w--;
    1251           0 :         if (pm)
    1252           0 :             pm++;
    1253             :     }
    1254           0 : }
    1255             : 
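                      : /* Porter-Duff XOR for one pixel:
                      :  * result = src * (1 - dst.a) + dst * (1 - src.a). */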
    1256             : static force_inline uint32_t
    1257             : core_combine_xor_u_pixel_sse2 (uint32_t src,
    1258             :                                uint32_t dst)
    1259             : {
    1260           0 :     __m128i s = unpack_32_1x128 (src);
    1261           0 :     __m128i d = unpack_32_1x128 (dst);
    1262             : 
    1263           0 :     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    1264           0 :     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
    1265             : 
    1266           0 :     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
    1267             : }
    1268             : 
    1269             : static void
    1270           0 : sse2_combine_xor_u (pixman_implementation_t *imp,
    1271             :                     pixman_op_t              op,
    1272             :                     uint32_t *               dst,
    1273             :                     const uint32_t *         src,
    1274             :                     const uint32_t *         mask,
    1275             :                     int                      width)
    1276             : {
    1277           0 :     int w = width;
    1278             :     uint32_t s, d;
    1279           0 :     uint32_t* pd = dst;
    1280           0 :     const uint32_t* ps = src;
    1281           0 :     const uint32_t* pm = mask;
    1282             : 
    1283             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    1284             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    1285             :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    1286             :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    1287             : 
    1288           0 :     while (w && ((uintptr_t)pd & 15))
    1289             :     {
    1290           0 :         s = combine1 (ps, pm);
    1291           0 :         d = *pd;
    1292             : 
    1293           0 :         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
    1294           0 :         w--;
    1295           0 :         ps++;
    1296           0 :         if (pm)
    1297           0 :             pm++;
    1298             :     }
    1299             : 
    1300           0 :     while (w >= 4)
    1301             :     {
    1302           0 :         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
    1303           0 :         xmm_dst = load_128_aligned ((__m128i*) pd);
    1304             : 
    1305             :         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    1306             :         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    1307             : 
    1308           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1309             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1310           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1311             :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1312             : 
    1313           0 :         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
    1314             :                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1315           0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    1316             :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1317             : 
    1318             :         pix_add_multiply_2x128 (
    1319             :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    1320             :             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    1321             :             &xmm_dst_lo, &xmm_dst_hi);
    1322             : 
    1323           0 :         save_128_aligned (
    1324             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1325             : 
    1326           0 :         ps += 4;
    1327           0 :         pd += 4;
    1328           0 :         w -= 4;
    1329           0 :         if (pm)
    1330           0 :             pm += 4;
    1331             :     }
    1332             : 
    1333           0 :     while (w)
    1334             :     {
    1335           0 :         s = combine1 (ps, pm);
    1336           0 :         d = *pd;
    1337             : 
    1338           0 :         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
    1339           0 :         w--;
    1340           0 :         ps++;
    1341           0 :         if (pm)
    1342           0 :             pm++;
    1343             :     }
    1344           0 : }
    1345             : 
    1346             : static force_inline void
    1347           0 : sse2_combine_add_u (pixman_implementation_t *imp,
    1348             :                     pixman_op_t              op,
    1349             :                     uint32_t *               dst,
    1350             :                     const uint32_t *         src,
    1351             :                     const uint32_t *         mask,
    1352             :                     int                      width)
    1353             : {
    1354           0 :     int w = width;
    1355             :     uint32_t s, d;
    1356           0 :     uint32_t* pd = dst;
    1357           0 :     const uint32_t* ps = src;
    1358           0 :     const uint32_t* pm = mask;
    1359             : 
    1360           0 :     while (w && (uintptr_t)pd & 15)
    1361             :     {
    1362           0 :         s = combine1 (ps, pm);
    1363           0 :         d = *pd;
    1364             : 
    1365           0 :         ps++;
    1366           0 :         if (pm)
    1367           0 :             pm++;
    1368           0 :         *pd++ = _mm_cvtsi128_si32 (
    1369             :             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
    1370           0 :         w--;
    1371             :     }
    1372             : 
    1373           0 :     while (w >= 4)
    1374             :     {
    1375             :         __m128i s;
    1376             : 
    1377           0 :         s = combine4 ((__m128i*)ps, (__m128i*)pm);
    1378             : 
    1379           0 :         save_128_aligned (
    1380             :             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
    1381             : 
    1382           0 :         pd += 4;
    1383           0 :         ps += 4;
    1384           0 :         if (pm)
    1385           0 :             pm += 4;
    1386           0 :         w -= 4;
    1387             :     }
    1388             : 
    1389           0 :     while (w--)
    1390             :     {
    1391           0 :         s = combine1 (ps, pm);
    1392           0 :         d = *pd;
    1393             : 
    1394           0 :         ps++;
    1395           0 :         *pd++ = _mm_cvtsi128_si32 (
    1396             :             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
    1397           0 :         if (pm)
    1398           0 :             pm++;
    1399             :     }
    1400           0 : }
    1401             : 
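                      : /* SATURATE for one pixel: if src.a > 1 - dst.a, scale src by
                      :  * (1 - dst.a) / src.a first, then add with unsigned saturation. */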
    1402             : static force_inline uint32_t
    1403             : core_combine_saturate_u_pixel_sse2 (uint32_t src,
    1404             :                                     uint32_t dst)
    1405             : {
    1406           0 :     __m128i ms = unpack_32_1x128 (src);
    1407           0 :     __m128i md = unpack_32_1x128 (dst);
    1408           0 :     uint32_t sa = src >> 24;
    1409           0 :     uint32_t da = ~dst >> 24;
    1410             : 
    1411           0 :     if (sa > da)
    1412             :     {
    1413           0 :         ms = pix_multiply_1x128 (
    1414           0 :             ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    1415             :     }
    1416             : 
    1417           0 :     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
    1418             : }
    1419             : 
    1420             : static void
    1421           0 : sse2_combine_saturate_u (pixman_implementation_t *imp,
    1422             :                          pixman_op_t              op,
    1423             :                          uint32_t *               pd,
    1424             :                          const uint32_t *         ps,
    1425             :                          const uint32_t *         pm,
    1426             :                          int                      w)
    1427             : {
    1428             :     uint32_t s, d;
    1429             : 
    1430             :     uint32_t pack_cmp;
    1431             :     __m128i xmm_src, xmm_dst;
    1432             : 
    1433           0 :     while (w && (uintptr_t)pd & 15)
    1434             :     {
    1435           0 :         s = combine1 (ps, pm);
    1436           0 :         d = *pd;
    1437             : 
    1438           0 :         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1439           0 :         w--;
    1440           0 :         ps++;
    1441           0 :         if (pm)
    1442           0 :             pm++;
    1443             :     }
    1444             : 
    1445           0 :     while (w >= 4)
    1446             :     {
    1447           0 :         xmm_dst = load_128_aligned  ((__m128i*)pd);
    1448           0 :         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
    1449             : 
    1450           0 :         pack_cmp = _mm_movemask_epi8 (
    1451             :             _mm_cmpgt_epi32 (
    1452             :                 _mm_srli_epi32 (xmm_src, 24),
    1453             :                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
    1454             : 
     1455             :         /* if some src alpha is greater than the respective ~dst alpha */
    1456           0 :         if (pack_cmp)
    1457             :         {
    1458           0 :             s = combine1 (ps++, pm);
    1459           0 :             d = *pd;
    1460           0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1461           0 :             if (pm)
    1462           0 :                 pm++;
    1463             : 
    1464           0 :             s = combine1 (ps++, pm);
    1465           0 :             d = *pd;
    1466           0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1467           0 :             if (pm)
    1468           0 :                 pm++;
    1469             : 
    1470           0 :             s = combine1 (ps++, pm);
    1471           0 :             d = *pd;
    1472           0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1473           0 :             if (pm)
    1474           0 :                 pm++;
    1475             : 
    1476           0 :             s = combine1 (ps++, pm);
    1477           0 :             d = *pd;
    1478           0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1479           0 :             if (pm)
    1480           0 :                 pm++;
    1481             :         }
    1482             :         else
    1483             :         {
    1484           0 :             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
    1485             : 
    1486           0 :             pd += 4;
    1487           0 :             ps += 4;
    1488           0 :             if (pm)
    1489           0 :                 pm += 4;
    1490             :         }
    1491             : 
    1492           0 :         w -= 4;
    1493             :     }
    1494             : 
    1495           0 :     while (w--)
    1496             :     {
    1497           0 :         s = combine1 (ps, pm);
    1498           0 :         d = *pd;
    1499             : 
    1500           0 :         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1501           0 :         ps++;
    1502           0 :         if (pm)
    1503           0 :             pm++;
    1504             :     }
    1505           0 : }
    1506             : 
    1507             : static void
    1508           0 : sse2_combine_src_ca (pixman_implementation_t *imp,
    1509             :                      pixman_op_t              op,
    1510             :                      uint32_t *               pd,
    1511             :                      const uint32_t *         ps,
    1512             :                      const uint32_t *         pm,
    1513             :                      int                      w)
    1514             : {
    1515             :     uint32_t s, m;
    1516             : 
    1517             :     __m128i xmm_src_lo, xmm_src_hi;
    1518             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1519             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1520             : 
    1521           0 :     while (w && (uintptr_t)pd & 15)
    1522             :     {
    1523           0 :         s = *ps++;
    1524           0 :         m = *pm++;
    1525           0 :         *pd++ = pack_1x128_32 (
    1526             :             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
    1527           0 :         w--;
    1528             :     }
    1529             : 
    1530           0 :     while (w >= 4)
    1531             :     {
    1532           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1533           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1534             : 
    1535           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1536           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1537             : 
    1538             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1539             :                             &xmm_mask_lo, &xmm_mask_hi,
    1540             :                             &xmm_dst_lo, &xmm_dst_hi);
    1541             : 
    1542           0 :         save_128_aligned (
    1543             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1544             : 
    1545           0 :         ps += 4;
    1546           0 :         pd += 4;
    1547           0 :         pm += 4;
    1548           0 :         w -= 4;
    1549             :     }
    1550             : 
    1551           0 :     while (w)
    1552             :     {
    1553           0 :         s = *ps++;
    1554           0 :         m = *pm++;
    1555           0 :         *pd++ = pack_1x128_32 (
    1556             :             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
    1557           0 :         w--;
    1558             :     }
    1559           0 : }
    1560             : 
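                      : /* Component-alpha OVER for one pixel:
                      :  * result = src * mask + dst * (1 - src.a * mask), per channel. */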
    1561             : static force_inline uint32_t
    1562             : core_combine_over_ca_pixel_sse2 (uint32_t src,
    1563             :                                  uint32_t mask,
    1564             :                                  uint32_t dst)
    1565             : {
    1566           0 :     __m128i s = unpack_32_1x128 (src);
    1567           0 :     __m128i expAlpha = expand_alpha_1x128 (s);
    1568           0 :     __m128i unpk_mask = unpack_32_1x128 (mask);
    1569           0 :     __m128i unpk_dst  = unpack_32_1x128 (dst);
    1570             : 
    1571           0 :     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
    1572             : }
    1573             : 
    1574             : static void
    1575           0 : sse2_combine_over_ca (pixman_implementation_t *imp,
    1576             :                       pixman_op_t              op,
    1577             :                       uint32_t *               pd,
    1578             :                       const uint32_t *         ps,
    1579             :                       const uint32_t *         pm,
    1580             :                       int                      w)
    1581             : {
    1582             :     uint32_t s, m, d;
    1583             : 
    1584             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1585             :     __m128i xmm_src_lo, xmm_src_hi;
    1586             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1587             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1588             : 
    1589           0 :     while (w && (uintptr_t)pd & 15)
    1590             :     {
    1591           0 :         s = *ps++;
    1592           0 :         m = *pm++;
    1593           0 :         d = *pd;
    1594             : 
    1595           0 :         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
    1596           0 :         w--;
    1597             :     }
    1598             : 
    1599           0 :     while (w >= 4)
    1600             :     {
    1601           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1602           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1603           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1604             : 
    1605           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1606           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1607           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1608             : 
    1609           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1610             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1611             : 
    1612             :         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    1613             :                        &xmm_alpha_lo, &xmm_alpha_hi,
    1614             :                        &xmm_mask_lo, &xmm_mask_hi,
    1615             :                        &xmm_dst_lo, &xmm_dst_hi);
    1616             : 
    1617           0 :         save_128_aligned (
    1618             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1619             : 
    1620           0 :         ps += 4;
    1621           0 :         pd += 4;
    1622           0 :         pm += 4;
    1623           0 :         w -= 4;
    1624             :     }
    1625             : 
    1626           0 :     while (w)
    1627             :     {
    1628           0 :         s = *ps++;
    1629           0 :         m = *pm++;
    1630           0 :         d = *pd;
    1631             : 
    1632           0 :         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
    1633           0 :         w--;
    1634             :     }
    1635           0 : }
    1636             : 
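                      : /* Component-alpha OVER_REVERSE for one pixel:
                      :  * result = dst + (src * mask) * (1 - dst.a), per channel. */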
    1637             : static force_inline uint32_t
    1638             : core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
    1639             :                                          uint32_t mask,
    1640             :                                          uint32_t dst)
    1641             : {
    1642           0 :     __m128i d = unpack_32_1x128 (dst);
    1643             : 
    1644           0 :     return pack_1x128_32 (
    1645             :         over_1x128 (d, expand_alpha_1x128 (d),
    1646             :                     pix_multiply_1x128 (unpack_32_1x128 (src),
    1647             :                                         unpack_32_1x128 (mask))));
    1648             : }
    1649             : 
    1650             : static void
    1651           0 : sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
    1652             :                               pixman_op_t              op,
    1653             :                               uint32_t *               pd,
    1654             :                               const uint32_t *         ps,
    1655             :                               const uint32_t *         pm,
    1656             :                               int                      w)
    1657             : {
    1658             :     uint32_t s, m, d;
    1659             : 
    1660             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1661             :     __m128i xmm_src_lo, xmm_src_hi;
    1662             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1663             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1664             : 
    1665           0 :     while (w && (uintptr_t)pd & 15)
    1666             :     {
    1667           0 :         s = *ps++;
    1668           0 :         m = *pm++;
    1669           0 :         d = *pd;
    1670             : 
    1671           0 :         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
    1672           0 :         w--;
    1673             :     }
    1674             : 
    1675           0 :     while (w >= 4)
    1676             :     {
    1677           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1678           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1679           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1680             : 
    1681           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1682           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1683           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1684             : 
    1685           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1686             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1687             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1688             :                             &xmm_mask_lo, &xmm_mask_hi,
    1689             :                             &xmm_mask_lo, &xmm_mask_hi);
    1690             : 
    1691             :         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1692             :                     &xmm_alpha_lo, &xmm_alpha_hi,
    1693             :                     &xmm_mask_lo, &xmm_mask_hi);
    1694             : 
    1695           0 :         save_128_aligned (
    1696             :             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
    1697             : 
    1698           0 :         ps += 4;
    1699           0 :         pd += 4;
    1700           0 :         pm += 4;
    1701           0 :         w -= 4;
    1702             :     }
    1703             : 
    1704           0 :     while (w)
    1705             :     {
    1706           0 :         s = *ps++;
    1707           0 :         m = *pm++;
    1708           0 :         d = *pd;
    1709             : 
    1710           0 :         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
    1711           0 :         w--;
    1712             :     }
    1713           0 : }
    1714             : 
    1715             : static void
    1716           0 : sse2_combine_in_ca (pixman_implementation_t *imp,
    1717             :                     pixman_op_t              op,
    1718             :                     uint32_t *               pd,
    1719             :                     const uint32_t *         ps,
    1720             :                     const uint32_t *         pm,
    1721             :                     int                      w)
    1722             : {
    1723             :     uint32_t s, m, d;
    1724             : 
    1725             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1726             :     __m128i xmm_src_lo, xmm_src_hi;
    1727             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1728             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1729             : 
    1730           0 :     while (w && (uintptr_t)pd & 15)
    1731             :     {
    1732           0 :         s = *ps++;
    1733           0 :         m = *pm++;
    1734           0 :         d = *pd;
    1735             : 
    1736           0 :         *pd++ = pack_1x128_32 (
    1737             :             pix_multiply_1x128 (
    1738             :                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1739             :                 expand_alpha_1x128 (unpack_32_1x128 (d))));
    1740             : 
    1741           0 :         w--;
    1742             :     }
    1743             : 
    1744           0 :     while (w >= 4)
    1745             :     {
    1746           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1747           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1748           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1749             : 
    1750           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1751           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1752           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1753             : 
    1754           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1755             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1756             : 
    1757             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1758             :                             &xmm_mask_lo, &xmm_mask_hi,
    1759             :                             &xmm_dst_lo, &xmm_dst_hi);
    1760             : 
    1761             :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1762             :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1763             :                             &xmm_dst_lo, &xmm_dst_hi);
    1764             : 
    1765           0 :         save_128_aligned (
    1766             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1767             : 
    1768           0 :         ps += 4;
    1769           0 :         pd += 4;
    1770           0 :         pm += 4;
    1771           0 :         w -= 4;
    1772             :     }
    1773             : 
    1774           0 :     while (w)
    1775             :     {
    1776           0 :         s = *ps++;
    1777           0 :         m = *pm++;
    1778           0 :         d = *pd;
    1779             : 
    1780           0 :         *pd++ = pack_1x128_32 (
    1781             :             pix_multiply_1x128 (
    1782             :                 pix_multiply_1x128 (
    1783             :                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1784             :                 expand_alpha_1x128 (unpack_32_1x128 (d))));
    1785             : 
    1786           0 :         w--;
    1787             :     }
    1788           0 : }
    1789             : 
    1790             : static void
    1791           0 : sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
    1792             :                             pixman_op_t              op,
    1793             :                             uint32_t *               pd,
    1794             :                             const uint32_t *         ps,
    1795             :                             const uint32_t *         pm,
    1796             :                             int                      w)
    1797             : {
    1798             :     uint32_t s, m, d;
    1799             : 
    1800             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1801             :     __m128i xmm_src_lo, xmm_src_hi;
    1802             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1803             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1804             : 
    1805           0 :     while (w && (uintptr_t)pd & 15)
    1806             :     {
    1807           0 :         s = *ps++;
    1808           0 :         m = *pm++;
    1809           0 :         d = *pd;
    1810             : 
    1811           0 :         *pd++ = pack_1x128_32 (
    1812             :             pix_multiply_1x128 (
    1813             :                 unpack_32_1x128 (d),
    1814             :                 pix_multiply_1x128 (unpack_32_1x128 (m),
    1815             :                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
    1816           0 :         w--;
    1817             :     }
    1818             : 
    1819           0 :     while (w >= 4)
    1820             :     {
    1821           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1822           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1823           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1824             : 
    1825           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1826           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1827           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1828             : 
    1829           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1830             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1831             :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    1832             :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1833             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1834             : 
    1835             :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1836             :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1837             :                             &xmm_dst_lo, &xmm_dst_hi);
    1838             : 
    1839           0 :         save_128_aligned (
    1840             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1841             : 
    1842           0 :         ps += 4;
    1843           0 :         pd += 4;
    1844           0 :         pm += 4;
    1845           0 :         w -= 4;
    1846             :     }
    1847             : 
    1848           0 :     while (w)
    1849             :     {
    1850           0 :         s = *ps++;
    1851           0 :         m = *pm++;
    1852           0 :         d = *pd;
    1853             : 
    1854           0 :         *pd++ = pack_1x128_32 (
    1855             :             pix_multiply_1x128 (
    1856             :                 unpack_32_1x128 (d),
    1857             :                 pix_multiply_1x128 (unpack_32_1x128 (m),
    1858             :                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
    1859           0 :         w--;
    1860             :     }
    1861           0 : }
    1862             : 
    1863             : static void
    1864           0 : sse2_combine_out_ca (pixman_implementation_t *imp,
    1865             :                      pixman_op_t              op,
    1866             :                      uint32_t *               pd,
    1867             :                      const uint32_t *         ps,
    1868             :                      const uint32_t *         pm,
    1869             :                      int                      w)
    1870             : {
    1871             :     uint32_t s, m, d;
    1872             : 
    1873             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1874             :     __m128i xmm_src_lo, xmm_src_hi;
    1875             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1876             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1877             : 
    1878           0 :     while (w && (uintptr_t)pd & 15)
    1879             :     {
    1880           0 :         s = *ps++;
    1881           0 :         m = *pm++;
    1882           0 :         d = *pd;
    1883             : 
    1884           0 :         *pd++ = pack_1x128_32 (
    1885             :             pix_multiply_1x128 (
    1886             :                 pix_multiply_1x128 (
    1887             :                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1888             :                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1889           0 :         w--;
    1890             :     }
    1891             : 
    1892           0 :     while (w >= 4)
    1893             :     {
    1894           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1895           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1896           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1897             : 
    1898           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1899           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1900           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1901             : 
    1902           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1903             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1904           0 :         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
    1905             :                       &xmm_alpha_lo, &xmm_alpha_hi);
    1906             : 
    1907             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1908             :                             &xmm_mask_lo, &xmm_mask_hi,
    1909             :                             &xmm_dst_lo, &xmm_dst_hi);
    1910             :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1911             :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1912             :                             &xmm_dst_lo, &xmm_dst_hi);
    1913             : 
    1914           0 :         save_128_aligned (
    1915             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1916             : 
    1917           0 :         ps += 4;
    1918           0 :         pd += 4;
    1919           0 :         pm += 4;
    1920           0 :         w -= 4;
    1921             :     }
    1922             : 
    1923           0 :     while (w)
    1924             :     {
    1925           0 :         s = *ps++;
    1926           0 :         m = *pm++;
    1927           0 :         d = *pd;
    1928             : 
    1929           0 :         *pd++ = pack_1x128_32 (
    1930             :             pix_multiply_1x128 (
    1931             :                 pix_multiply_1x128 (
    1932             :                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1933             :                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1934             : 
    1935           0 :         w--;
    1936             :     }
    1937           0 : }
    1938             : 
    1939             : static void
    1940           0 : sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
    1941             :                              pixman_op_t              op,
    1942             :                              uint32_t *               pd,
    1943             :                              const uint32_t *         ps,
    1944             :                              const uint32_t *         pm,
    1945             :                              int                      w)
    1946             : {
    1947             :     uint32_t s, m, d;
    1948             : 
    1949             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1950             :     __m128i xmm_src_lo, xmm_src_hi;
    1951             :     __m128i xmm_dst_lo, xmm_dst_hi;
    1952             :     __m128i xmm_mask_lo, xmm_mask_hi;
    1953             : 
    1954           0 :     while (w && (uintptr_t)pd & 15)
    1955             :     {
    1956           0 :         s = *ps++;
    1957           0 :         m = *pm++;
    1958           0 :         d = *pd;
    1959             : 
    1960           0 :         *pd++ = pack_1x128_32 (
    1961             :             pix_multiply_1x128 (
    1962             :                 unpack_32_1x128 (d),
    1963             :                 negate_1x128 (pix_multiply_1x128 (
    1964             :                                  unpack_32_1x128 (m),
    1965             :                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
    1966           0 :         w--;
    1967             :     }
    1968             : 
    1969           0 :     while (w >= 4)
    1970             :     {
    1971           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1972           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1973           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1974             : 
    1975           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1976           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1977           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1978             : 
    1979           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1980             :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1981             : 
    1982             :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    1983             :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1984             :                             &xmm_mask_lo, &xmm_mask_hi);
    1985             : 
    1986           0 :         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
    1987             :                       &xmm_mask_lo, &xmm_mask_hi);
    1988             : 
    1989             :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1990             :                             &xmm_mask_lo, &xmm_mask_hi,
    1991             :                             &xmm_dst_lo, &xmm_dst_hi);
    1992             : 
    1993           0 :         save_128_aligned (
    1994             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1995             : 
    1996           0 :         ps += 4;
    1997           0 :         pd += 4;
    1998           0 :         pm += 4;
    1999           0 :         w -= 4;
    2000             :     }
    2001             : 
    2002           0 :     while (w)
    2003             :     {
    2004           0 :         s = *ps++;
    2005           0 :         m = *pm++;
    2006           0 :         d = *pd;
    2007             : 
    2008           0 :         *pd++ = pack_1x128_32 (
    2009             :             pix_multiply_1x128 (
    2010             :                 unpack_32_1x128 (d),
    2011             :                 negate_1x128 (pix_multiply_1x128 (
    2012             :                                  unpack_32_1x128 (m),
    2013             :                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
    2014           0 :         w--;
    2015             :     }
    2016           0 : }
    2017             : 
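                      : /* Component-alpha ATOP for one pixel:
                      :  * result = (src * mask) * dst.a + dst * (1 - src.a * mask). */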
    2018             : static force_inline uint32_t
    2019             : core_combine_atop_ca_pixel_sse2 (uint32_t src,
    2020             :                                  uint32_t mask,
    2021             :                                  uint32_t dst)
    2022             : {
    2023           0 :     __m128i m = unpack_32_1x128 (mask);
    2024           0 :     __m128i s = unpack_32_1x128 (src);
    2025           0 :     __m128i d = unpack_32_1x128 (dst);
    2026           0 :     __m128i sa = expand_alpha_1x128 (s);
    2027           0 :     __m128i da = expand_alpha_1x128 (d);
    2028             : 
    2029           0 :     s = pix_multiply_1x128 (s, m);
    2030           0 :     m = negate_1x128 (pix_multiply_1x128 (m, sa));
    2031             : 
    2032           0 :     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
    2033             : }
    2034             : 
    2035             : static void
    2036           0 : sse2_combine_atop_ca (pixman_implementation_t *imp,
    2037             :                       pixman_op_t              op,
    2038             :                       uint32_t *               pd,
    2039             :                       const uint32_t *         ps,
    2040             :                       const uint32_t *         pm,
    2041             :                       int                      w)
    2042             : {
    2043             :     uint32_t s, m, d;
    2044             : 
    2045             :     __m128i xmm_src_lo, xmm_src_hi;
    2046             :     __m128i xmm_dst_lo, xmm_dst_hi;
    2047             :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    2048             :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    2049             :     __m128i xmm_mask_lo, xmm_mask_hi;
    2050             : 
    2051           0 :     while (w && (uintptr_t)pd & 15)
    2052             :     {
    2053           0 :         s = *ps++;
    2054           0 :         m = *pm++;
    2055           0 :         d = *pd;
    2056             : 
    2057           0 :         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
    2058           0 :         w--;
    2059             :     }
    2060             : 
    2061           0 :     while (w >= 4)
    2062             :     {
    2063           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2064           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2065           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2066             : 
    2067           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2068           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2069           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2070             : 
    2071           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2072             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    2073           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    2074             :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2075             : 
    2076             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2077             :                             &xmm_mask_lo, &xmm_mask_hi,
    2078             :                             &xmm_src_lo, &xmm_src_hi);
    2079             :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    2080             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    2081             :                             &xmm_mask_lo, &xmm_mask_hi);
    2082             : 
    2083           0 :         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2084             : 
    2085             :         pix_add_multiply_2x128 (
    2086             :             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    2087             :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    2088             :             &xmm_dst_lo, &xmm_dst_hi);
    2089             : 
    2090           0 :         save_128_aligned (
    2091             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2092             : 
    2093           0 :         ps += 4;
    2094           0 :         pd += 4;
    2095           0 :         pm += 4;
    2096           0 :         w -= 4;
    2097             :     }
    2098             : 
    2099           0 :     while (w)
    2100             :     {
    2101           0 :         s = *ps++;
    2102           0 :         m = *pm++;
    2103           0 :         d = *pd;
    2104             : 
    2105           0 :         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
    2106           0 :         w--;
    2107             :     }
    2108           0 : }
    2109             : 
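                      : /* Component-alpha ATOP_REVERSE for one pixel:
                      :  * result = (src * mask) * (1 - dst.a) + dst * (src.a * mask). */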
    2110             : static force_inline uint32_t
    2111             : core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
    2112             :                                          uint32_t mask,
    2113             :                                          uint32_t dst)
    2114             : {
    2115           0 :     __m128i m = unpack_32_1x128 (mask);
    2116           0 :     __m128i s = unpack_32_1x128 (src);
    2117           0 :     __m128i d = unpack_32_1x128 (dst);
    2118             : 
    2119           0 :     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    2120           0 :     __m128i sa = expand_alpha_1x128 (s);
    2121             : 
    2122           0 :     s = pix_multiply_1x128 (s, m);
    2123           0 :     m = pix_multiply_1x128 (m, sa);
    2124             : 
    2125           0 :     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
    2126             : }
    2127             : 
    2128             : static void
    2129           0 : sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
    2130             :                               pixman_op_t              op,
    2131             :                               uint32_t *               pd,
    2132             :                               const uint32_t *         ps,
    2133             :                               const uint32_t *         pm,
    2134             :                               int                      w)
    2135             : {
    2136             :     uint32_t s, m, d;
    2137             : 
    2138             :     __m128i xmm_src_lo, xmm_src_hi;
    2139             :     __m128i xmm_dst_lo, xmm_dst_hi;
    2140             :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    2141             :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    2142             :     __m128i xmm_mask_lo, xmm_mask_hi;
    2143             : 
    2144           0 :     while (w && (uintptr_t)pd & 15)
    2145             :     {
    2146           0 :         s = *ps++;
    2147           0 :         m = *pm++;
    2148           0 :         d = *pd;
    2149             : 
    2150           0 :         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
    2151           0 :         w--;
    2152             :     }
    2153             : 
    2154           0 :     while (w >= 4)
    2155             :     {
    2156           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2157           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2158           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2159             : 
    2160           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2161           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2162           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2163             : 
    2164           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2165             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    2166           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    2167             :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2168             : 
    2169             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2170             :                             &xmm_mask_lo, &xmm_mask_hi,
    2171             :                             &xmm_src_lo, &xmm_src_hi);
    2172             :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    2173             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    2174             :                             &xmm_mask_lo, &xmm_mask_hi);
    2175             : 
    2176           0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    2177             :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2178             : 
    2179             :         pix_add_multiply_2x128 (
    2180             :             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    2181             :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    2182             :             &xmm_dst_lo, &xmm_dst_hi);
    2183             : 
    2184           0 :         save_128_aligned (
    2185             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2186             : 
    2187           0 :         ps += 4;
    2188           0 :         pd += 4;
    2189           0 :         pm += 4;
    2190           0 :         w -= 4;
    2191             :     }
    2192             : 
    2193           0 :     while (w)
    2194             :     {
    2195           0 :         s = *ps++;
    2196           0 :         m = *pm++;
    2197           0 :         d = *pd;
    2198             : 
    2199           0 :         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
    2200           0 :         w--;
    2201             :     }
    2202           0 : }
    2203             : 
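                      : /* Component-alpha XOR for one pixel:
                      :  * result = (src * mask) * (1 - dst.a) + dst * (1 - src.a * mask). */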
    2204             : static force_inline uint32_t
    2205             : core_combine_xor_ca_pixel_sse2 (uint32_t src,
    2206             :                                 uint32_t mask,
    2207             :                                 uint32_t dst)
    2208             : {
    2209           0 :     __m128i a = unpack_32_1x128 (mask);
    2210           0 :     __m128i s = unpack_32_1x128 (src);
    2211           0 :     __m128i d = unpack_32_1x128 (dst);
    2212             : 
    2213           0 :     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
    2214             :                                        a, expand_alpha_1x128 (s)));
    2215           0 :     __m128i dest      = pix_multiply_1x128 (s, a);
    2216           0 :     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
    2217             : 
    2218           0 :     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
    2219             :                                                 &alpha_dst,
    2220             :                                                 &dest,
    2221             :                                                 &alpha_src));
    2222             : }
    2223             : 
    2224             : static void
    2225           0 : sse2_combine_xor_ca (pixman_implementation_t *imp,
    2226             :                      pixman_op_t              op,
    2227             :                      uint32_t *               pd,
    2228             :                      const uint32_t *         ps,
    2229             :                      const uint32_t *         pm,
    2230             :                      int                      w)
    2231             : {
    2232             :     uint32_t s, m, d;
    2233             : 
    2234             :     __m128i xmm_src_lo, xmm_src_hi;
    2235             :     __m128i xmm_dst_lo, xmm_dst_hi;
    2236             :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    2237             :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    2238             :     __m128i xmm_mask_lo, xmm_mask_hi;
    2239             : 
    2240           0 :     while (w && (uintptr_t)pd & 15)
    2241             :     {
    2242           0 :         s = *ps++;
    2243           0 :         m = *pm++;
    2244           0 :         d = *pd;
    2245             : 
    2246           0 :         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
    2247           0 :         w--;
    2248             :     }
    2249             : 
    2250           0 :     while (w >= 4)
    2251             :     {
    2252           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2253           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2254           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2255             : 
    2256           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2257           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2258           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2259             : 
    2260           0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2261             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    2262           0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    2263             :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2264             : 
    2265             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2266             :                             &xmm_mask_lo, &xmm_mask_hi,
    2267             :                             &xmm_src_lo, &xmm_src_hi);
    2268             :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    2269             :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    2270             :                             &xmm_mask_lo, &xmm_mask_hi);
    2271             : 
    2272           0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    2273             :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2274           0 :         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
    2275             :                       &xmm_mask_lo, &xmm_mask_hi);
    2276             : 
    2277             :         pix_add_multiply_2x128 (
    2278             :             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    2279             :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    2280             :             &xmm_dst_lo, &xmm_dst_hi);
    2281             : 
    2282           0 :         save_128_aligned (
    2283             :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2284             : 
    2285           0 :         ps += 4;
    2286           0 :         pd += 4;
    2287           0 :         pm += 4;
    2288           0 :         w -= 4;
    2289             :     }
    2290             : 
    2291           0 :     while (w)
    2292             :     {
    2293           0 :         s = *ps++;
    2294           0 :         m = *pm++;
    2295           0 :         d = *pd;
    2296             : 
    2297           0 :         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
    2298           0 :         w--;
    2299             :     }
    2300           0 : }
    2301             : 
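                     : /* Component-alpha ADD: dest = saturate (s * m + d).  A scalar head
                     :  * loop aligns pd to 16 bytes, the main loop processes four pixels
                     :  * per iteration, and a scalar tail finishes the scanline.
                     :  */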
    2302             : static void
    2303           0 : sse2_combine_add_ca (pixman_implementation_t *imp,
    2304             :                      pixman_op_t              op,
    2305             :                      uint32_t *               pd,
    2306             :                      const uint32_t *         ps,
    2307             :                      const uint32_t *         pm,
    2308             :                      int                      w)
    2309             : {
    2310             :     uint32_t s, m, d;
    2311             : 
    2312             :     __m128i xmm_src_lo, xmm_src_hi;
    2313             :     __m128i xmm_dst_lo, xmm_dst_hi;
    2314             :     __m128i xmm_mask_lo, xmm_mask_hi;
    2315             : 
    2316           0 :     while (w && (uintptr_t)pd & 15)
    2317             :     {
    2318           0 :         s = *ps++;
    2319           0 :         m = *pm++;
    2320           0 :         d = *pd;
    2321             : 
    2322           0 :         *pd++ = pack_1x128_32 (
    2323             :             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
    2324             :                                                unpack_32_1x128 (m)),
    2325             :                            unpack_32_1x128 (d)));
    2326           0 :         w--;
    2327             :     }
    2328             : 
    2329           0 :     while (w >= 4)
    2330             :     {
    2331           0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2332           0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2333           0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2334             : 
    2335           0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2336           0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2337           0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2338             : 
    2339             :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2340             :                             &xmm_mask_lo, &xmm_mask_hi,
    2341             :                             &xmm_src_lo, &xmm_src_hi);
    2342             : 
    2343           0 :         save_128_aligned (
    2344             :             (__m128i*)pd, pack_2x128_128 (
    2345             :                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
    2346             :                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
    2347             : 
    2348           0 :         ps += 4;
    2349           0 :         pd += 4;
    2350           0 :         pm += 4;
    2351           0 :         w -= 4;
    2352             :     }
    2353             : 
    2354           0 :     while (w)
    2355             :     {
    2356           0 :         s = *ps++;
    2357           0 :         m = *pm++;
    2358           0 :         d = *pd;
    2359             : 
    2360           0 :         *pd++ = pack_1x128_32 (
    2361             :             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
    2362             :                                                unpack_32_1x128 (m)),
    2363             :                            unpack_32_1x128 (d)));
    2364           0 :         w--;
    2365             :     }
    2366           0 : }
    2367             : 
    2368             : static force_inline __m128i
    2369             : create_mask_16_128 (uint16_t mask)
    2370             : {
    2371           8 :     return _mm_set1_epi16 (mask);
    2372             : }
    2373             : 
    2374             : /* Work around a code generation bug in Sun Studio 12. */
    2375             : #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
    2376             : # define create_mask_2x32_128(mask0, mask1)                             \
    2377             :     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
    2378             : #else
    2379             : static force_inline __m128i
    2380             : create_mask_2x32_128 (uint32_t mask0,
    2381             :                       uint32_t mask1)
    2382             : {
    2383          26 :     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
    2384             : }
    2385             : #endif
    2386             : 
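                     : /* OVER of a solid source onto an a8r8g8b8 destination.  The expanded
                     :  * source and its alpha are computed once outside the loops; each
                     :  * scanline uses an aligning head loop, a four-pixel SSE2 loop and a
                     :  * scalar tail.
                     :  */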
    2387             : static void
    2388           0 : sse2_composite_over_n_8888 (pixman_implementation_t *imp,
    2389             :                             pixman_composite_info_t *info)
    2390             : {
    2391           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2392             :     uint32_t src;
    2393             :     uint32_t    *dst_line, *dst, d;
    2394             :     int32_t w;
    2395             :     int dst_stride;
    2396             :     __m128i xmm_src, xmm_alpha;
    2397             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2398             : 
    2399           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    2400             : 
    2401           0 :     if (src == 0)
    2402           0 :         return;
    2403             : 
    2404           0 :     PIXMAN_IMAGE_GET_LINE (
    2405             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2406             : 
    2407           0 :     xmm_src = expand_pixel_32_1x128 (src);
    2408           0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2409             : 
    2410           0 :     while (height--)
    2411             :     {
    2412           0 :         dst = dst_line;
    2413             : 
    2414           0 :         dst_line += dst_stride;
    2415           0 :         w = width;
    2416             : 
    2417           0 :         while (w && (uintptr_t)dst & 15)
    2418             :         {
    2419           0 :             d = *dst;
    2420           0 :             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
    2421             :                                                 xmm_alpha,
    2422             :                                                 unpack_32_1x128 (d)));
    2423           0 :             w--;
    2424             :         }
    2425             : 
    2426           0 :         while (w >= 4)
    2427             :         {
    2428           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    2429             : 
    2430             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    2431             : 
    2432             :             over_2x128 (&xmm_src, &xmm_src,
    2433             :                         &xmm_alpha, &xmm_alpha,
    2434             :                         &xmm_dst_lo, &xmm_dst_hi);
    2435             : 
    2436             :             /* rebuild the 4 pixel data and save */
    2437           0 :             save_128_aligned (
    2438             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2439             : 
    2440           0 :             w -= 4;
    2441           0 :             dst += 4;
    2442             :         }
    2443             : 
    2444           0 :         while (w)
    2445             :         {
    2446           0 :             d = *dst;
    2447           0 :             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
    2448             :                                                 xmm_alpha,
    2449             :                                                 unpack_32_1x128 (d)));
    2450           0 :             w--;
    2451             :         }
    2452             : 
    2453             :     }
    2454             : }
    2455             : 
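                     : /* OVER of a solid source onto an r5g6b5 destination.  Eight 565
                     :  * pixels are loaded per iteration, expanded to 16-bit channels,
                     :  * blended and packed back to 565 before being stored.
                     :  */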
    2456             : static void
    2457           0 : sse2_composite_over_n_0565 (pixman_implementation_t *imp,
    2458             :                             pixman_composite_info_t *info)
    2459             : {
    2460           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2461             :     uint32_t src;
    2462             :     uint16_t    *dst_line, *dst, d;
    2463             :     int32_t w;
    2464             :     int dst_stride;
    2465             :     __m128i xmm_src, xmm_alpha;
    2466             :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    2467             : 
    2468           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    2469             : 
    2470           0 :     if (src == 0)
    2471           0 :         return;
    2472             : 
    2473           0 :     PIXMAN_IMAGE_GET_LINE (
    2474             :         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    2475             : 
    2476           0 :     xmm_src = expand_pixel_32_1x128 (src);
    2477           0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2478             : 
    2479           0 :     while (height--)
    2480             :     {
    2481           0 :         dst = dst_line;
    2482             : 
    2483           0 :         dst_line += dst_stride;
    2484           0 :         w = width;
    2485             : 
    2486           0 :         while (w && (uintptr_t)dst & 15)
    2487             :         {
    2488           0 :             d = *dst;
    2489             : 
    2490           0 :             *dst++ = pack_565_32_16 (
    2491             :                 pack_1x128_32 (over_1x128 (xmm_src,
    2492             :                                            xmm_alpha,
    2493             :                                            expand565_16_1x128 (d))));
    2494           0 :             w--;
    2495             :         }
    2496             : 
    2497           0 :         while (w >= 8)
    2498             :         {
    2499           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    2500             : 
    2501             :             unpack_565_128_4x128 (xmm_dst,
    2502             :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    2503             : 
    2504             :             over_2x128 (&xmm_src, &xmm_src,
    2505             :                         &xmm_alpha, &xmm_alpha,
    2506             :                         &xmm_dst0, &xmm_dst1);
    2507             :             over_2x128 (&xmm_src, &xmm_src,
    2508             :                         &xmm_alpha, &xmm_alpha,
    2509             :                         &xmm_dst2, &xmm_dst3);
    2510             : 
    2511           0 :             xmm_dst = pack_565_4x128_128 (
    2512             :                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    2513             : 
    2514             :             save_128_aligned ((__m128i*)dst, xmm_dst);
    2515             : 
    2516           0 :             dst += 8;
    2517           0 :             w -= 8;
    2518             :         }
    2519             : 
    2520           0 :         while (w--)
    2521             :         {
    2522           0 :             d = *dst;
    2523           0 :             *dst++ = pack_565_32_16 (
    2524             :                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
    2525             :                                            expand565_16_1x128 (d))));
    2526             :         }
    2527             :     }
    2528             : 
    2529             : }
    2530             : 
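                     : /* ADD of a solid source through an a8r8g8b8 component-alpha mask.
                     :  * A compare/movemask test detects all-zero mask vectors so the
                     :  * destination is neither read nor written for them.
                     :  */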
    2531             : static void
    2532           0 : sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
    2533             :                                    pixman_composite_info_t *info)
    2534             : {
    2535           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2536             :     uint32_t src;
    2537             :     uint32_t    *dst_line, d;
    2538             :     uint32_t    *mask_line, m;
    2539             :     uint32_t pack_cmp;
    2540             :     int dst_stride, mask_stride;
    2541             : 
    2542             :     __m128i xmm_src;
    2543             :     __m128i xmm_dst;
    2544             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    2545             : 
    2546             :     __m128i mmx_src, mmx_mask, mmx_dest;
    2547             : 
    2548           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    2549             : 
    2550           0 :     if (src == 0)
    2551           0 :         return;
    2552             : 
    2553           0 :     PIXMAN_IMAGE_GET_LINE (
    2554             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2555           0 :     PIXMAN_IMAGE_GET_LINE (
    2556             :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    2557             : 
    2558           0 :     xmm_src = _mm_unpacklo_epi8 (
    2559             :         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    2560           0 :     mmx_src   = xmm_src;
    2561             : 
    2562           0 :     while (height--)
    2563             :     {
    2564           0 :         int w = width;
    2565           0 :         const uint32_t *pm = (uint32_t *)mask_line;
    2566           0 :         uint32_t *pd = (uint32_t *)dst_line;
    2567             : 
    2568           0 :         dst_line += dst_stride;
    2569           0 :         mask_line += mask_stride;
    2570             : 
    2571           0 :         while (w && (uintptr_t)pd & 15)
    2572             :         {
    2573           0 :             m = *pm++;
    2574             : 
    2575           0 :             if (m)
    2576             :             {
    2577           0 :                 d = *pd;
    2578             : 
    2579           0 :                 mmx_mask = unpack_32_1x128 (m);
    2580           0 :                 mmx_dest = unpack_32_1x128 (d);
    2581             : 
    2582           0 :                 *pd = pack_1x128_32 (
    2583             :                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
    2584             :                                    mmx_dest));
    2585             :             }
    2586             : 
    2587           0 :             pd++;
    2588           0 :             w--;
    2589             :         }
    2590             : 
    2591           0 :         while (w >= 4)
    2592             :         {
    2593           0 :             xmm_mask = load_128_unaligned ((__m128i*)pm);
    2594             : 
    2595           0 :             pack_cmp =
    2596           0 :                 _mm_movemask_epi8 (
    2597             :                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    2598             : 
    2599             :             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
    2600           0 :             if (pack_cmp != 0xffff)
    2601             :             {
    2602           0 :                 xmm_dst = load_128_aligned ((__m128i*)pd);
    2603             : 
    2604             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    2605             : 
    2606             :                 pix_multiply_2x128 (&xmm_src, &xmm_src,
    2607             :                                     &xmm_mask_lo, &xmm_mask_hi,
    2608             :                                     &xmm_mask_lo, &xmm_mask_hi);
    2609           0 :                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
    2610             : 
    2611           0 :                 save_128_aligned (
    2612             :                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
    2613             :             }
    2614             : 
    2615           0 :             pd += 4;
    2616           0 :             pm += 4;
    2617           0 :             w -= 4;
    2618             :         }
    2619             : 
    2620           0 :         while (w)
    2621             :         {
    2622           0 :             m = *pm++;
    2623             : 
    2624           0 :             if (m)
    2625             :             {
    2626           0 :                 d = *pd;
    2627             : 
    2628           0 :                 mmx_mask = unpack_32_1x128 (m);
    2629           0 :                 mmx_dest = unpack_32_1x128 (d);
    2630             : 
    2631           0 :                 *pd = pack_1x128_32 (
    2632             :                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
    2633             :                                    mmx_dest));
    2634             :             }
    2635             : 
    2636           0 :             pd++;
    2637           0 :             w--;
    2638             :         }
    2639             :     }
    2640             : 
    2641             : }
    2642             : 
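                     : /* OVER of a solid source through an a8r8g8b8 component-alpha mask,
                     :  * skipping four-pixel groups whose mask words are all zero.
                     :  */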
    2643             : static void
    2644           0 : sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
    2645             :                                     pixman_composite_info_t *info)
    2646             : {
    2647           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2648             :     uint32_t src;
    2649             :     uint32_t    *dst_line, d;
    2650             :     uint32_t    *mask_line, m;
    2651             :     uint32_t pack_cmp;
    2652             :     int dst_stride, mask_stride;
    2653             : 
    2654             :     __m128i xmm_src, xmm_alpha;
    2655             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2656             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    2657             : 
    2658             :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    2659             : 
    2660           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    2661             : 
    2662           0 :     if (src == 0)
    2663           0 :         return;
    2664             : 
    2665           0 :     PIXMAN_IMAGE_GET_LINE (
    2666             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2667           0 :     PIXMAN_IMAGE_GET_LINE (
    2668             :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    2669             : 
    2670           0 :     xmm_src = _mm_unpacklo_epi8 (
    2671             :         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    2672           0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2673           0 :     mmx_src   = xmm_src;
    2674           0 :     mmx_alpha = xmm_alpha;
    2675             : 
    2676           0 :     while (height--)
    2677             :     {
    2678           0 :         int w = width;
    2679           0 :         const uint32_t *pm = (uint32_t *)mask_line;
    2680           0 :         uint32_t *pd = (uint32_t *)dst_line;
    2681             : 
    2682           0 :         dst_line += dst_stride;
    2683           0 :         mask_line += mask_stride;
    2684             : 
    2685           0 :         while (w && (uintptr_t)pd & 15)
    2686             :         {
    2687           0 :             m = *pm++;
    2688             : 
    2689           0 :             if (m)
    2690             :             {
    2691           0 :                 d = *pd;
    2692           0 :                 mmx_mask = unpack_32_1x128 (m);
    2693           0 :                 mmx_dest = unpack_32_1x128 (d);
    2694             : 
    2695           0 :                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
    2696             :                                                   &mmx_alpha,
    2697             :                                                   &mmx_mask,
    2698             :                                                   &mmx_dest));
    2699             :             }
    2700             : 
    2701           0 :             pd++;
    2702           0 :             w--;
    2703             :         }
    2704             : 
    2705           0 :         while (w >= 4)
    2706             :         {
    2707           0 :             xmm_mask = load_128_unaligned ((__m128i*)pm);
    2708             : 
    2709           0 :             pack_cmp =
    2710           0 :                 _mm_movemask_epi8 (
    2711             :                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    2712             : 
    2713             :             /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
    2714           0 :             if (pack_cmp != 0xffff)
    2715             :             {
    2716           0 :                 xmm_dst = load_128_aligned ((__m128i*)pd);
    2717             : 
    2718             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    2719             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    2720             : 
    2721             :                 in_over_2x128 (&xmm_src, &xmm_src,
    2722             :                                &xmm_alpha, &xmm_alpha,
    2723             :                                &xmm_mask_lo, &xmm_mask_hi,
    2724             :                                &xmm_dst_lo, &xmm_dst_hi);
    2725             : 
    2726           0 :                 save_128_aligned (
    2727             :                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2728             :             }
    2729             : 
    2730           0 :             pd += 4;
    2731           0 :             pm += 4;
    2732           0 :             w -= 4;
    2733             :         }
    2734             : 
    2735           0 :         while (w)
    2736             :         {
    2737           0 :             m = *pm++;
    2738             : 
    2739           0 :             if (m)
    2740             :             {
    2741           0 :                 d = *pd;
    2742           0 :                 mmx_mask = unpack_32_1x128 (m);
    2743           0 :                 mmx_dest = unpack_32_1x128 (d);
    2744             : 
    2745           0 :                 *pd = pack_1x128_32 (
    2746             :                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
    2747             :             }
    2748             : 
    2749           0 :             pd++;
    2750           0 :             w--;
    2751             :         }
    2752             :     }
    2753             : 
    2754             : }
    2755             : 
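                     : /* OVER of an a8r8g8b8 source with a solid mask.  Only the alpha byte
                     :  * of the mask is used; it is replicated across all lanes with
                     :  * create_mask_16_128 (mask >> 24).
                     :  */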
    2756             : static void
    2757           0 : sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
    2758             :                                  pixman_composite_info_t *info)
    2759             : {
    2760           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2761             :     uint32_t    *dst_line, *dst;
    2762             :     uint32_t    *src_line, *src;
    2763             :     uint32_t mask;
    2764             :     int32_t w;
    2765             :     int dst_stride, src_stride;
    2766             : 
    2767             :     __m128i xmm_mask;
    2768             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    2769             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2770             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    2771             : 
    2772           0 :     PIXMAN_IMAGE_GET_LINE (
    2773             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2774           0 :     PIXMAN_IMAGE_GET_LINE (
    2775             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2776             : 
    2777           0 :     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
    2778             : 
    2779           0 :     xmm_mask = create_mask_16_128 (mask >> 24);
    2780             : 
    2781           0 :     while (height--)
    2782             :     {
    2783           0 :         dst = dst_line;
    2784           0 :         dst_line += dst_stride;
    2785           0 :         src = src_line;
    2786           0 :         src_line += src_stride;
    2787           0 :         w = width;
    2788             : 
    2789           0 :         while (w && (uintptr_t)dst & 15)
    2790             :         {
    2791           0 :             uint32_t s = *src++;
    2792             : 
    2793           0 :             if (s)
    2794             :             {
    2795           0 :                 uint32_t d = *dst;
    2796             :                 
    2797           0 :                 __m128i ms = unpack_32_1x128 (s);
    2798           0 :                 __m128i alpha    = expand_alpha_1x128 (ms);
    2799           0 :                 __m128i dest     = xmm_mask;
    2800           0 :                 __m128i alpha_dst = unpack_32_1x128 (d);
    2801             :                 
    2802           0 :                 *dst = pack_1x128_32 (
    2803             :                     in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    2804             :             }
    2805           0 :             dst++;
    2806           0 :             w--;
    2807             :         }
    2808             : 
    2809           0 :         while (w >= 4)
    2810             :         {
    2811           0 :             xmm_src = load_128_unaligned ((__m128i*)src);
    2812             : 
    2813           0 :             if (!is_zero (xmm_src))
    2814             :             {
    2815           0 :                 xmm_dst = load_128_aligned ((__m128i*)dst);
    2816             :                 
    2817             :                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    2818             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    2819           0 :                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2820             :                                     &xmm_alpha_lo, &xmm_alpha_hi);
    2821             :                 
    2822             :                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    2823             :                                &xmm_alpha_lo, &xmm_alpha_hi,
    2824             :                                &xmm_mask, &xmm_mask,
    2825             :                                &xmm_dst_lo, &xmm_dst_hi);
    2826             :                 
    2827           0 :                 save_128_aligned (
    2828             :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2829             :             }
    2830             :                 
    2831           0 :             dst += 4;
    2832           0 :             src += 4;
    2833           0 :             w -= 4;
    2834             :         }
    2835             : 
    2836           0 :         while (w)
    2837             :         {
    2838           0 :             uint32_t s = *src++;
    2839             : 
    2840           0 :             if (s)
    2841             :             {
    2842           0 :                 uint32_t d = *dst;
    2843             :                 
    2844           0 :                 __m128i ms = unpack_32_1x128 (s);
    2845           0 :                 __m128i alpha = expand_alpha_1x128 (ms);
    2846           0 :                 __m128i mask  = xmm_mask;
    2847           0 :                 __m128i dest  = unpack_32_1x128 (d);
    2848             :                 
    2849           0 :                 *dst = pack_1x128_32 (
    2850             :                     in_over_1x128 (&ms, &alpha, &mask, &dest));
    2851             :             }
    2852             : 
    2853           0 :             dst++;
    2854           0 :             w--;
    2855             :         }
    2856             :     }
    2857             : 
    2858           0 : }
    2859             : 
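                     : /* SRC conversion from x8r8g8b8 to r5g6b5: pixels are repacked to 565
                     :  * eight at a time with pack_565_2packedx128_128; no blending is done.
                     :  */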
    2860             : static void
    2861           0 : sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
    2862             :                               pixman_composite_info_t *info)
    2863             : {
    2864           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2865             :     uint16_t    *dst_line, *dst;
    2866             :     uint32_t    *src_line, *src, s;
    2867             :     int dst_stride, src_stride;
    2868             :     int32_t w;
    2869             : 
    2870           0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2871           0 :     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    2872             : 
    2873           0 :     while (height--)
    2874             :     {
    2875           0 :         dst = dst_line;
    2876           0 :         dst_line += dst_stride;
    2877           0 :         src = src_line;
    2878           0 :         src_line += src_stride;
    2879           0 :         w = width;
    2880             : 
    2881           0 :         while (w && (uintptr_t)dst & 15)
    2882             :         {
    2883           0 :             s = *src++;
    2884           0 :             *dst = convert_8888_to_0565 (s);
    2885           0 :             dst++;
    2886           0 :             w--;
    2887             :         }
    2888             : 
    2889           0 :         while (w >= 8)
    2890             :         {
    2891           0 :             __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
    2892           0 :             __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
    2893             : 
    2894           0 :             save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
    2895             : 
    2896           0 :             w -= 8;
    2897           0 :             src += 8;
    2898           0 :             dst += 8;
    2899             :         }
    2900             : 
    2901           0 :         while (w)
    2902             :         {
    2903           0 :             s = *src++;
    2904           0 :             *dst = convert_8888_to_0565 (s);
    2905           0 :             dst++;
    2906           0 :             w--;
    2907             :         }
    2908             :     }
    2909           0 : }
    2910             : 
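                     : /* SRC copy from x8r8g8b8 to a8r8g8b8: the alpha byte is forced to
                     :  * 0xff by ORing with mask_ff000000, sixteen pixels per iteration.
                     :  */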
    2911             : static void
    2912           0 : sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
    2913             :                               pixman_composite_info_t *info)
    2914             : {
    2915           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2916             :     uint32_t    *dst_line, *dst;
    2917             :     uint32_t    *src_line, *src;
    2918             :     int32_t w;
    2919             :     int dst_stride, src_stride;
    2920             : 
    2921             : 
    2922           0 :     PIXMAN_IMAGE_GET_LINE (
    2923             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2924           0 :     PIXMAN_IMAGE_GET_LINE (
    2925             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2926             : 
    2927           0 :     while (height--)
    2928             :     {
    2929           0 :         dst = dst_line;
    2930           0 :         dst_line += dst_stride;
    2931           0 :         src = src_line;
    2932           0 :         src_line += src_stride;
    2933           0 :         w = width;
    2934             : 
    2935           0 :         while (w && (uintptr_t)dst & 15)
    2936             :         {
    2937           0 :             *dst++ = *src++ | 0xff000000;
    2938           0 :             w--;
    2939             :         }
    2940             : 
    2941           0 :         while (w >= 16)
    2942             :         {
    2943             :             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
    2944             :             
    2945           0 :             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
    2946           0 :             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
    2947           0 :             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
    2948           0 :             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
    2949             :             
    2950           0 :             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
    2951           0 :             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
    2952           0 :             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
    2953           0 :             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
    2954             :             
    2955           0 :             dst += 16;
    2956           0 :             src += 16;
    2957           0 :             w -= 16;
    2958             :         }
    2959             : 
    2960           0 :         while (w)
    2961             :         {
    2962           0 :             *dst++ = *src++ | 0xff000000;
    2963           0 :             w--;
    2964             :         }
    2965             :     }
    2966             : 
    2967           0 : }
    2968             : 
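                     : /* OVER of an x8r8g8b8 source (treated as opaque) with a solid mask.
                     :  * The source alpha is forced to 0xff, so mask_00ff serves as the
                     :  * expanded source alpha.
                     :  */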
    2969             : static void
    2970           0 : sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
    2971             :                                  pixman_composite_info_t *info)
    2972             : {
    2973           0 :     PIXMAN_COMPOSITE_ARGS (info);
    2974             :     uint32_t    *dst_line, *dst;
    2975             :     uint32_t    *src_line, *src;
    2976             :     uint32_t mask;
    2977             :     int dst_stride, src_stride;
    2978             :     int32_t w;
    2979             : 
    2980             :     __m128i xmm_mask, xmm_alpha;
    2981             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    2982             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2983             : 
    2984           0 :     PIXMAN_IMAGE_GET_LINE (
    2985             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2986           0 :     PIXMAN_IMAGE_GET_LINE (
    2987             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2988             : 
    2989           0 :     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
    2990             : 
    2991           0 :     xmm_mask = create_mask_16_128 (mask >> 24);
    2992           0 :     xmm_alpha = mask_00ff;
    2993             : 
    2994           0 :     while (height--)
    2995             :     {
    2996           0 :         dst = dst_line;
    2997           0 :         dst_line += dst_stride;
    2998           0 :         src = src_line;
    2999           0 :         src_line += src_stride;
    3000           0 :         w = width;
    3001             : 
    3002           0 :         while (w && (uintptr_t)dst & 15)
    3003             :         {
    3004           0 :             uint32_t s = (*src++) | 0xff000000;
    3005           0 :             uint32_t d = *dst;
    3006             : 
    3007           0 :             __m128i src   = unpack_32_1x128 (s);
    3008           0 :             __m128i alpha = xmm_alpha;
    3009           0 :             __m128i mask  = xmm_mask;
    3010           0 :             __m128i dest  = unpack_32_1x128 (d);
    3011             : 
    3012           0 :             *dst++ = pack_1x128_32 (
    3013             :                 in_over_1x128 (&src, &alpha, &mask, &dest));
    3014             : 
    3015           0 :             w--;
    3016             :         }
    3017             : 
    3018           0 :         while (w >= 4)
    3019             :         {
    3020           0 :             xmm_src = _mm_or_si128 (
    3021             :                 load_128_unaligned ((__m128i*)src), mask_ff000000);
    3022           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    3023             : 
    3024             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3025             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    3026             : 
    3027             :             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    3028             :                            &xmm_alpha, &xmm_alpha,
    3029             :                            &xmm_mask, &xmm_mask,
    3030             :                            &xmm_dst_lo, &xmm_dst_hi);
    3031             : 
    3032           0 :             save_128_aligned (
    3033             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3034             : 
    3035           0 :             dst += 4;
    3036           0 :             src += 4;
    3037           0 :             w -= 4;
    3038             : 
    3039             :         }
    3040             : 
    3041           0 :         while (w)
    3042             :         {
    3043           0 :             uint32_t s = (*src++) | 0xff000000;
    3044           0 :             uint32_t d = *dst;
    3045             : 
    3046           0 :             __m128i src  = unpack_32_1x128 (s);
    3047           0 :             __m128i alpha = xmm_alpha;
    3048           0 :             __m128i mask  = xmm_mask;
    3049           0 :             __m128i dest  = unpack_32_1x128 (d);
    3050             : 
    3051           0 :             *dst++ = pack_1x128_32 (
    3052             :                 in_over_1x128 (&src, &alpha, &mask, &dest));
    3053             : 
    3054           0 :             w--;
    3055             :         }
    3056             :     }
    3057             : 
    3058           0 : }
    3059             : 
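                     : /* OVER of a8r8g8b8 onto a8r8g8b8: runs the sse2_combine_over_u
                     :  * combiner once per scanline.
                     :  */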
    3060             : static void
    3061           0 : sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
    3062             :                                pixman_composite_info_t *info)
    3063             : {
    3064           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3065             :     int dst_stride, src_stride;
    3066             :     uint32_t    *dst_line, *dst;
    3067             :     uint32_t    *src_line, *src;
    3068             : 
    3069           0 :     PIXMAN_IMAGE_GET_LINE (
    3070             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3071           0 :     PIXMAN_IMAGE_GET_LINE (
    3072             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3073             : 
    3074           0 :     dst = dst_line;
    3075           0 :     src = src_line;
    3076             : 
    3077           0 :     while (height--)
    3078             :     {
    3079             :         sse2_combine_over_u (imp, op, dst, src, NULL, width);
    3080             : 
    3081           0 :         dst += dst_stride;
    3082           0 :         src += src_stride;
    3083             :     }
    3084           0 : }
    3085             : 
    3086             : static force_inline uint16_t
    3087             : composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
    3088             : {
    3089             :     __m128i ms;
    3090             : 
    3091           0 :     ms = unpack_32_1x128 (src);
    3092           0 :     return pack_565_32_16 (
    3093             :         pack_1x128_32 (
    3094             :             over_1x128 (
    3095             :                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
    3096             : }
    3097             : 
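                     : /* OVER of a8r8g8b8 onto r5g6b5.  Eight destination pixels are
                     :  * expanded from 565 per iteration; the second group of four source
                     :  * pixels is loaded before the first blend to overlap the memory read.
                     :  */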
    3098             : static void
    3099           0 : sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
    3100             :                                pixman_composite_info_t *info)
    3101             : {
    3102           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3103             :     uint16_t    *dst_line, *dst, d;
    3104             :     uint32_t    *src_line, *src, s;
    3105             :     int dst_stride, src_stride;
    3106             :     int32_t w;
    3107             : 
    3108             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    3109             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    3110             :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3111             : 
    3112           0 :     PIXMAN_IMAGE_GET_LINE (
    3113             :         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3114           0 :     PIXMAN_IMAGE_GET_LINE (
    3115             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3116             : 
    3117           0 :     while (height--)
    3118             :     {
    3119           0 :         dst = dst_line;
    3120           0 :         src = src_line;
    3121             : 
    3122           0 :         dst_line += dst_stride;
    3123           0 :         src_line += src_stride;
    3124           0 :         w = width;
    3125             : 
    3126             :         /* Align dst on a 16-byte boundary */
    3127           0 :         while (w &&
    3128           0 :                ((uintptr_t)dst & 15))
    3129             :         {
    3130           0 :             s = *src++;
    3131           0 :             d = *dst;
    3132             : 
    3133           0 :             *dst++ = composite_over_8888_0565pixel (s, d);
    3134           0 :             w--;
    3135             :         }
    3136             : 
    3137             :         /* It's an 8-pixel loop */
    3138           0 :         while (w >= 8)
    3139             :         {
    3140             :             /* I'm loading unaligned because I'm not sure
    3141             :              * about the address alignment.
    3142             :              */
    3143           0 :             xmm_src = load_128_unaligned ((__m128i*) src);
    3144           0 :             xmm_dst = load_128_aligned ((__m128i*) dst);
    3145             : 
    3146             :             /* Unpacking */
    3147             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3148             :             unpack_565_128_4x128 (xmm_dst,
    3149             :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3150           0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    3151             :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    3152             : 
    3153             :             /* I'm loading the next 4 pixels from memory
    3154             :              * early, to optimize the memory read.
    3155             :              */
    3156           0 :             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
    3157             : 
    3158             :             over_2x128 (&xmm_src_lo, &xmm_src_hi,
    3159             :                         &xmm_alpha_lo, &xmm_alpha_hi,
    3160             :                         &xmm_dst0, &xmm_dst1);
    3161             : 
    3162             :             /* Unpacking */
    3163             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3164           0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    3165             :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    3166             : 
    3167             :             over_2x128 (&xmm_src_lo, &xmm_src_hi,
    3168             :                         &xmm_alpha_lo, &xmm_alpha_hi,
    3169             :                         &xmm_dst2, &xmm_dst3);
    3170             : 
    3171           0 :             save_128_aligned (
    3172             :                 (__m128i*)dst, pack_565_4x128_128 (
    3173             :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    3174             : 
    3175           0 :             w -= 8;
    3176           0 :             dst += 8;
    3177           0 :             src += 8;
    3178             :         }
    3179             : 
    3180           0 :         while (w--)
    3181             :         {
    3182           0 :             s = *src++;
    3183           0 :             d = *dst;
    3184             : 
    3185           0 :             *dst++ = composite_over_8888_0565pixel (s, d);
    3186             :         }
    3187             :     }
    3188             : 
    3189           0 : }
    3190             : 
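                     : /* OVER of a solid source through an a8 mask.  Four mask bytes are
                     :  * read as one word: 0xffffffff with an opaque source stores the
                     :  * precomputed solid directly, zero skips the pixels, and anything
                     :  * else goes through the in_over path.
                     :  */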
    3191             : static void
    3192           0 : sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
    3193             :                               pixman_composite_info_t *info)
    3194             : {
    3195           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3196             :     uint32_t src, srca;
    3197             :     uint32_t *dst_line, *dst;
    3198             :     uint8_t *mask_line, *mask;
    3199             :     int dst_stride, mask_stride;
    3200             :     int32_t w;
    3201             :     uint32_t m, d;
    3202             : 
    3203             :     __m128i xmm_src, xmm_alpha, xmm_def;
    3204             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    3205             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3206             : 
    3207             :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    3208             : 
    3209           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    3210             : 
    3211           0 :     srca = src >> 24;
    3212           0 :     if (src == 0)
    3213           0 :         return;
    3214             : 
    3215           0 :     PIXMAN_IMAGE_GET_LINE (
    3216             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3217           0 :     PIXMAN_IMAGE_GET_LINE (
    3218             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3219             : 
    3220           0 :     xmm_def = create_mask_2x32_128 (src, src);
    3221           0 :     xmm_src = expand_pixel_32_1x128 (src);
    3222           0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    3223           0 :     mmx_src   = xmm_src;
    3224           0 :     mmx_alpha = xmm_alpha;
    3225             : 
    3226           0 :     while (height--)
    3227             :     {
    3228           0 :         dst = dst_line;
    3229           0 :         dst_line += dst_stride;
    3230           0 :         mask = mask_line;
    3231           0 :         mask_line += mask_stride;
    3232           0 :         w = width;
    3233             : 
    3234           0 :         while (w && (uintptr_t)dst & 15)
    3235             :         {
    3236           0 :             uint8_t m = *mask++;
    3237             : 
    3238           0 :             if (m)
    3239             :             {
    3240           0 :                 d = *dst;
    3241           0 :                 mmx_mask = expand_pixel_8_1x128 (m);
    3242           0 :                 mmx_dest = unpack_32_1x128 (d);
    3243             : 
    3244           0 :                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
    3245             :                                                    &mmx_alpha,
    3246             :                                                    &mmx_mask,
    3247             :                                                    &mmx_dest));
    3248             :             }
    3249             : 
    3250           0 :             w--;
    3251           0 :             dst++;
    3252             :         }
    3253             : 
    3254           0 :         while (w >= 4)
    3255             :         {
    3256           0 :             m = *((uint32_t*)mask);
    3257             : 
    3258           0 :             if (srca == 0xff && m == 0xffffffff)
    3259             :             {
    3260           0 :                 save_128_aligned ((__m128i*)dst, xmm_def);
    3261             :             }
    3262           0 :             else if (m)
    3263             :             {
    3264           0 :                 xmm_dst = load_128_aligned ((__m128i*) dst);
    3265           0 :                 xmm_mask = unpack_32_1x128 (m);
    3266           0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3267             : 
    3268             :                 /* Unpacking */
    3269             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    3270             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3271             : 
    3272           0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3273             :                                         &xmm_mask_lo, &xmm_mask_hi);
    3274             : 
    3275             :                 in_over_2x128 (&xmm_src, &xmm_src,
    3276             :                                &xmm_alpha, &xmm_alpha,
    3277             :                                &xmm_mask_lo, &xmm_mask_hi,
    3278             :                                &xmm_dst_lo, &xmm_dst_hi);
    3279             : 
    3280           0 :                 save_128_aligned (
    3281             :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3282             :             }
    3283             : 
    3284           0 :             w -= 4;
    3285           0 :             dst += 4;
    3286           0 :             mask += 4;
    3287             :         }
    3288             : 
    3289           0 :         while (w)
    3290             :         {
    3291           0 :             uint8_t m = *mask++;
    3292             : 
    3293           0 :             if (m)
    3294             :             {
    3295           0 :                 d = *dst;
    3296           0 :                 mmx_mask = expand_pixel_8_1x128 (m);
    3297           0 :                 mmx_dest = unpack_32_1x128 (d);
    3298             : 
    3299           0 :                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
    3300             :                                                    &mmx_alpha,
    3301             :                                                    &mmx_mask,
    3302             :                                                    &mmx_dest));
    3303             :             }
    3304             : 
    3305           0 :             w--;
    3306           0 :             dst++;
    3307             :         }
    3308             :     }
    3309             : 
    3310             : }
    3311             : 
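                     : /* Solid fill for 8, 16 and 32 bpp destinations.  The filler is
                     :  * replicated to a full 32-bit value, the row pointer is aligned with
                     :  * 1-, 2- and 4-byte stores, and the bulk is written with aligned
                     :  * 128-bit stores, up to 128 bytes per iteration.
                     :  */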
    3312             : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
    3313             : __attribute__((__force_align_arg_pointer__))
    3314             : #endif
    3315             : static pixman_bool_t
    3316           0 : sse2_fill (pixman_implementation_t *imp,
    3317             :            uint32_t *               bits,
    3318             :            int                      stride,
    3319             :            int                      bpp,
    3320             :            int                      x,
    3321             :            int                      y,
    3322             :            int                      width,
    3323             :            int                      height,
    3324             :            uint32_t                 filler)
    3325             : {
    3326             :     uint32_t byte_width;
    3327             :     uint8_t *byte_line;
    3328             : 
    3329             :     __m128i xmm_def;
    3330             : 
    3331           0 :     if (bpp == 8)
    3332             :     {
    3333             :         uint8_t b;
    3334             :         uint16_t w;
    3335             : 
    3336           0 :         stride = stride * (int) sizeof (uint32_t) / 1;
    3337           0 :         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
    3338           0 :         byte_width = width;
    3339           0 :         stride *= 1;
    3340             : 
    3341           0 :         b = filler & 0xff;
    3342           0 :         w = (b << 8) | b;
    3343           0 :         filler = (w << 16) | w;
    3344             :     }
    3345           0 :     else if (bpp == 16)
    3346             :     {
    3347           0 :         stride = stride * (int) sizeof (uint32_t) / 2;
    3348           0 :         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
    3349           0 :         byte_width = 2 * width;
    3350           0 :         stride *= 2;
    3351             : 
    3352           0 :         filler = (filler & 0xffff) * 0x00010001;
    3353             :     }
    3354           0 :     else if (bpp == 32)
    3355             :     {
    3356           0 :         stride = stride * (int) sizeof (uint32_t) / 4;
    3357           0 :         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
    3358           0 :         byte_width = 4 * width;
    3359           0 :         stride *= 4;
    3360             :     }
    3361             :     else
    3362             :     {
    3363           0 :         return FALSE;
    3364             :     }
    3365             : 
    3366           0 :     xmm_def = create_mask_2x32_128 (filler, filler);
    3367             : 
    3368           0 :     while (height--)
    3369             :     {
    3370             :         int w;
    3371           0 :         uint8_t *d = byte_line;
    3372           0 :         byte_line += stride;
    3373           0 :         w = byte_width;
    3374             : 
    3375           0 :         if (w >= 1 && ((uintptr_t)d & 1))
    3376             :         {
    3377           0 :             *(uint8_t *)d = filler;
    3378           0 :             w -= 1;
    3379           0 :             d += 1;
    3380             :         }
    3381             : 
    3382           0 :         while (w >= 2 && ((uintptr_t)d & 3))
    3383             :         {
    3384           0 :             *(uint16_t *)d = filler;
    3385           0 :             w -= 2;
    3386           0 :             d += 2;
    3387             :         }
    3388             : 
    3389           0 :         while (w >= 4 && ((uintptr_t)d & 15))
    3390             :         {
    3391           0 :             *(uint32_t *)d = filler;
    3392             : 
    3393           0 :             w -= 4;
    3394           0 :             d += 4;
    3395             :         }
    3396             : 
    3397           0 :         while (w >= 128)
    3398             :         {
    3399             :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3400           0 :             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
    3401           0 :             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
    3402           0 :             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
    3403           0 :             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
    3404           0 :             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
    3405           0 :             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
    3406           0 :             save_128_aligned ((__m128i*)(d + 112), xmm_def);
    3407             : 
    3408           0 :             d += 128;
    3409           0 :             w -= 128;
    3410             :         }
    3411             : 
    3412           0 :         if (w >= 64)
    3413             :         {
    3414             :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3415           0 :             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
    3416           0 :             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
    3417           0 :             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
    3418             : 
    3419           0 :             d += 64;
    3420           0 :             w -= 64;
    3421             :         }
    3422             : 
    3423           0 :         if (w >= 32)
    3424             :         {
    3425             :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3426           0 :             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
    3427             : 
    3428           0 :             d += 32;
    3429           0 :             w -= 32;
    3430             :         }
    3431             : 
    3432           0 :         if (w >= 16)
    3433             :         {
    3434             :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3435             : 
    3436           0 :             d += 16;
    3437           0 :             w -= 16;
    3438             :         }
    3439             : 
    3440           0 :         while (w >= 4)
    3441             :         {
    3442           0 :             *(uint32_t *)d = filler;
    3443             : 
    3444           0 :             w -= 4;
    3445           0 :             d += 4;
    3446             :         }
    3447             : 
    3448           0 :         if (w >= 2)
    3449             :         {
    3450           0 :             *(uint16_t *)d = filler;
    3451           0 :             w -= 2;
    3452           0 :             d += 2;
    3453             :         }
    3454             : 
    3455           0 :         if (w >= 1)
    3456             :         {
    3457           0 :             *(uint8_t *)d = filler;
    3458           0 :             w -= 1;
    3459           0 :             d += 1;
    3460             :         }
    3461             :     }
    3462             : 
    3463           0 :     return TRUE;
    3464             : }
    3465             : 
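                     : /* SRC operator, solid source, a8 mask, 8888 destination:
                     :  * every destination pixel becomes src * m, and zero where the mask
                     :  * byte is zero.  A zero source degenerates to sse2_fill with 0, and
                     :  * a fully opaque source with an all-0xff 4-pixel mask group is
                     :  * stored directly.
                     :  */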
    3466             : static void
    3467           0 : sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
    3468             :                              pixman_composite_info_t *info)
    3469             : {
    3470           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3471             :     uint32_t src, srca;
    3472             :     uint32_t    *dst_line, *dst;
    3473             :     uint8_t     *mask_line, *mask;
    3474             :     int dst_stride, mask_stride;
    3475             :     int32_t w;
    3476             :     uint32_t m;
    3477             : 
    3478             :     __m128i xmm_src, xmm_def;
    3479             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3480             : 
    3481           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    3482             : 
    3483           0 :     srca = src >> 24;
    3484           0 :     if (src == 0)
    3485             :     {
    3486           0 :         sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
    3487           0 :                    PIXMAN_FORMAT_BPP (dest_image->bits.format),
    3488             :                    dest_x, dest_y, width, height, 0);
    3489           0 :         return;
    3490             :     }
    3491             : 
    3492           0 :     PIXMAN_IMAGE_GET_LINE (
    3493             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3494           0 :     PIXMAN_IMAGE_GET_LINE (
    3495             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3496             : 
    3497           0 :     xmm_def = create_mask_2x32_128 (src, src);
    3498           0 :     xmm_src = expand_pixel_32_1x128 (src);
    3499             : 
    3500           0 :     while (height--)
    3501             :     {
    3502           0 :         dst = dst_line;
    3503           0 :         dst_line += dst_stride;
    3504           0 :         mask = mask_line;
    3505           0 :         mask_line += mask_stride;
    3506           0 :         w = width;
    3507             : 
    3508           0 :         while (w && (uintptr_t)dst & 15)
    3509             :         {
    3510           0 :             uint8_t m = *mask++;
    3511             : 
    3512           0 :             if (m)
    3513             :             {
    3514           0 :                 *dst = pack_1x128_32 (
    3515             :                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
    3516             :             }
    3517             :             else
    3518             :             {
    3519           0 :                 *dst = 0;
    3520             :             }
    3521             : 
    3522           0 :             w--;
    3523           0 :             dst++;
    3524             :         }
    3525             : 
    3526           0 :         while (w >= 4)
    3527             :         {
    3528           0 :             m = *((uint32_t*)mask);
    3529             : 
    3530           0 :             if (srca == 0xff && m == 0xffffffff)
    3531             :             {
    3532           0 :                 save_128_aligned ((__m128i*)dst, xmm_def);
    3533             :             }
    3534           0 :             else if (m)
    3535             :             {
    3536           0 :                 xmm_mask = unpack_32_1x128 (m);
    3537           0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3538             : 
    3539             :                 /* Unpacking */
    3540             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3541             : 
    3542           0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3543             :                                         &xmm_mask_lo, &xmm_mask_hi);
    3544             : 
    3545             :                 pix_multiply_2x128 (&xmm_src, &xmm_src,
    3546             :                                     &xmm_mask_lo, &xmm_mask_hi,
    3547             :                                     &xmm_mask_lo, &xmm_mask_hi);
    3548             : 
    3549           0 :                 save_128_aligned (
    3550             :                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
    3551             :             }
    3552             :             else
    3553             :             {
    3554           0 :                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
    3555             :             }
    3556             : 
    3557           0 :             w -= 4;
    3558           0 :             dst += 4;
    3559           0 :             mask += 4;
    3560             :         }
    3561             : 
    3562           0 :         while (w)
    3563             :         {
    3564           0 :             uint8_t m = *mask++;
    3565             : 
    3566           0 :             if (m)
    3567             :             {
    3568           0 :                 *dst = pack_1x128_32 (
    3569             :                     pix_multiply_1x128 (
    3570             :                         xmm_src, expand_pixel_8_1x128 (m)));
    3571             :             }
    3572             :             else
    3573             :             {
    3574           0 :                 *dst = 0;
    3575             :             }
    3576             : 
    3577           0 :             w--;
    3578           0 :             dst++;
    3579             :         }
    3580             :     }
    3581             : 
    3582             : }
    3583             : 
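                     : /* OVER operator, solid source, a8 mask, r5g6b5 destination:
                     :  * dest = (src IN m) OVER dest.  Destination pixels are expanded
                     :  * from 565, blended with in_over, and packed back; the vector loop
                     :  * handles eight 565 pixels (two 4-pixel halves) per iteration.
                     :  */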
    3584             : static void
    3585           0 : sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
    3586             :                               pixman_composite_info_t *info)
    3587             : {
    3588           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3589             :     uint32_t src;
    3590             :     uint16_t    *dst_line, *dst, d;
    3591             :     uint8_t     *mask_line, *mask;
    3592             :     int dst_stride, mask_stride;
    3593             :     int32_t w;
    3594             :     uint32_t m;
    3595             :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    3596             : 
    3597             :     __m128i xmm_src, xmm_alpha;
    3598             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3599             :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3600             : 
    3601           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    3602             : 
    3603           0 :     if (src == 0)
    3604           0 :         return;
    3605             : 
    3606           0 :     PIXMAN_IMAGE_GET_LINE (
    3607             :         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3608           0 :     PIXMAN_IMAGE_GET_LINE (
    3609             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3610             : 
    3611           0 :     xmm_src = expand_pixel_32_1x128 (src);
    3612           0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    3613           0 :     mmx_src = xmm_src;
    3614           0 :     mmx_alpha = xmm_alpha;
    3615             : 
    3616           0 :     while (height--)
    3617             :     {
    3618           0 :         dst = dst_line;
    3619           0 :         dst_line += dst_stride;
    3620           0 :         mask = mask_line;
    3621           0 :         mask_line += mask_stride;
    3622           0 :         w = width;
    3623             : 
    3624           0 :         while (w && (uintptr_t)dst & 15)
    3625             :         {
    3626           0 :             m = *mask++;
    3627             : 
    3628           0 :             if (m)
    3629             :             {
    3630           0 :                 d = *dst;
    3631           0 :                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    3632           0 :                 mmx_dest = expand565_16_1x128 (d);
    3633             : 
    3634           0 :                 *dst = pack_565_32_16 (
    3635             :                     pack_1x128_32 (
    3636             :                         in_over_1x128 (
    3637             :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    3638             :             }
    3639             : 
    3640           0 :             w--;
    3641           0 :             dst++;
    3642             :         }
    3643             : 
    3644           0 :         while (w >= 8)
    3645             :         {
    3646           0 :             xmm_dst = load_128_aligned ((__m128i*) dst);
    3647             :             unpack_565_128_4x128 (xmm_dst,
    3648             :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3649             : 
    3650           0 :             m = *((uint32_t*)mask);
    3651           0 :             mask += 4;
    3652             : 
    3653           0 :             if (m)
    3654             :             {
    3655           0 :                 xmm_mask = unpack_32_1x128 (m);
    3656           0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3657             : 
    3658             :                 /* Unpacking */
    3659             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3660             : 
    3661           0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3662             :                                         &xmm_mask_lo, &xmm_mask_hi);
    3663             : 
    3664             :                 in_over_2x128 (&xmm_src, &xmm_src,
    3665             :                                &xmm_alpha, &xmm_alpha,
    3666             :                                &xmm_mask_lo, &xmm_mask_hi,
    3667             :                                &xmm_dst0, &xmm_dst1);
    3668             :             }
    3669             : 
    3670           0 :             m = *((uint32_t*)mask);
    3671           0 :             mask += 4;
    3672             : 
    3673           0 :             if (m)
    3674             :             {
    3675           0 :                 xmm_mask = unpack_32_1x128 (m);
    3676           0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3677             : 
    3678             :                 /* Unpacking */
    3679             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3680             : 
    3681           0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3682             :                                         &xmm_mask_lo, &xmm_mask_hi);
    3683             :                 in_over_2x128 (&xmm_src, &xmm_src,
    3684             :                                &xmm_alpha, &xmm_alpha,
    3685             :                                &xmm_mask_lo, &xmm_mask_hi,
    3686             :                                &xmm_dst2, &xmm_dst3);
    3687             :             }
    3688             : 
    3689           0 :             save_128_aligned (
    3690             :                 (__m128i*)dst, pack_565_4x128_128 (
    3691             :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    3692             : 
    3693           0 :             w -= 8;
    3694           0 :             dst += 8;
    3695             :         }
    3696             : 
    3697           0 :         while (w)
    3698             :         {
    3699           0 :             m = *mask++;
    3700             : 
    3701           0 :             if (m)
    3702             :             {
    3703           0 :                 d = *dst;
    3704           0 :                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    3705           0 :                 mmx_dest = expand565_16_1x128 (d);
    3706             : 
    3707           0 :                 *dst = pack_565_32_16 (
    3708             :                     pack_1x128_32 (
    3709             :                         in_over_1x128 (
    3710             :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    3711             :             }
    3712             : 
    3713           0 :             w--;
    3714           0 :             dst++;
    3715             :         }
    3716             :     }
    3717             : 
    3718             : }
    3719             : 
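                     : /* OVER from a "pixbuf" source (non-premultiplied with swapped
                     :  * channel order, as the *_rev_non_pre / invert_colors helper names
                     :  * suggest) onto an r5g6b5 destination.  Each 4-pixel block is
                     :  * classified with is_opaque / is_zero so fully opaque blocks are
                     :  * converted directly and fully transparent blocks are skipped.
                     :  */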
    3720             : static void
    3721           0 : sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
    3722             :                                  pixman_composite_info_t *info)
    3723             : {
    3724           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3725             :     uint16_t    *dst_line, *dst, d;
    3726             :     uint32_t    *src_line, *src, s;
    3727             :     int dst_stride, src_stride;
    3728             :     int32_t w;
    3729             :     uint32_t opaque, zero;
    3730             : 
    3731             :     __m128i ms;
    3732             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    3733             :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3734             : 
    3735           0 :     PIXMAN_IMAGE_GET_LINE (
    3736             :         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3737           0 :     PIXMAN_IMAGE_GET_LINE (
    3738             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3739             : 
    3740           0 :     while (height--)
    3741             :     {
    3742           0 :         dst = dst_line;
    3743           0 :         dst_line += dst_stride;
    3744           0 :         src = src_line;
    3745           0 :         src_line += src_stride;
    3746           0 :         w = width;
    3747             : 
    3748           0 :         while (w && (uintptr_t)dst & 15)
    3749             :         {
    3750           0 :             s = *src++;
    3751           0 :             d = *dst;
    3752             : 
    3753           0 :             ms = unpack_32_1x128 (s);
    3754             : 
    3755           0 :             *dst++ = pack_565_32_16 (
    3756             :                 pack_1x128_32 (
    3757             :                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
    3758           0 :             w--;
    3759             :         }
    3760             : 
    3761           0 :         while (w >= 8)
    3762             :         {
    3763             :             /* First round */
    3764           0 :             xmm_src = load_128_unaligned ((__m128i*)src);
    3765           0 :             xmm_dst = load_128_aligned  ((__m128i*)dst);
    3766             : 
    3767           0 :             opaque = is_opaque (xmm_src);
    3768           0 :             zero = is_zero (xmm_src);
    3769             : 
    3770             :             unpack_565_128_4x128 (xmm_dst,
    3771             :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3772             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3773             : 
    3774             :             /* preload next round */
    3775           0 :             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
    3776             : 
    3777           0 :             if (opaque)
    3778             :             {
    3779           0 :                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
    3780             :                                      &xmm_dst0, &xmm_dst1);
    3781             :             }
    3782           0 :             else if (!zero)
    3783             :             {
    3784           0 :                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
    3785             :                                         &xmm_dst0, &xmm_dst1);
    3786             :             }
    3787             : 
    3788             :             /* Second round */
    3789           0 :             opaque = is_opaque (xmm_src);
    3790           0 :             zero = is_zero (xmm_src);
    3791             : 
    3792             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3793             : 
    3794           0 :             if (opaque)
    3795             :             {
    3796           0 :                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
    3797             :                                      &xmm_dst2, &xmm_dst3);
    3798             :             }
    3799           0 :             else if (!zero)
    3800             :             {
    3801           0 :                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
    3802             :                                         &xmm_dst2, &xmm_dst3);
    3803             :             }
    3804             : 
    3805           0 :             save_128_aligned (
    3806             :                 (__m128i*)dst, pack_565_4x128_128 (
    3807             :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    3808             : 
    3809           0 :             w -= 8;
    3810           0 :             src += 8;
    3811           0 :             dst += 8;
    3812             :         }
    3813             : 
    3814           0 :         while (w)
    3815             :         {
    3816           0 :             s = *src++;
    3817           0 :             d = *dst;
    3818             : 
    3819           0 :             ms = unpack_32_1x128 (s);
    3820             : 
    3821           0 :             *dst++ = pack_565_32_16 (
    3822             :                 pack_1x128_32 (
    3823             :                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
    3824           0 :             w--;
    3825             :         }
    3826             :     }
    3827             : 
    3828           0 : }
    3829             : 
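                     : /* Same pixbuf-over path as above, but with an 8888 destination:
                     :  * opaque 4-pixel blocks are converted and stored, transparent blocks
                     :  * are skipped, and everything else goes through
                     :  * over_rev_non_pre_2x128.
                     :  */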
    3830             : static void
    3831           0 : sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
    3832             :                                  pixman_composite_info_t *info)
    3833             : {
    3834           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3835             :     uint32_t    *dst_line, *dst, d;
    3836             :     uint32_t    *src_line, *src, s;
    3837             :     int dst_stride, src_stride;
    3838             :     int32_t w;
    3839             :     uint32_t opaque, zero;
    3840             : 
    3841             :     __m128i xmm_src_lo, xmm_src_hi;
    3842             :     __m128i xmm_dst_lo, xmm_dst_hi;
    3843             : 
    3844           0 :     PIXMAN_IMAGE_GET_LINE (
    3845             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3846           0 :     PIXMAN_IMAGE_GET_LINE (
    3847             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3848             : 
    3849           0 :     while (height--)
    3850             :     {
    3851           0 :         dst = dst_line;
    3852           0 :         dst_line += dst_stride;
    3853           0 :         src = src_line;
    3854           0 :         src_line += src_stride;
    3855           0 :         w = width;
    3856             : 
    3857           0 :         while (w && (uintptr_t)dst & 15)
    3858             :         {
    3859           0 :             s = *src++;
    3860           0 :             d = *dst;
    3861             : 
    3862           0 :             *dst++ = pack_1x128_32 (
    3863             :                 over_rev_non_pre_1x128 (
    3864             :                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
    3865             : 
    3866           0 :             w--;
    3867             :         }
    3868             : 
    3869           0 :         while (w >= 4)
    3870             :         {
    3871           0 :             xmm_src_hi = load_128_unaligned ((__m128i*)src);
    3872             : 
    3873           0 :             opaque = is_opaque (xmm_src_hi);
    3874           0 :             zero = is_zero (xmm_src_hi);
    3875             : 
    3876           0 :             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    3877             : 
    3878           0 :             if (opaque)
    3879             :             {
    3880           0 :                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
    3881             :                                      &xmm_dst_lo, &xmm_dst_hi);
    3882             : 
    3883           0 :                 save_128_aligned (
    3884             :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3885             :             }
    3886           0 :             else if (!zero)
    3887             :             {
    3888           0 :                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
    3889             : 
    3890           0 :                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    3891             : 
    3892           0 :                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
    3893             :                                         &xmm_dst_lo, &xmm_dst_hi);
    3894             : 
    3895           0 :                 save_128_aligned (
    3896             :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3897             :             }
    3898             : 
    3899           0 :             w -= 4;
    3900           0 :             dst += 4;
    3901           0 :             src += 4;
    3902             :         }
    3903             : 
    3904           0 :         while (w)
    3905             :         {
    3906           0 :             s = *src++;
    3907           0 :             d = *dst;
    3908             : 
    3909           0 :             *dst++ = pack_1x128_32 (
    3910             :                 over_rev_non_pre_1x128 (
    3911             :                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
    3912             : 
    3913           0 :             w--;
    3914             :         }
    3915             :     }
    3916             : 
    3917           0 : }
    3918             : 
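                     : /* OVER operator, solid source, component-alpha (per-channel) 8888
                     :  * mask, r5g6b5 destination.  _mm_movemask_epi8 on a compare-with-zero
                     :  * of the mask lets the loop skip 4-pixel groups whose mask is
                     :  * entirely zero.
                     :  */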
    3919             : static void
    3920           0 : sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
    3921             :                                     pixman_composite_info_t *info)
    3922             : {
    3923           0 :     PIXMAN_COMPOSITE_ARGS (info);
    3924             :     uint32_t src;
    3925             :     uint16_t    *dst_line, *dst, d;
    3926             :     uint32_t    *mask_line, *mask, m;
    3927             :     int dst_stride, mask_stride;
    3928             :     int w;
    3929             :     uint32_t pack_cmp;
    3930             : 
    3931             :     __m128i xmm_src, xmm_alpha;
    3932             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3933             :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3934             : 
    3935             :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    3936             : 
    3937           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    3938             : 
    3939           0 :     if (src == 0)
    3940           0 :         return;
    3941             : 
    3942           0 :     PIXMAN_IMAGE_GET_LINE (
    3943             :         dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3944           0 :     PIXMAN_IMAGE_GET_LINE (
    3945             :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    3946             : 
    3947           0 :     xmm_src = expand_pixel_32_1x128 (src);
    3948           0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    3949           0 :     mmx_src = xmm_src;
    3950           0 :     mmx_alpha = xmm_alpha;
    3951             : 
    3952           0 :     while (height--)
    3953             :     {
    3954           0 :         w = width;
    3955           0 :         mask = mask_line;
    3956           0 :         dst = dst_line;
    3957           0 :         mask_line += mask_stride;
    3958           0 :         dst_line += dst_stride;
    3959             : 
    3960           0 :         while (w && ((uintptr_t)dst & 15))
    3961             :         {
    3962           0 :             m = *(uint32_t *) mask;
    3963             : 
    3964           0 :             if (m)
    3965             :             {
    3966           0 :                 d = *dst;
    3967           0 :                 mmx_mask = unpack_32_1x128 (m);
    3968           0 :                 mmx_dest = expand565_16_1x128 (d);
    3969             : 
    3970           0 :                 *dst = pack_565_32_16 (
    3971             :                     pack_1x128_32 (
    3972             :                         in_over_1x128 (
    3973             :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    3974             :             }
    3975             : 
    3976           0 :             w--;
    3977           0 :             dst++;
    3978           0 :             mask++;
    3979             :         }
    3980             : 
    3981           0 :         while (w >= 8)
    3982             :         {
    3983             :             /* First round */
    3984           0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    3985           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    3986             : 
    3987           0 :             pack_cmp = _mm_movemask_epi8 (
    3988             :                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    3989             : 
    3990             :             unpack_565_128_4x128 (xmm_dst,
    3991             :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3992             :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3993             : 
    3994             :             /* preload next round */
    3995           0 :             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
    3996             : 
    3997             :             /* blend the first four pixels unless their mask is all zero */
    3998           0 :             if (pack_cmp != 0xffff)
    3999             :             {
    4000             :                 in_over_2x128 (&xmm_src, &xmm_src,
    4001             :                                &xmm_alpha, &xmm_alpha,
    4002             :                                &xmm_mask_lo, &xmm_mask_hi,
    4003             :                                &xmm_dst0, &xmm_dst1);
    4004             :             }
    4005             : 
    4006             :             /* Second round */
    4007           0 :             pack_cmp = _mm_movemask_epi8 (
    4008             :                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    4009             : 
    4010             :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4011             : 
    4012           0 :             if (pack_cmp != 0xffff)
    4013             :             {
    4014             :                 in_over_2x128 (&xmm_src, &xmm_src,
    4015             :                                &xmm_alpha, &xmm_alpha,
    4016             :                                &xmm_mask_lo, &xmm_mask_hi,
    4017             :                                &xmm_dst2, &xmm_dst3);
    4018             :             }
    4019             : 
    4020           0 :             save_128_aligned (
    4021             :                 (__m128i*)dst, pack_565_4x128_128 (
    4022             :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    4023             : 
    4024           0 :             w -= 8;
    4025           0 :             dst += 8;
    4026           0 :             mask += 8;
    4027             :         }
    4028             : 
    4029           0 :         while (w)
    4030             :         {
    4031           0 :             m = *(uint32_t *) mask;
    4032             : 
    4033           0 :             if (m)
    4034             :             {
    4035           0 :                 d = *dst;
    4036           0 :                 mmx_mask = unpack_32_1x128 (m);
    4037           0 :                 mmx_dest = expand565_16_1x128 (d);
    4038             : 
    4039           0 :                 *dst = pack_565_32_16 (
    4040             :                     pack_1x128_32 (
    4041             :                         in_over_1x128 (
    4042             :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    4043             :             }
    4044             : 
    4045           0 :             w--;
    4046           0 :             dst++;
    4047           0 :             mask++;
    4048             :         }
    4049             :     }
    4050             : 
    4051             : }
    4052             : 
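                     : /* IN operator, solid source, a8 mask, a8 destination:
                     :  * dest = srca * m * dest, processed sixteen bytes per vector
                     :  * iteration.
                     :  */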
    4053             : static void
    4054           0 : sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
    4055             :                          pixman_composite_info_t *info)
    4056             : {
    4057           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4058             :     uint8_t     *dst_line, *dst;
    4059             :     uint8_t     *mask_line, *mask;
    4060             :     int dst_stride, mask_stride;
    4061             :     uint32_t d, m;
    4062             :     uint32_t src;
    4063             :     int32_t w;
    4064             : 
    4065             :     __m128i xmm_alpha;
    4066             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4067             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4068             : 
    4069           0 :     PIXMAN_IMAGE_GET_LINE (
    4070             :         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4071           0 :     PIXMAN_IMAGE_GET_LINE (
    4072             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4073             : 
    4074           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    4075             : 
    4076           0 :     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
    4077             : 
    4078           0 :     while (height--)
    4079             :     {
    4080           0 :         dst = dst_line;
    4081           0 :         dst_line += dst_stride;
    4082           0 :         mask = mask_line;
    4083           0 :         mask_line += mask_stride;
    4084           0 :         w = width;
    4085             : 
    4086           0 :         while (w && ((uintptr_t)dst & 15))
    4087             :         {
    4088           0 :             m = (uint32_t) *mask++;
    4089           0 :             d = (uint32_t) *dst;
    4090             : 
    4091           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4092             :                 pix_multiply_1x128 (
    4093             :                     pix_multiply_1x128 (xmm_alpha,
    4094             :                                        unpack_32_1x128 (m)),
    4095             :                     unpack_32_1x128 (d)));
    4096           0 :             w--;
    4097             :         }
    4098             : 
    4099           0 :         while (w >= 16)
    4100             :         {
    4101           0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    4102           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4103             : 
    4104             :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4105             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4106             : 
    4107             :             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
    4108             :                                 &xmm_mask_lo, &xmm_mask_hi,
    4109             :                                 &xmm_mask_lo, &xmm_mask_hi);
    4110             : 
    4111             :             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    4112             :                                 &xmm_dst_lo, &xmm_dst_hi,
    4113             :                                 &xmm_dst_lo, &xmm_dst_hi);
    4114             : 
    4115           0 :             save_128_aligned (
    4116             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4117             : 
    4118           0 :             mask += 16;
    4119           0 :             dst += 16;
    4120           0 :             w -= 16;
    4121             :         }
    4122             : 
    4123           0 :         while (w)
    4124             :         {
    4125           0 :             m = (uint32_t) *mask++;
    4126           0 :             d = (uint32_t) *dst;
    4127             : 
    4128           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4129             :                 pix_multiply_1x128 (
    4130             :                     pix_multiply_1x128 (
    4131             :                         xmm_alpha, unpack_32_1x128 (m)),
    4132             :                     unpack_32_1x128 (d)));
    4133           0 :             w--;
    4134             :         }
    4135             :     }
    4136             : 
    4137           0 : }
    4138             : 
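                     : /* IN operator, solid source, a8 destination: dest = srca * dest.
                     :  * srca == 0xff is a no-op and srca == 0x00 reduces to a zero fill
                     :  * via pixman_fill.
                     :  */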
    4139             : static void
    4140           0 : sse2_composite_in_n_8 (pixman_implementation_t *imp,
    4141             :                        pixman_composite_info_t *info)
    4142             : {
    4143           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4144             :     uint8_t     *dst_line, *dst;
    4145             :     int dst_stride;
    4146             :     uint32_t d;
    4147             :     uint32_t src;
    4148             :     int32_t w;
    4149             : 
    4150             :     __m128i xmm_alpha;
    4151             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4152             : 
    4153           0 :     PIXMAN_IMAGE_GET_LINE (
    4154             :         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4155             : 
    4156           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    4157             : 
    4158           0 :     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
    4159             : 
    4160           0 :     src = src >> 24;
    4161             : 
    4162           0 :     if (src == 0xff)
    4163           0 :         return;
    4164             : 
    4165           0 :     if (src == 0x00)
    4166             :     {
    4167           0 :         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
    4168             :                      8, dest_x, dest_y, width, height, src);
    4169             : 
    4170           0 :         return;
    4171             :     }
    4172             : 
    4173           0 :     while (height--)
    4174             :     {
    4175           0 :         dst = dst_line;
    4176           0 :         dst_line += dst_stride;
    4177           0 :         w = width;
    4178             : 
    4179           0 :         while (w && ((uintptr_t)dst & 15))
    4180             :         {
    4181           0 :             d = (uint32_t) *dst;
    4182             : 
    4183           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4184             :                 pix_multiply_1x128 (
    4185             :                     xmm_alpha,
    4186             :                     unpack_32_1x128 (d)));
    4187           0 :             w--;
    4188             :         }
    4189             : 
    4190           0 :         while (w >= 16)
    4191             :         {
    4192           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4193             : 
    4194             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4195             : 
    4196             :             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
    4197             :                                 &xmm_dst_lo, &xmm_dst_hi,
    4198             :                                 &xmm_dst_lo, &xmm_dst_hi);
    4199             : 
    4200           0 :             save_128_aligned (
    4201             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4202             : 
    4203           0 :             dst += 16;
    4204           0 :             w -= 16;
    4205             :         }
    4206             : 
    4207           0 :         while (w)
    4208             :         {
    4209           0 :             d = (uint32_t) *dst;
    4210             : 
    4211           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4212             :                 pix_multiply_1x128 (
    4213             :                     xmm_alpha,
    4214             :                     unpack_32_1x128 (d)));
    4215           0 :             w--;
    4216             :         }
    4217             :     }
    4218             : 
    4219             : }
    4220             : 
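                     : /* IN operator, a8 source, a8 destination: dest = src * dest,
                     :  * sixteen bytes per vector iteration.
                     :  */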
    4221             : static void
    4222           0 : sse2_composite_in_8_8 (pixman_implementation_t *imp,
    4223             :                        pixman_composite_info_t *info)
    4224             : {
    4225           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4226             :     uint8_t     *dst_line, *dst;
    4227             :     uint8_t     *src_line, *src;
    4228             :     int src_stride, dst_stride;
    4229             :     int32_t w;
    4230             :     uint32_t s, d;
    4231             : 
    4232             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    4233             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4234             : 
    4235           0 :     PIXMAN_IMAGE_GET_LINE (
    4236             :         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4237           0 :     PIXMAN_IMAGE_GET_LINE (
    4238             :         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    4239             : 
    4240           0 :     while (height--)
    4241             :     {
    4242           0 :         dst = dst_line;
    4243           0 :         dst_line += dst_stride;
    4244           0 :         src = src_line;
    4245           0 :         src_line += src_stride;
    4246           0 :         w = width;
    4247             : 
    4248           0 :         while (w && ((uintptr_t)dst & 15))
    4249             :         {
    4250           0 :             s = (uint32_t) *src++;
    4251           0 :             d = (uint32_t) *dst;
    4252             : 
    4253           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4254             :                 pix_multiply_1x128 (
    4255             :                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
    4256           0 :             w--;
    4257             :         }
    4258             : 
    4259           0 :         while (w >= 16)
    4260             :         {
    4261           0 :             xmm_src = load_128_unaligned ((__m128i*)src);
    4262           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4263             : 
    4264             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    4265             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4266             : 
    4267             :             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    4268             :                                 &xmm_dst_lo, &xmm_dst_hi,
    4269             :                                 &xmm_dst_lo, &xmm_dst_hi);
    4270             : 
    4271           0 :             save_128_aligned (
    4272             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4273             : 
    4274           0 :             src += 16;
    4275           0 :             dst += 16;
    4276           0 :             w -= 16;
    4277             :         }
    4278             : 
    4279           0 :         while (w)
    4280             :         {
    4281           0 :             s = (uint32_t) *src++;
    4282           0 :             d = (uint32_t) *dst;
    4283             : 
    4284           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4285             :                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
    4286           0 :             w--;
    4287             :         }
    4288             :     }
    4289             : 
    4290           0 : }
    4291             : 
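                     : /* ADD operator, solid source, a8 mask, a8 destination:
                     :  * dest = saturate (srca * m + dest), using _mm_adds_epu16 on the
                     :  * unpacked 16-bit channels.
                     :  */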
    4292             : static void
    4293           0 : sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
    4294             :                           pixman_composite_info_t *info)
    4295             : {
    4296           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4297             :     uint8_t     *dst_line, *dst;
    4298             :     uint8_t     *mask_line, *mask;
    4299             :     int dst_stride, mask_stride;
    4300             :     int32_t w;
    4301             :     uint32_t src;
    4302             :     uint32_t m, d;
    4303             : 
    4304             :     __m128i xmm_alpha;
    4305             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4306             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4307             : 
    4308           0 :     PIXMAN_IMAGE_GET_LINE (
    4309             :         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4310           0 :     PIXMAN_IMAGE_GET_LINE (
    4311             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4312             : 
    4313           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    4314             : 
    4315           0 :     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
    4316             : 
    4317           0 :     while (height--)
    4318             :     {
    4319           0 :         dst = dst_line;
    4320           0 :         dst_line += dst_stride;
    4321           0 :         mask = mask_line;
    4322           0 :         mask_line += mask_stride;
    4323           0 :         w = width;
    4324             : 
    4325           0 :         while (w && ((uintptr_t)dst & 15))
    4326             :         {
    4327           0 :             m = (uint32_t) *mask++;
    4328           0 :             d = (uint32_t) *dst;
    4329             : 
    4330           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4331             :                 _mm_adds_epu16 (
    4332             :                     pix_multiply_1x128 (
    4333             :                         xmm_alpha, unpack_32_1x128 (m)),
    4334             :                     unpack_32_1x128 (d)));
    4335           0 :             w--;
    4336             :         }
    4337             : 
    4338           0 :         while (w >= 16)
    4339             :         {
    4340           0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    4341           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4342             : 
    4343             :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4344             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4345             : 
    4346             :             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
    4347             :                                 &xmm_mask_lo, &xmm_mask_hi,
    4348             :                                 &xmm_mask_lo, &xmm_mask_hi);
    4349             : 
    4350           0 :             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
    4351           0 :             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
    4352             : 
    4353           0 :             save_128_aligned (
    4354             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4355             : 
    4356           0 :             mask += 16;
    4357           0 :             dst += 16;
    4358           0 :             w -= 16;
    4359             :         }
    4360             : 
    4361           0 :         while (w)
    4362             :         {
    4363           0 :             m = (uint32_t) *mask++;
    4364           0 :             d = (uint32_t) *dst;
    4365             : 
    4366           0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4367             :                 _mm_adds_epu16 (
    4368             :                     pix_multiply_1x128 (
    4369             :                         xmm_alpha, unpack_32_1x128 (m)),
    4370             :                     unpack_32_1x128 (d)));
    4371             : 
    4372           0 :             w--;
    4373             :         }
    4374             :     }
    4375             : 
    4376           0 : }
    4377             : 
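                     : /* ADD operator, solid source, a8 destination:
                     :  * dest = saturate (srca + dest).  srca == 0x00 is a no-op and
                     :  * srca == 0xff reduces to a fill with 0xff; otherwise the alpha is
                     :  * replicated into all 16 bytes and added with _mm_adds_epu8.
                     :  */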
    4378             : static void
    4379           0 : sse2_composite_add_n_8 (pixman_implementation_t *imp,
    4380             :                         pixman_composite_info_t *info)
    4381             : {
    4382           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4383             :     uint8_t     *dst_line, *dst;
    4384             :     int dst_stride;
    4385             :     int32_t w;
    4386             :     uint32_t src;
    4387             : 
    4388             :     __m128i xmm_src;
    4389             : 
    4390           0 :     PIXMAN_IMAGE_GET_LINE (
    4391             :         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4392             : 
    4393           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    4394             : 
    4395           0 :     src >>= 24;
    4396             : 
    4397           0 :     if (src == 0x00)
    4398           0 :         return;
    4399             : 
    4400           0 :     if (src == 0xff)
    4401             :     {
    4402           0 :         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
    4403             :                      8, dest_x, dest_y, width, height, 0xff);
    4404             : 
    4405           0 :         return;
    4406             :     }
    4407             : 
    4408           0 :     src = (src << 24) | (src << 16) | (src << 8) | src;
    4409           0 :     xmm_src = _mm_set_epi32 (src, src, src, src);
    4410             : 
    4411           0 :     while (height--)
    4412             :     {
    4413           0 :         dst = dst_line;
    4414           0 :         dst_line += dst_stride;
    4415           0 :         w = width;
    4416             : 
    4417           0 :         while (w && ((uintptr_t)dst & 15))
    4418             :         {
    4419           0 :             *dst = (uint8_t)_mm_cvtsi128_si32 (
    4420             :                 _mm_adds_epu8 (
    4421             :                     xmm_src,
    4422           0 :                     _mm_cvtsi32_si128 (*dst)));
    4423             : 
    4424           0 :             w--;
    4425           0 :             dst++;
    4426             :         }
    4427             : 
    4428           0 :         while (w >= 16)
    4429             :         {
    4430           0 :             save_128_aligned (
    4431             :                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
    4432             : 
    4433           0 :             dst += 16;
    4434           0 :             w -= 16;
    4435             :         }
    4436             : 
    4437           0 :         while (w)
    4438             :         {
    4439           0 :             *dst = (uint8_t)_mm_cvtsi128_si32 (
    4440             :                 _mm_adds_epu8 (
    4441             :                     xmm_src,
    4442           0 :                     _mm_cvtsi32_si128 (*dst)));
    4443             : 
    4444           0 :             w--;
    4445           0 :             dst++;
    4446             :         }
    4447             :     }
    4448             : 
    4449             : }
    4450             : 
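                     : /* ADD operator, a8 source, a8 destination.  Unaligned head and tail
                     :  * bytes use the scalar saturating-add trick t | (0 - (t >> 8));
                     :  * the 4-byte-aligned middle of each row is handed to
                     :  * sse2_combine_add_u.
                     :  */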
    4451             : static void
    4452           0 : sse2_composite_add_8_8 (pixman_implementation_t *imp,
    4453             :                         pixman_composite_info_t *info)
    4454             : {
    4455           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4456             :     uint8_t     *dst_line, *dst;
    4457             :     uint8_t     *src_line, *src;
    4458             :     int dst_stride, src_stride;
    4459             :     int32_t w;
    4460             :     uint16_t t;
    4461             : 
    4462           0 :     PIXMAN_IMAGE_GET_LINE (
    4463             :         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    4464           0 :     PIXMAN_IMAGE_GET_LINE (
    4465             :         dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4466             : 
    4467           0 :     while (height--)
    4468             :     {
    4469           0 :         dst = dst_line;
    4470           0 :         src = src_line;
    4471             : 
    4472           0 :         dst_line += dst_stride;
    4473           0 :         src_line += src_stride;
    4474           0 :         w = width;
    4475             : 
    4476             :         /* Small head */
    4477           0 :         while (w && (uintptr_t)dst & 3)
    4478             :         {
    4479           0 :             t = (*dst) + (*src++);
    4480           0 :             *dst++ = t | (0 - (t >> 8));
    4481           0 :             w--;
    4482             :         }
    4483             : 
    4484           0 :         sse2_combine_add_u (imp, op,
    4485             :                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
    4486             : 
    4487             :         /* Small tail */
    4488           0 :         dst += w & 0xfffc;
    4489           0 :         src += w & 0xfffc;
    4490             : 
    4491           0 :         w &= 3;
    4492             : 
    4493           0 :         while (w)
    4494             :         {
    4495           0 :             t = (*dst) + (*src++);
    4496           0 :             *dst++ = t | (0 - (t >> 8));
    4497           0 :             w--;
    4498             :         }
    4499             :     }
    4500             : 
    4501           0 : }
    4502             : 
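The scalar head and tail loops above clamp the 8-bit sums without a branch: t holds the 9-bit result of the add, and (0 - (t >> 8)) is zero when there was no carry and all-ones when there was, so OR-ing saturates to 0xff. A self-contained illustration of the same trick (the helper name is hypothetical, not part of pixman):

#include <stdint.h>

/* Branch-free saturating add of two bytes, as used in the scalar
 * head/tail loops: the ninth bit of the sum selects between the sum
 * itself and 0xff. */
static uint8_t
sat_add_u8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)(a + b);   /* 9-bit sum */

    return (uint8_t)(t | (0 - (t >> 8)));
}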
    4503             : static void
    4504           0 : sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
    4505             :                               pixman_composite_info_t *info)
    4506             : {
    4507           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4508             :     uint32_t    *dst_line, *dst;
    4509             :     uint32_t    *src_line, *src;
    4510             :     int dst_stride, src_stride;
    4511             : 
    4512           0 :     PIXMAN_IMAGE_GET_LINE (
    4513             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    4514           0 :     PIXMAN_IMAGE_GET_LINE (
    4515             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4516             : 
    4517           0 :     while (height--)
    4518             :     {
    4519           0 :         dst = dst_line;
    4520           0 :         dst_line += dst_stride;
    4521           0 :         src = src_line;
    4522           0 :         src_line += src_stride;
    4523             : 
    4524             :         sse2_combine_add_u (imp, op, dst, src, NULL, width);
    4525             :     }
    4526           0 : }
    4527             : 
    4528             : static void
    4529           0 : sse2_composite_add_n_8888 (pixman_implementation_t *imp,
    4530             :                            pixman_composite_info_t *info)
    4531             : {
    4532           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4533             :     uint32_t *dst_line, *dst, src;
    4534             :     int dst_stride;
    4535             : 
    4536             :     __m128i xmm_src;
    4537             : 
    4538           0 :     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4539             : 
    4540           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    4541           0 :     if (src == 0)
    4542           0 :         return;
    4543             : 
    4544           0 :     if (src == ~0)
    4545             :     {
    4546           0 :         pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
    4547             :                      dest_x, dest_y, width, height, ~0);
    4548             : 
    4549           0 :         return;
    4550             :     }
    4551             : 
    4552           0 :     xmm_src = _mm_set_epi32 (src, src, src, src);
    4553           0 :     while (height--)
    4554             :     {
    4555           0 :         int w = width;
    4556             :         uint32_t d;
    4557             : 
    4558           0 :         dst = dst_line;
    4559           0 :         dst_line += dst_stride;
    4560             : 
    4561           0 :         while (w && (uintptr_t)dst & 15)
    4562             :         {
    4563           0 :             d = *dst;
    4564           0 :             *dst++ =
    4565           0 :                 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
    4566           0 :             w--;
    4567             :         }
    4568             : 
    4569           0 :         while (w >= 4)
    4570             :         {
    4571           0 :             save_128_aligned
    4572             :                 ((__m128i*)dst,
    4573             :                  _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
    4574             : 
    4575           0 :             dst += 4;
    4576           0 :             w -= 4;
    4577             :         }
    4578             : 
    4579           0 :         while (w--)
    4580             :         {
    4581           0 :             d = *dst;
    4582           0 :             *dst++ =
    4583           0 :                 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
    4584             :                                                   _mm_cvtsi32_si128 (d)));
    4585             :         }
    4586             :     }
    4587             : }
    4588             : 
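Per pixel, sse2_composite_add_n_8888 performs a per-channel saturating add of the solid source colour into an a8r8g8b8 destination; the SSE2 body just handles four pixels per _mm_adds_epu8 once dst is 16-byte aligned. A scalar reference of the per-pixel operation (the helper name is hypothetical, for illustration only):

#include <stdint.h>

/* Per-channel saturating add of a solid a8r8g8b8 source into one
 * destination pixel -- the scalar equivalent of _mm_adds_epu8 on
 * packed 8-bit channels. */
static uint32_t
add_pixel_8888 (uint32_t src, uint32_t dst)
{
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((src >> shift) & 0xff) + ((dst >> shift) & 0xff);

        if (t > 0xff)
            t = 0xff;
        result |= t << shift;
    }

    return result;
}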
    4589             : static void
    4590           0 : sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
    4591             :                              pixman_composite_info_t *info)
    4592             : {
    4593           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4594             :     uint32_t     *dst_line, *dst;
    4595             :     uint8_t     *mask_line, *mask;
    4596             :     int dst_stride, mask_stride;
    4597             :     int32_t w;
    4598             :     uint32_t src;
    4599             : 
    4600             :     __m128i xmm_src;
    4601             : 
    4602           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    4603           0 :     if (src == 0)
    4604           0 :         return;
    4605           0 :     xmm_src = expand_pixel_32_1x128 (src);
    4606             : 
    4607           0 :     PIXMAN_IMAGE_GET_LINE (
    4608             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4609           0 :     PIXMAN_IMAGE_GET_LINE (
    4610             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4611             : 
    4612           0 :     while (height--)
    4613             :     {
    4614           0 :         dst = dst_line;
    4615           0 :         dst_line += dst_stride;
    4616           0 :         mask = mask_line;
    4617           0 :         mask_line += mask_stride;
    4618           0 :         w = width;
    4619             : 
    4620           0 :         while (w && ((uintptr_t)dst & 15))
    4621             :         {
    4622           0 :             uint8_t m = *mask++;
    4623           0 :             if (m)
    4624             :             {
    4625           0 :                 *dst = pack_1x128_32
    4626             :                     (_mm_adds_epu16
    4627             :                      (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
    4628             :                       unpack_32_1x128 (*dst)));
    4629             :             }
    4630           0 :             dst++;
    4631           0 :             w--;
    4632             :         }
    4633             : 
    4634           0 :         while (w >= 4)
    4635             :         {
    4636           0 :             uint32_t m = *(uint32_t*)mask;
    4637           0 :             if (m)
    4638             :             {
    4639             :                 __m128i xmm_mask_lo, xmm_mask_hi;
    4640             :                 __m128i xmm_dst_lo, xmm_dst_hi;
    4641             : 
    4642           0 :                 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
    4643           0 :                 __m128i xmm_mask =
    4644           0 :                     _mm_unpacklo_epi8 (unpack_32_1x128 (m),
    4645             :                                        _mm_setzero_si128 ());
    4646             : 
    4647             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4648             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4649             : 
    4650           0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    4651             :                                         &xmm_mask_lo, &xmm_mask_hi);
    4652             : 
    4653             :                 pix_multiply_2x128 (&xmm_src, &xmm_src,
    4654             :                                     &xmm_mask_lo, &xmm_mask_hi,
    4655             :                                     &xmm_mask_lo, &xmm_mask_hi);
    4656             : 
    4657           0 :                 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
    4658           0 :                 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
    4659             : 
    4660           0 :                 save_128_aligned (
    4661             :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4662             :             }
    4663             : 
    4664           0 :             w -= 4;
    4665           0 :             dst += 4;
    4666           0 :             mask += 4;
    4667             :         }
    4668             : 
    4669           0 :         while (w)
    4670             :         {
    4671           0 :             uint8_t m = *mask++;
    4672           0 :             if (m)
    4673             :             {
    4674           0 :                 *dst = pack_1x128_32
    4675             :                     (_mm_adds_epu16
    4676             :                      (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
    4677             :                       unpack_32_1x128 (*dst)));
    4678             :             }
    4679           0 :             dst++;
    4680           0 :             w--;
    4681             :         }
    4682             :     }
    4683             : }
    4684             : 
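In the masked variant above, each a8 mask byte first scales the solid source channel-wise (the pix_multiply step), and the scaled value is then saturating-added to the destination in 16-bit lanes. A scalar sketch of one channel, assuming pixman's usual rounded byte multiply ((a * b + 0x80) followed by the add-high-byte correction):

#include <stdint.h>

/* One 8-bit channel of the masked add: scale the solid source channel
 * by the mask value (rounded s * m / 255), then saturating-add the
 * result to the destination channel. */
static uint8_t
add_mask_channel (uint8_t s, uint8_t m, uint8_t d)
{
    uint32_t t = (uint32_t)s * m + 0x80;
    uint32_t scaled = (t + (t >> 8)) >> 8;   /* ~ s * m / 255, rounded */
    uint32_t sum = scaled + d;

    return sum > 0xff ? 0xff : (uint8_t)sum;
}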
    4685             : static pixman_bool_t
    4686          22 : sse2_blt (pixman_implementation_t *imp,
    4687             :           uint32_t *               src_bits,
    4688             :           uint32_t *               dst_bits,
    4689             :           int                      src_stride,
    4690             :           int                      dst_stride,
    4691             :           int                      src_bpp,
    4692             :           int                      dst_bpp,
    4693             :           int                      src_x,
    4694             :           int                      src_y,
    4695             :           int                      dest_x,
    4696             :           int                      dest_y,
    4697             :           int                      width,
    4698             :           int                      height)
    4699             : {
    4700             :     uint8_t *   src_bytes;
    4701             :     uint8_t *   dst_bytes;
    4702             :     int byte_width;
    4703             : 
    4704          22 :     if (src_bpp != dst_bpp)
    4705           0 :         return FALSE;
    4706             : 
    4707          22 :     if (src_bpp == 16)
    4708             :     {
    4709           0 :         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
    4710           0 :         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
    4711           0 :         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
    4712           0 :         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
    4713           0 :         byte_width = 2 * width;
    4714           0 :         src_stride *= 2;
    4715           0 :         dst_stride *= 2;
    4716             :     }
    4717          22 :     else if (src_bpp == 32)
    4718             :     {
    4719          22 :         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
    4720          22 :         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
    4721          22 :         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
    4722          22 :         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
    4723          22 :         byte_width = 4 * width;
    4724          22 :         src_stride *= 4;
    4725          22 :         dst_stride *= 4;
    4726             :     }
    4727             :     else
    4728             :     {
    4729           0 :         return FALSE;
    4730             :     }
    4731             : 
    4732         748 :     while (height--)
    4733             :     {
    4734             :         int w;
    4735         704 :         uint8_t *s = src_bytes;
    4736         704 :         uint8_t *d = dst_bytes;
    4737         704 :         src_bytes += src_stride;
    4738         704 :         dst_bytes += dst_stride;
    4739         704 :         w = byte_width;
    4740             : 
    4741        1408 :         while (w >= 2 && ((uintptr_t)d & 3))
    4742             :         {
    4743           0 :             *(uint16_t *)d = *(uint16_t *)s;
    4744           0 :             w -= 2;
    4745           0 :             s += 2;
    4746           0 :             d += 2;
    4747             :         }
    4748             : 
    4749        1408 :         while (w >= 4 && ((uintptr_t)d & 15))
    4750             :         {
    4751           0 :             *(uint32_t *)d = *(uint32_t *)s;
    4752             : 
    4753           0 :             w -= 4;
    4754           0 :             s += 4;
    4755           0 :             d += 4;
    4756             :         }
    4757             : 
    4758        2816 :         while (w >= 64)
    4759             :         {
    4760             :             __m128i xmm0, xmm1, xmm2, xmm3;
    4761             : 
    4762        1408 :             xmm0 = load_128_unaligned ((__m128i*)(s));
    4763        2816 :             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
    4764        2816 :             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
    4765        2816 :             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
    4766             : 
    4767             :             save_128_aligned ((__m128i*)(d),    xmm0);
    4768        1408 :             save_128_aligned ((__m128i*)(d + 16), xmm1);
    4769        1408 :             save_128_aligned ((__m128i*)(d + 32), xmm2);
    4770        1408 :             save_128_aligned ((__m128i*)(d + 48), xmm3);
    4771             : 
    4772        1408 :             s += 64;
    4773        1408 :             d += 64;
    4774        1408 :             w -= 64;
    4775             :         }
    4776             : 
    4777        1408 :         while (w >= 16)
    4778             :         {
    4779           0 :             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
    4780             : 
    4781           0 :             w -= 16;
    4782           0 :             d += 16;
    4783           0 :             s += 16;
    4784             :         }
    4785             : 
    4786        1408 :         while (w >= 4)
    4787             :         {
    4788           0 :             *(uint32_t *)d = *(uint32_t *)s;
    4789             : 
    4790           0 :             w -= 4;
    4791           0 :             s += 4;
    4792           0 :             d += 4;
    4793             :         }
    4794             : 
    4795         704 :         if (w >= 2)
    4796             :         {
    4797           0 :             *(uint16_t *)d = *(uint16_t *)s;
    4798           0 :             w -= 2;
    4799           0 :             s += 2;
    4800           0 :             d += 2;
    4801             :         }
    4802             :     }
    4803             : 
    4804          22 :     return TRUE;
    4805             : }
    4806             : 
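sse2_blt is a pitched copy: each row moves byte_width bytes, first in 2- and 4-byte steps until the destination is 16-byte aligned, then in 64- and 16-byte SSE2 chunks, then a scalar tail. Ignoring the alignment/SIMD strategy, it is functionally a per-row memcpy, roughly as sketched below (reference only, not pixman API):

#include <stdint.h>
#include <string.h>

/* Row-by-row reference for what sse2_blt copies once strides and
 * start offsets have been converted to bytes. */
static void
blt_reference (const uint8_t *src_bytes, uint8_t *dst_bytes,
               int src_stride, int dst_stride,
               int byte_width, int height)
{
    while (height--)
    {
        memcpy (dst_bytes, src_bytes, byte_width);
        src_bytes += src_stride;
        dst_bytes += dst_stride;
    }
}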
    4807             : static void
    4808          22 : sse2_composite_copy_area (pixman_implementation_t *imp,
    4809             :                           pixman_composite_info_t *info)
    4810             : {
    4811          22 :     PIXMAN_COMPOSITE_ARGS (info);
    4812          44 :     sse2_blt (imp, src_image->bits.bits,
    4813             :               dest_image->bits.bits,
    4814             :               src_image->bits.rowstride,
    4815             :               dest_image->bits.rowstride,
    4816          22 :               PIXMAN_FORMAT_BPP (src_image->bits.format),
    4817          22 :               PIXMAN_FORMAT_BPP (dest_image->bits.format),
    4818             :               src_x, src_y, dest_x, dest_y, width, height);
    4819          22 : }
    4820             : 
    4821             : static void
    4822           0 : sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
    4823             :                                  pixman_composite_info_t *info)
    4824             : {
    4825           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4826             :     uint32_t    *src, *src_line, s;
    4827             :     uint32_t    *dst, *dst_line, d;
    4828             :     uint8_t         *mask, *mask_line;
    4829             :     uint32_t m;
    4830             :     int src_stride, mask_stride, dst_stride;
    4831             :     int32_t w;
    4832             :     __m128i ms;
    4833             : 
    4834             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    4835             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4836             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4837             : 
    4838           0 :     PIXMAN_IMAGE_GET_LINE (
    4839             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4840           0 :     PIXMAN_IMAGE_GET_LINE (
    4841             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4842           0 :     PIXMAN_IMAGE_GET_LINE (
    4843             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    4844             : 
    4845           0 :     while (height--)
    4846             :     {
    4847           0 :         src = src_line;
    4848           0 :         src_line += src_stride;
    4849           0 :         dst = dst_line;
    4850           0 :         dst_line += dst_stride;
    4851           0 :         mask = mask_line;
    4852           0 :         mask_line += mask_stride;
    4853             : 
    4854           0 :         w = width;
    4855             : 
    4856           0 :         while (w && (uintptr_t)dst & 15)
    4857             :         {
    4858           0 :             s = 0xff000000 | *src++;
    4859           0 :             m = (uint32_t) *mask++;
    4860           0 :             d = *dst;
    4861           0 :             ms = unpack_32_1x128 (s);
    4862             : 
    4863           0 :             if (m != 0xff)
    4864             :             {
    4865           0 :                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    4866           0 :                 __m128i md = unpack_32_1x128 (d);
    4867             : 
    4868           0 :                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
    4869             :             }
    4870             : 
    4871           0 :             *dst++ = pack_1x128_32 (ms);
    4872           0 :             w--;
    4873             :         }
    4874             : 
    4875           0 :         while (w >= 4)
    4876             :         {
    4877           0 :             m = *(uint32_t*) mask;
    4878           0 :             xmm_src = _mm_or_si128 (
    4879             :                 load_128_unaligned ((__m128i*)src), mask_ff000000);
    4880             : 
    4881           0 :             if (m == 0xffffffff)
    4882             :             {
    4883             :                 save_128_aligned ((__m128i*)dst, xmm_src);
    4884             :             }
    4885             :             else
    4886             :             {
    4887           0 :                 xmm_dst = load_128_aligned ((__m128i*)dst);
    4888             : 
    4889           0 :                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
    4890             : 
    4891             :                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    4892             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4893             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4894             : 
    4895           0 :                 expand_alpha_rev_2x128 (
    4896             :                     xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    4897             : 
    4898             :                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    4899             :                                &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
    4900             :                                &xmm_dst_lo, &xmm_dst_hi);
    4901             : 
    4902           0 :                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4903             :             }
    4904             : 
    4905           0 :             src += 4;
    4906           0 :             dst += 4;
    4907           0 :             mask += 4;
    4908           0 :             w -= 4;
    4909             :         }
    4910             : 
    4911           0 :         while (w)
    4912             :         {
    4913           0 :             m = (uint32_t) *mask++;
    4914             : 
    4915           0 :             if (m)
    4916             :             {
    4917           0 :                 s = 0xff000000 | *src;
    4918             : 
    4919           0 :                 if (m == 0xff)
    4920             :                 {
    4921           0 :                     *dst = s;
    4922             :                 }
    4923             :                 else
    4924             :                 {
    4925             :                     __m128i ma, md, ms;
    4926             : 
    4927           0 :                     d = *dst;
    4928             : 
    4929           0 :                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    4930           0 :                     md = unpack_32_1x128 (d);
    4931           0 :                     ms = unpack_32_1x128 (s);
    4932             : 
    4933           0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
    4934             :                 }
    4935             : 
    4936             :             }
    4937             : 
    4938           0 :             src++;
    4939           0 :             dst++;
    4940           0 :             w--;
    4941             :         }
    4942             :     }
    4943             : 
    4944           0 : }
    4945             : 
    4946             : static void
    4947           0 : sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
    4948             :                                  pixman_composite_info_t *info)
    4949             : {
    4950           0 :     PIXMAN_COMPOSITE_ARGS (info);
    4951             :     uint32_t    *src, *src_line, s;
    4952             :     uint32_t    *dst, *dst_line, d;
    4953             :     uint8_t         *mask, *mask_line;
    4954             :     uint32_t m;
    4955             :     int src_stride, mask_stride, dst_stride;
    4956             :     int32_t w;
    4957             : 
    4958             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    4959             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4960             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4961             : 
    4962           0 :     PIXMAN_IMAGE_GET_LINE (
    4963             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4964           0 :     PIXMAN_IMAGE_GET_LINE (
    4965             :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4966           0 :     PIXMAN_IMAGE_GET_LINE (
    4967             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    4968             : 
    4969           0 :     while (height--)
    4970             :     {
    4971           0 :         src = src_line;
    4972           0 :         src_line += src_stride;
    4973           0 :         dst = dst_line;
    4974           0 :         dst_line += dst_stride;
    4975           0 :         mask = mask_line;
    4976           0 :         mask_line += mask_stride;
    4977             : 
    4978           0 :         w = width;
    4979             : 
    4980           0 :         while (w && (uintptr_t)dst & 15)
    4981             :         {
    4982             :             uint32_t sa;
    4983             : 
    4984           0 :             s = *src++;
    4985           0 :             m = (uint32_t) *mask++;
    4986           0 :             d = *dst;
    4987             : 
    4988           0 :             sa = s >> 24;
    4989             : 
    4990           0 :             if (m)
    4991             :             {
    4992           0 :                 if (sa == 0xff && m == 0xff)
    4993             :                 {
    4994           0 :                     *dst = s;
    4995             :                 }
    4996             :                 else
    4997             :                 {
    4998             :                     __m128i ms, md, ma, msa;
    4999             : 
    5000           0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5001           0 :                     ms = unpack_32_1x128 (s);
    5002           0 :                     md = unpack_32_1x128 (d);
    5003             : 
    5004           0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5005             : 
    5006           0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5007             :                 }
    5008             :             }
    5009             : 
    5010           0 :             dst++;
    5011           0 :             w--;
    5012             :         }
    5013             : 
    5014           0 :         while (w >= 4)
    5015             :         {
    5016           0 :             m = *(uint32_t *) mask;
    5017             : 
    5018           0 :             if (m)
    5019             :             {
    5020           0 :                 xmm_src = load_128_unaligned ((__m128i*)src);
    5021             : 
    5022           0 :                 if (m == 0xffffffff && is_opaque (xmm_src))
    5023             :                 {
    5024           0 :                     save_128_aligned ((__m128i *)dst, xmm_src);
    5025             :                 }
    5026             :                 else
    5027             :                 {
    5028           0 :                     xmm_dst = load_128_aligned ((__m128i *)dst);
    5029             : 
    5030           0 :                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
    5031             : 
    5032             :                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5033             :                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    5034             :                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5035             : 
    5036           0 :                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
    5037           0 :                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    5038             : 
    5039             :                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
    5040             :                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
    5041             : 
    5042           0 :                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5043             :                 }
    5044             :             }
    5045             : 
    5046           0 :             src += 4;
    5047           0 :             dst += 4;
    5048           0 :             mask += 4;
    5049           0 :             w -= 4;
    5050             :         }
    5051             : 
    5052           0 :         while (w)
    5053             :         {
    5054             :             uint32_t sa;
    5055             : 
    5056           0 :             s = *src++;
    5057           0 :             m = (uint32_t) *mask++;
    5058           0 :             d = *dst;
    5059             : 
    5060           0 :             sa = s >> 24;
    5061             : 
    5062           0 :             if (m)
    5063             :             {
    5064           0 :                 if (sa == 0xff && m == 0xff)
    5065             :                 {
    5066           0 :                     *dst = s;
    5067             :                 }
    5068             :                 else
    5069             :                 {
    5070             :                     __m128i ms, md, ma, msa;
    5071             : 
    5072           0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5073           0 :                     ms = unpack_32_1x128 (s);
    5074           0 :                     md = unpack_32_1x128 (d);
    5075             : 
    5076           0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5077             : 
    5078           0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5079             :                 }
    5080             :             }
    5081             : 
    5082           0 :             dst++;
    5083           0 :             w--;
    5084             :         }
    5085             :     }
    5086             : 
    5087           0 : }
    5088             : 
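Each covered pixel above computes (src IN mask) OVER dest: the source channels and the source alpha are scaled by the a8 mask value, and the result is composited over the destination, i.e. per channel dst' = s*m + d*(1 - sa*m) in normalized terms. A scalar sketch using pixman-style rounded byte multiplies (helper names are illustrative, not pixman API):

#include <stdint.h>

/* Rounded (a * b / 255), matching the scalar byte multiply used by
 * pixman's C paths. */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t)a * b + 0x80;

    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* One channel of (src IN mask) OVER dest:
 *   dst' = s*m + d * (255 - sa*m) / 255, clamped to 255. */
static uint8_t
in_over_channel (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
{
    uint8_t  s_in_m  = mul_un8 (s, m);
    uint8_t  sa_in_m = mul_un8 (sa, m);
    uint32_t r = (uint32_t)s_in_m + mul_un8 (d, 255 - sa_in_m);

    return r > 0xff ? 0xff : (uint8_t)r;
}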
    5089             : static void
    5090           0 : sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
    5091             :                                     pixman_composite_info_t *info)
    5092             : {
    5093           0 :     PIXMAN_COMPOSITE_ARGS (info);
    5094             :     uint32_t src;
    5095             :     uint32_t    *dst_line, *dst;
    5096             :     __m128i xmm_src;
    5097             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5098             :     __m128i xmm_dsta_hi, xmm_dsta_lo;
    5099             :     int dst_stride;
    5100             :     int32_t w;
    5101             : 
    5102           0 :     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    5103             : 
    5104           0 :     if (src == 0)
    5105           0 :         return;
    5106             : 
    5107           0 :     PIXMAN_IMAGE_GET_LINE (
    5108             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    5109             : 
    5110           0 :     xmm_src = expand_pixel_32_1x128 (src);
    5111             : 
    5112           0 :     while (height--)
    5113             :     {
    5114           0 :         dst = dst_line;
    5115             : 
    5116           0 :         dst_line += dst_stride;
    5117           0 :         w = width;
    5118             : 
    5119           0 :         while (w && (uintptr_t)dst & 15)
    5120             :         {
    5121             :             __m128i vd;
    5122             : 
    5123           0 :             vd = unpack_32_1x128 (*dst);
    5124             : 
    5125           0 :             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
    5126             :                                               xmm_src));
    5127           0 :             w--;
    5128           0 :             dst++;
    5129             :         }
    5130             : 
    5131           0 :         while (w >= 4)
    5132             :         {
    5133             :             __m128i tmp_lo, tmp_hi;
    5134             : 
    5135           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    5136             : 
    5137             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5138           0 :             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
    5139             : 
    5140           0 :             tmp_lo = xmm_src;
    5141           0 :             tmp_hi = xmm_src;
    5142             : 
    5143             :             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    5144             :                         &xmm_dsta_lo, &xmm_dsta_hi,
    5145             :                         &tmp_lo, &tmp_hi);
    5146             : 
    5147           0 :             save_128_aligned (
    5148             :                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
    5149             : 
    5150           0 :             w -= 4;
    5151           0 :             dst += 4;
    5152             :         }
    5153             : 
    5154           0 :         while (w)
    5155             :         {
    5156             :             __m128i vd;
    5157             : 
    5158           0 :             vd = unpack_32_1x128 (*dst);
    5159             : 
    5160           0 :             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
    5161             :                                               xmm_src));
    5162           0 :             w--;
    5163           0 :             dst++;
    5164             :         }
    5165             : 
    5166             :     }
    5167             : 
    5168             : }
    5169             : 
    5170             : static void
    5171           0 : sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
    5172             :                                     pixman_composite_info_t *info)
    5173             : {
    5174           0 :     PIXMAN_COMPOSITE_ARGS (info);
    5175             :     uint32_t    *src, *src_line, s;
    5176             :     uint32_t    *dst, *dst_line, d;
    5177             :     uint32_t    *mask, *mask_line;
    5178             :     uint32_t    m;
    5179             :     int src_stride, mask_stride, dst_stride;
    5180             :     int32_t w;
    5181             : 
    5182             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    5183             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5184             :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    5185             : 
    5186           0 :     PIXMAN_IMAGE_GET_LINE (
    5187             :         dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    5188           0 :     PIXMAN_IMAGE_GET_LINE (
    5189             :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    5190           0 :     PIXMAN_IMAGE_GET_LINE (
    5191             :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    5192             : 
    5193           0 :     while (height--)
    5194             :     {
    5195           0 :         src = src_line;
    5196           0 :         src_line += src_stride;
    5197           0 :         dst = dst_line;
    5198           0 :         dst_line += dst_stride;
    5199           0 :         mask = mask_line;
    5200           0 :         mask_line += mask_stride;
    5201             : 
    5202           0 :         w = width;
    5203             : 
    5204           0 :         while (w && (uintptr_t)dst & 15)
    5205             :         {
    5206             :             uint32_t sa;
    5207             : 
    5208           0 :             s = *src++;
    5209           0 :             m = (*mask++) >> 24;
    5210           0 :             d = *dst;
    5211             : 
    5212           0 :             sa = s >> 24;
    5213             : 
    5214           0 :             if (m)
    5215             :             {
    5216           0 :                 if (sa == 0xff && m == 0xff)
    5217             :                 {
    5218           0 :                     *dst = s;
    5219             :                 }
    5220             :                 else
    5221             :                 {
    5222             :                     __m128i ms, md, ma, msa;
    5223             : 
    5224           0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5225           0 :                     ms = unpack_32_1x128 (s);
    5226           0 :                     md = unpack_32_1x128 (d);
    5227             : 
    5228           0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5229             : 
    5230           0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5231             :                 }
    5232             :             }
    5233             : 
    5234           0 :             dst++;
    5235           0 :             w--;
    5236             :         }
    5237             : 
    5238           0 :         while (w >= 4)
    5239             :         {
    5240           0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    5241             : 
    5242           0 :             if (!is_transparent (xmm_mask))
    5243             :             {
    5244           0 :                 xmm_src = load_128_unaligned ((__m128i*)src);
    5245             : 
    5246           0 :                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
    5247             :                 {
    5248           0 :                     save_128_aligned ((__m128i *)dst, xmm_src);
    5249             :                 }
    5250             :                 else
    5251             :                 {
    5252           0 :                     xmm_dst = load_128_aligned ((__m128i *)dst);
    5253             : 
    5254             :                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5255             :                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    5256             :                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5257             : 
    5258           0 :                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
    5259           0 :                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    5260             : 
    5261             :                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
    5262             :                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
    5263             : 
    5264           0 :                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5265             :                 }
    5266             :             }
    5267             : 
    5268           0 :             src += 4;
    5269           0 :             dst += 4;
    5270           0 :             mask += 4;
    5271           0 :             w -= 4;
    5272             :         }
    5273             : 
    5274           0 :         while (w)
    5275             :         {
    5276             :             uint32_t sa;
    5277             : 
    5278           0 :             s = *src++;
    5279           0 :             m = (*mask++) >> 24;
    5280           0 :             d = *dst;
    5281             : 
    5282           0 :             sa = s >> 24;
    5283             : 
    5284           0 :             if (m)
    5285             :             {
    5286           0 :                 if (sa == 0xff && m == 0xff)
    5287             :                 {
    5288           0 :                     *dst = s;
    5289             :                 }
    5290             :                 else
    5291             :                 {
    5292             :                     __m128i ms, md, ma, msa;
    5293             : 
    5294           0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5295           0 :                     ms = unpack_32_1x128 (s);
    5296           0 :                     md = unpack_32_1x128 (d);
    5297             : 
    5298           0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5299             : 
    5300           0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5301             :                 }
    5302             :             }
    5303             : 
    5304           0 :             dst++;
    5305           0 :             w--;
    5306             :         }
    5307             :     }
    5308             : 
    5309           0 : }
    5310             : 
    5311             : /* A variant of 'sse2_combine_over_u' with minor tweaks */
    5312             : static force_inline void
    5313             : scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
    5314             :                                              const uint32_t* ps,
    5315             :                                              int32_t         w,
    5316             :                                              pixman_fixed_t  vx,
    5317             :                                              pixman_fixed_t  unit_x,
    5318             :                                              pixman_fixed_t  src_width_fixed,
    5319             :                                              pixman_bool_t   fully_transparent_src)
    5320             : {
    5321             :     uint32_t s, d;
    5322           0 :     const uint32_t* pm = NULL;
    5323             : 
    5324             :     __m128i xmm_dst_lo, xmm_dst_hi;
    5325             :     __m128i xmm_src_lo, xmm_src_hi;
    5326             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    5327             : 
    5328           0 :     if (fully_transparent_src)
    5329           0 :         return;
    5330             : 
    5331             :     /* Align dst on a 16-byte boundary */
    5332           0 :     while (w && ((uintptr_t)pd & 15))
    5333             :     {
    5334           0 :         d = *pd;
    5335           0 :         s = combine1 (ps + pixman_fixed_to_int (vx), pm);
    5336           0 :         vx += unit_x;
    5337           0 :         while (vx >= 0)
    5338           0 :             vx -= src_width_fixed;
    5339             : 
    5340           0 :         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
    5341           0 :         if (pm)
    5342           0 :             pm++;
    5343           0 :         w--;
    5344             :     }
    5345             : 
    5346           0 :     while (w >= 4)
    5347             :     {
    5348             :         __m128i tmp;
    5349             :         uint32_t tmp1, tmp2, tmp3, tmp4;
    5350             : 
    5351           0 :         tmp1 = *(ps + pixman_fixed_to_int (vx));
    5352           0 :         vx += unit_x;
    5353           0 :         while (vx >= 0)
    5354           0 :             vx -= src_width_fixed;
    5355           0 :         tmp2 = *(ps + pixman_fixed_to_int (vx));
    5356           0 :         vx += unit_x;
    5357           0 :         while (vx >= 0)
    5358           0 :             vx -= src_width_fixed;
    5359           0 :         tmp3 = *(ps + pixman_fixed_to_int (vx));
    5360           0 :         vx += unit_x;
    5361           0 :         while (vx >= 0)
    5362           0 :             vx -= src_width_fixed;
    5363           0 :         tmp4 = *(ps + pixman_fixed_to_int (vx));
    5364           0 :         vx += unit_x;
    5365           0 :         while (vx >= 0)
    5366           0 :             vx -= src_width_fixed;
    5367             : 
    5368           0 :         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
    5369             : 
    5370           0 :         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
    5371             : 
    5372           0 :         if (is_opaque (xmm_src_hi))
    5373             :         {
    5374           0 :             save_128_aligned ((__m128i*)pd, xmm_src_hi);
    5375             :         }
    5376           0 :         else if (!is_zero (xmm_src_hi))
    5377             :         {
    5378           0 :             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    5379             : 
    5380           0 :             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    5381           0 :             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    5382             : 
    5383           0 :             expand_alpha_2x128 (
    5384             :                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
    5385             : 
    5386             :             over_2x128 (&xmm_src_lo, &xmm_src_hi,
    5387             :                         &xmm_alpha_lo, &xmm_alpha_hi,
    5388             :                         &xmm_dst_lo, &xmm_dst_hi);
    5389             : 
    5390             :             /* rebuild the 4 pixel data and save */
    5391           0 :             save_128_aligned ((__m128i*)pd,
    5392             :                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5393             :         }
    5394             : 
    5395           0 :         w -= 4;
    5396           0 :         pd += 4;
    5397           0 :         if (pm)
    5398           0 :             pm += 4;
    5399             :     }
    5400             : 
    5401           0 :     while (w)
    5402             :     {
    5403           0 :         d = *pd;
    5404           0 :         s = combine1 (ps + pixman_fixed_to_int (vx), pm);
    5405           0 :         vx += unit_x;
    5406           0 :         while (vx >= 0)
    5407           0 :             vx -= src_width_fixed;
    5408             : 
    5409           0 :         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
    5410           0 :         if (pm)
    5411           0 :             pm++;
    5412             : 
    5413           0 :         w--;
    5414             :     }
    5415             : }
    5416             : 
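The scanline above walks the source with a 16.16 fixed-point x coordinate: pixman_fixed_to_int (vx) is simply vx >> 16, unit_x is the per-destination-pixel step, and the 'while (vx >= 0) vx -= src_width_fixed' loops wrap the coordinate for repeating sources. A tiny, self-contained illustration of the stepping (the local macro definitions and the 8-to-12 scale factor are assumptions made for the example):

#include <stdint.h>
#include <stdio.h>

typedef int32_t pixman_fixed_16_16_t;

#define pixman_int_to_fixed(i) ((pixman_fixed_16_16_t) ((i) << 16))
#define pixman_fixed_to_int(f) ((int) ((f) >> 16))

int
main (void)
{
    /* Scale an 8-pixel-wide source across 12 destination pixels. */
    pixman_fixed_16_16_t unit_x = pixman_int_to_fixed (8) / 12;
    pixman_fixed_16_16_t vx     = unit_x / 2;  /* sample near pixel centres */
    int i;

    for (i = 0; i < 12; i++)
    {
        printf ("dst %2d reads src[%d]\n", i, pixman_fixed_to_int (vx));
        vx += unit_x;
    }

    return 0;
}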
    5417           0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
    5418             :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5419             :                        uint32_t, uint32_t, COVER)
    5420           0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
    5421             :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5422             :                        uint32_t, uint32_t, NONE)
    5423           0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
    5424             :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5425             :                        uint32_t, uint32_t, PAD)
    5426           0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
    5427             :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5428             :                        uint32_t, uint32_t, NORMAL)
    5429             : 
    5430             : static force_inline void
    5431             : scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
    5432             :                                                uint32_t *       dst,
    5433             :                                                const uint32_t * src,
    5434             :                                                int32_t          w,
    5435             :                                                pixman_fixed_t   vx,
    5436             :                                                pixman_fixed_t   unit_x,
    5437             :                                                pixman_fixed_t   src_width_fixed,
    5438             :                                                pixman_bool_t    zero_src)
    5439             : {
    5440             :     __m128i xmm_mask;
    5441             :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    5442             :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5443             :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    5444             : 
    5445           0 :     if (zero_src || (*mask >> 24) == 0)
    5446           0 :         return;
    5447             : 
    5448           0 :     xmm_mask = create_mask_16_128 (*mask >> 24);
    5449             : 
    5450           0 :     while (w && (uintptr_t)dst & 15)
    5451             :     {
    5452           0 :         uint32_t s = *(src + pixman_fixed_to_int (vx));
    5453           0 :         vx += unit_x;
    5454           0 :         while (vx >= 0)
    5455           0 :             vx -= src_width_fixed;
    5456             : 
    5457           0 :         if (s)
    5458             :         {
    5459           0 :             uint32_t d = *dst;
    5460             : 
    5461           0 :             __m128i ms = unpack_32_1x128 (s);
    5462           0 :             __m128i alpha     = expand_alpha_1x128 (ms);
    5463           0 :             __m128i dest      = xmm_mask;
    5464           0 :             __m128i alpha_dst = unpack_32_1x128 (d);
    5465             : 
    5466           0 :             *dst = pack_1x128_32 (
    5467             :                 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    5468             :         }
    5469           0 :         dst++;
    5470           0 :         w--;
    5471             :     }
    5472             : 
    5473           0 :     while (w >= 4)
    5474             :     {
    5475             :         uint32_t tmp1, tmp2, tmp3, tmp4;
    5476             : 
    5477           0 :         tmp1 = *(src + pixman_fixed_to_int (vx));
    5478           0 :         vx += unit_x;
    5479           0 :         while (vx >= 0)
    5480           0 :             vx -= src_width_fixed;
    5481           0 :         tmp2 = *(src + pixman_fixed_to_int (vx));
    5482           0 :         vx += unit_x;
    5483           0 :         while (vx >= 0)
    5484           0 :             vx -= src_width_fixed;
    5485           0 :         tmp3 = *(src + pixman_fixed_to_int (vx));
    5486           0 :         vx += unit_x;
    5487           0 :         while (vx >= 0)
    5488           0 :             vx -= src_width_fixed;
    5489           0 :         tmp4 = *(src + pixman_fixed_to_int (vx));
    5490           0 :         vx += unit_x;
    5491           0 :         while (vx >= 0)
    5492           0 :             vx -= src_width_fixed;
    5493             : 
    5494           0 :         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
    5495             : 
    5496           0 :         if (!is_zero (xmm_src))
    5497             :         {
    5498           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    5499             : 
    5500             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5501             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5502           0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    5503             :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    5504             : 
    5505             :             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    5506             :                            &xmm_alpha_lo, &xmm_alpha_hi,
    5507             :                            &xmm_mask, &xmm_mask,
    5508             :                            &xmm_dst_lo, &xmm_dst_hi);
    5509             : 
    5510           0 :             save_128_aligned (
    5511             :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5512             :         }
    5513             : 
    5514           0 :         dst += 4;
    5515           0 :         w -= 4;
    5516             :     }
    5517             : 
    5518           0 :     while (w)
    5519             :     {
    5520           0 :         uint32_t s = *(src + pixman_fixed_to_int (vx));
    5521           0 :         vx += unit_x;
    5522           0 :         while (vx >= 0)
    5523           0 :             vx -= src_width_fixed;
    5524             : 
    5525           0 :         if (s)
    5526             :         {
    5527           0 :             uint32_t d = *dst;
    5528             : 
    5529           0 :             __m128i ms = unpack_32_1x128 (s);
    5530           0 :             __m128i alpha = expand_alpha_1x128 (ms);
    5531           0 :             __m128i mask  = xmm_mask;
    5532           0 :             __m128i dest  = unpack_32_1x128 (d);
    5533             : 
    5534           0 :             *dst = pack_1x128_32 (
    5535             :                 in_over_1x128 (&ms, &alpha, &mask, &dest));
    5536             :         }
    5537             : 
    5538           0 :         dst++;
    5539           0 :         w--;
    5540             :     }
    5541             : 
    5542             : }
    5543             : 
    5544           0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
    5545             :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5546             :                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
    5547           0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
    5548             :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5549             :                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
    5550           0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
    5551             :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5552             :                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
    5553           0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
    5554             :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5555             :                               uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
    5556             : 
    5557             : #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
    5558             : 
    5559             : #define BILINEAR_DECLARE_VARIABLES                                              \
    5560             :     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);      \
    5561             :     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);      \
    5562             :     const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
    5563             :     const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);           \
    5564             :     const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
    5565             :     const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);           \
    5566             :     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,       \
    5567             :                                           unit_x, unit_x, unit_x, unit_x);      \
    5568             :     const __m128i xmm_zero = _mm_setzero_si128 ();                              \
    5569             :     __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
    5570             : 
    5571             : #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                     \
    5572             : do {                                                                            \
    5573             :     __m128i xmm_wh, xmm_lo, xmm_hi, a;                                          \
    5574             :     /* fetch 2x2 pixel block into sse2 registers */                             \
    5575             :     __m128i tltr = _mm_loadl_epi64 (                                            \
    5576             :                             (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
    5577             :     __m128i blbr = _mm_loadl_epi64 (                                            \
    5578             :                             (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);      \
    5579             :     vx += unit_x;                                                               \
    5580             :     /* vertical interpolation */                                                \
    5581             :     a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),     \
    5582             :                                         xmm_wt),                                \
    5583             :                        _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),     \
    5584             :                                         xmm_wb));                               \
    5585             :     if (BILINEAR_INTERPOLATION_BITS < 8)                                     \
    5586             :     {                                                                           \
    5587             :         /* calculate horizontal weights */                                      \
    5588             :         xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,            \
    5589             :                    _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));  \
    5590             :         xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  \
    5591             :         /* horizontal interpolation */                                          \
    5592             :         a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (             \
    5593             :                 a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);                      \
    5594             :     }                                                                           \
    5595             :     else                                                                        \
    5596             :     {                                                                           \
    5597             :         /* calculate horizontal weights */                                      \
    5598             :         xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,            \
    5599             :                 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));     \
    5600             :         xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  \
    5601             :         /* horizontal interpolation */                                          \
    5602             :         xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                   \
    5603             :         xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                   \
    5604             :         a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                 \
    5605             :                            _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                \
    5606             :     }                                                                           \
    5607             :     /* shift and pack the result */                                             \
    5608             :     a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);                    \
    5609             :     a = _mm_packs_epi32 (a, a);                                                 \
    5610             :     a = _mm_packus_epi16 (a, a);                                                \
    5611             :     pix = _mm_cvtsi128_si32 (a);                                                \
    5612             : } while (0)
    5613             : 
    5614             : #define BILINEAR_SKIP_ONE_PIXEL()                                               \
    5615             : do {                                                                            \
    5616             :     vx += unit_x;                                                               \
    5617             :     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                      \
    5618             : } while (0)
    5619             : 
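/* A scalar reference sketch (illustration only; hypothetical helper, not part
 * of pixman) of what BILINEAR_INTERPOLATE_ONE_PIXEL computes for one color
 * channel.  wt and wb are the vertical weights (wt + wb equals
 * 1 << BILINEAR_INTERPOLATION_BITS), wx is the fractional x position in the
 * same range, and tl/tr/bl/br are the four samples of the 2x2 block. */
static force_inline uint32_t
bilinear_channel_reference (uint32_t tl, uint32_t tr,
                            uint32_t bl, uint32_t br,
                            int wt, int wb, int wx)
{
    /* vertical pass: blend the top and bottom rows */
    uint32_t left  = tl * wt + bl * wb;
    uint32_t right = tr * wt + br * wb;

    /* horizontal pass: blend left and right, then drop the weight scale */
    return (left * ((1 << BILINEAR_INTERPOLATION_BITS) - wx) + right * wx)
               >> (2 * BILINEAR_INTERPOLATION_BITS);
}
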
    5620             : static force_inline void
    5621             : scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
    5622             :                                              const uint32_t * mask,
    5623             :                                              const uint32_t * src_top,
    5624             :                                              const uint32_t * src_bottom,
    5625             :                                              int32_t          w,
    5626             :                                              int              wt,
    5627             :                                              int              wb,
    5628             :                                              pixman_fixed_t   vx,
    5629             :                                              pixman_fixed_t   unit_x,
    5630             :                                              pixman_fixed_t   max_vx,
    5631             :                                              pixman_bool_t    zero_src)
    5632             : {
    5633           0 :     BILINEAR_DECLARE_VARIABLES;
    5634             :     uint32_t pix1, pix2, pix3, pix4;
    5635             : 
    5636           0 :     while ((w -= 4) >= 0)
    5637             :     {
    5638           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5639           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    5640           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
    5641           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
    5642           0 :         *dst++ = pix1;
    5643           0 :         *dst++ = pix2;
    5644           0 :         *dst++ = pix3;
    5645           0 :         *dst++ = pix4;
    5646             :     }
    5647             : 
    5648           0 :     if (w & 2)
    5649             :     {
    5650           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5651           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    5652           0 :         *dst++ = pix1;
    5653           0 :         *dst++ = pix2;
    5654             :     }
    5655             : 
    5656           0 :     if (w & 1)
    5657             :     {
    5658           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5659           0 :         *dst = pix1;
    5660             :     }
    5661             : 
    5662             : }
    5663             : 
    5664             : /* The extra NULL argument passed to the existing bilinear fast paths indicates
    5665             :  * that no second-pass op function (two-pass processing) is needed */
    5666             : 
    5667           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
    5668             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
    5669             :                                uint32_t, uint32_t, uint32_t,
    5670             :                                COVER, FLAG_NONE)
    5671           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
    5672             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
    5673             :                                uint32_t, uint32_t, uint32_t,
    5674             :                                PAD, FLAG_NONE)
    5675           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
    5676             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
    5677             :                                uint32_t, uint32_t, uint32_t,
    5678             :                                NONE, FLAG_NONE)
    5679           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
    5680             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
    5681             :                                uint32_t, uint32_t, uint32_t,
    5682             :                                NORMAL, FLAG_NONE)
    5683             : 
    5684             : static force_inline void
    5685             : scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
    5686             :                                              const uint32_t * mask,
    5687             :                                              const uint32_t * src_top,
    5688             :                                              const uint32_t * src_bottom,
    5689             :                                              int32_t          w,
    5690             :                                              int              wt,
    5691             :                                              int              wb,
    5692             :                                              pixman_fixed_t   vx_,
    5693             :                                              pixman_fixed_t   unit_x_,
    5694             :                                              pixman_fixed_t   max_vx,
    5695             :                                              pixman_bool_t    zero_src)
    5696             : {
    5697           0 :     intptr_t vx = vx_;
    5698           0 :     intptr_t unit_x = unit_x_;
    5699           0 :     BILINEAR_DECLARE_VARIABLES;
    5700             :     uint32_t pix1, pix2, pix3, pix4;
    5701             : 
    5702           0 :     while (w && ((uintptr_t)dst & 15))
    5703             :     {
    5704           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5705           0 :         *dst++ = pix1 | 0xFF000000;
    5706           0 :         w--;
    5707             :     }
    5708             : 
    5709           0 :     while ((w -= 4) >= 0) {
    5710             :         __m128i xmm_src;
    5711           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5712           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    5713           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
    5714           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
    5715             : 
    5716           0 :         xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
    5717           0 :         _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
    5718           0 :         dst += 4;
    5719             :     }
    5720             : 
    5721           0 :     if (w & 2)
    5722             :     {
    5723           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5724           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    5725           0 :         *dst++ = pix1 | 0xFF000000;
    5726           0 :         *dst++ = pix2 | 0xFF000000;
    5727             :     }
    5728             : 
    5729           0 :     if (w & 1)
    5730             :     {
    5731           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5732           0 :         *dst = pix1 | 0xFF000000;
    5733             :     }
    5734             : }
    5735             : 
    5736           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
    5737             :                                scaled_bilinear_scanline_sse2_x888_8888_SRC, NULL,
    5738             :                                uint32_t, uint32_t, uint32_t,
    5739             :                                COVER, FLAG_NONE)
    5740           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
    5741             :                                scaled_bilinear_scanline_sse2_x888_8888_SRC, NULL,
    5742             :                                uint32_t, uint32_t, uint32_t,
    5743             :                                PAD, FLAG_NONE)
    5744           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
    5745             :                                scaled_bilinear_scanline_sse2_x888_8888_SRC, NULL,
    5746             :                                uint32_t, uint32_t, uint32_t,
    5747             :                                NORMAL, FLAG_NONE)
    5748             : #if 0
    5749             : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
    5750             :                                scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
    5751             :                                uint32_t, uint32_t, uint32_t,
    5752             :                                PAD, FLAG_NONE)
    5753             : #endif
    5754             : static force_inline void
    5755             : scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
    5756             :                                               const uint32_t * mask,
    5757             :                                               const uint32_t * src_top,
    5758             :                                               const uint32_t * src_bottom,
    5759             :                                               int32_t          w,
    5760             :                                               int              wt,
    5761             :                                               int              wb,
    5762             :                                               pixman_fixed_t   vx,
    5763             :                                               pixman_fixed_t   unit_x,
    5764             :                                               pixman_fixed_t   max_vx,
    5765             :                                               pixman_bool_t    zero_src)
    5766             : {
    5767           0 :     BILINEAR_DECLARE_VARIABLES;
    5768             :     uint32_t pix1, pix2, pix3, pix4;
    5769             : 
    5770           0 :     while (w && ((uintptr_t)dst & 15))
    5771             :     {
    5772           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5773             : 
    5774           0 :         if (pix1)
    5775             :         {
    5776           0 :             pix2 = *dst;
    5777           0 :             *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
    5778             :         }
    5779             : 
    5780           0 :         w--;
    5781           0 :         dst++;
    5782             :     }
    5783             : 
    5784           0 :     while (w  >= 4)
    5785             :     {
    5786             :         __m128i xmm_src;
    5787             :         __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
    5788             :         __m128i xmm_alpha_hi, xmm_alpha_lo;
    5789             : 
    5790           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5791           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    5792           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
    5793           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
    5794             : 
    5795           0 :         xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
    5796             : 
    5797           0 :         if (!is_zero (xmm_src))
    5798             :         {
    5799           0 :             if (is_opaque (xmm_src))
    5800             :             {
    5801             :                 save_128_aligned ((__m128i *)dst, xmm_src);
    5802             :             }
    5803             :             else
    5804             :             {
    5805           0 :                 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
    5806             : 
    5807             :                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5808             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5809             : 
    5810           0 :                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
    5811             :                 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
    5812             :                             &xmm_dst_lo, &xmm_dst_hi);
    5813             : 
    5814           0 :                 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5815             :             }
    5816             :         }
    5817             : 
    5818           0 :         w -= 4;
    5819           0 :         dst += 4;
    5820             :     }
    5821             : 
    5822           0 :     while (w)
    5823             :     {
    5824           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5825             : 
    5826           0 :         if (pix1)
    5827             :         {
    5828           0 :             pix2 = *dst;
    5829           0 :             *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
    5830             :         }
    5831             : 
    5832           0 :         w--;
    5833           0 :         dst++;
    5834             :     }
    5835             : }
    5836             : 
    5837           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
    5838             :                                scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
    5839             :                                uint32_t, uint32_t, uint32_t,
    5840             :                                COVER, FLAG_NONE)
    5841           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
    5842             :                                scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
    5843             :                                uint32_t, uint32_t, uint32_t,
    5844             :                                PAD, FLAG_NONE)
    5845           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
    5846             :                                scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
    5847             :                                uint32_t, uint32_t, uint32_t,
    5848             :                                NONE, FLAG_NONE)
    5849           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
    5850             :                                scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
    5851             :                                uint32_t, uint32_t, uint32_t,
    5852             :                                NORMAL, FLAG_NONE)
    5853             : 
    5854             : 
    5855             : /* An example of an SSE2 two-stage bilinear over_8888_0565 fast path, implemented
    5856             :    as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
    5857             : 
    5858           0 : void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
    5859             : {
    5860             :     /* Note: this is not really fast and should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */
    5861           0 :     while (--width >= 0)
    5862             :     {
    5863           0 :         *dst = composite_over_8888_0565pixel (*src, *dst);
    5864           0 :         src++;
    5865           0 :         dst++;
    5866             :     }
    5867           0 : }
    5868             : 
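/* A hedged illustration (hypothetical helper, not part of pixman) of how the
 * two stages chain for a single scanline: the bilinear SRC scanline above
 * first renders a8r8g8b8 pixels into a temporary buffer, and
 * op_bilinear_over_8888_0565 then composites that buffer over the r5g6b5
 * destination.  The real glue code is generated by
 * FAST_BILINEAR_MAINLOOP_COMMON in pixman-inlines.h. */
static void
example_two_stage_bilinear_over_0565 (uint16_t       *dst,
                                      const uint32_t *src_top,
                                      const uint32_t *src_bottom,
                                      int32_t         w,
                                      int             wt,
                                      int             wb,
                                      pixman_fixed_t  vx,
                                      pixman_fixed_t  unit_x)
{
    uint32_t tmp[128];     /* hypothetical scratch scanline; assumes w <= 128 */

    scaled_bilinear_scanline_sse2_8888_8888_SRC (tmp, NULL, src_top, src_bottom,
                                                 w, wt, wb, vx, unit_x, 0, FALSE);
    op_bilinear_over_8888_0565 (dst, NULL, tmp, w);
}
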
    5869           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
    5870             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
    5871             :                                uint32_t, uint32_t, uint16_t,
    5872             :                                COVER, FLAG_NONE)
    5873           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
    5874             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
    5875             :                                uint32_t, uint32_t, uint16_t,
    5876             :                                PAD, FLAG_NONE)
    5877           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
    5878             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
    5879             :                                uint32_t, uint32_t, uint16_t,
    5880             :                                NONE, FLAG_NONE)
    5881           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
    5882             :                                scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
    5883             :                                uint32_t, uint32_t, uint16_t,
    5884             :                                NORMAL, FLAG_NONE)
    5885             : 
    5886             : /*****************************/
    5887             : 
    5888             : static force_inline void
    5889             : scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
    5890             :                                                 const uint8_t  * mask,
    5891             :                                                 const uint32_t * src_top,
    5892             :                                                 const uint32_t * src_bottom,
    5893             :                                                 int32_t          w,
    5894             :                                                 int              wt,
    5895             :                                                 int              wb,
    5896             :                                                 pixman_fixed_t   vx,
    5897             :                                                 pixman_fixed_t   unit_x,
    5898             :                                                 pixman_fixed_t   max_vx,
    5899             :                                                 pixman_bool_t    zero_src)
    5900             : {
    5901           0 :     BILINEAR_DECLARE_VARIABLES;
    5902             :     uint32_t pix1, pix2, pix3, pix4;
    5903             :     uint32_t m;
    5904             : 
    5905           0 :     while (w && ((uintptr_t)dst & 15))
    5906             :     {
    5907             :         uint32_t sa;
    5908             : 
    5909           0 :         m = (uint32_t) *mask++;
    5910             : 
    5911           0 :         if (m)
    5912             :         {
    5913           0 :             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5914           0 :             sa = pix1 >> 24;
    5915             : 
    5916           0 :             if (sa == 0xff && m == 0xff)
    5917             :             {
    5918           0 :                 *dst = pix1;
    5919             :             }
    5920             :             else
    5921             :             {
    5922             :                 __m128i ms, md, ma, msa;
    5923             : 
    5924           0 :                 pix2 = *dst;
    5925           0 :                 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5926           0 :                 ms = unpack_32_1x128 (pix1);
    5927           0 :                 md = unpack_32_1x128 (pix2);
    5928             : 
    5929           0 :                 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5930             : 
    5931           0 :                 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5932             :             }
    5933             :         }
    5934             :         else
    5935             :         {
    5936           0 :             BILINEAR_SKIP_ONE_PIXEL ();
    5937             :         }
    5938             : 
    5939           0 :         w--;
    5940           0 :         dst++;
    5941             :     }
    5942             : 
    5943           0 :     while (w >= 4)
    5944             :     {
    5945             :         __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    5946             :         __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5947             :         __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    5948             : 
    5949           0 :         m = *(uint32_t*)mask;
    5950             : 
    5951           0 :         if (m)
    5952             :         {
    5953           0 :             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    5954           0 :             BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    5955           0 :             BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
    5956           0 :             BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
    5957             : 
    5958           0 :             xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
    5959             : 
    5960           0 :             if (m == 0xffffffff && is_opaque (xmm_src))
    5961             :             {
    5962             :                 save_128_aligned ((__m128i *)dst, xmm_src);
    5963             :             }
    5964             :             else
    5965             :             {
    5966           0 :                 xmm_dst = load_128_aligned ((__m128i *)dst);
    5967             : 
    5968           0 :                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
    5969             : 
    5970             :                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5971             :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    5972             :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5973             : 
    5974           0 :                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
    5975           0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    5976             : 
    5977             :                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
    5978             :                                &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
    5979             : 
    5980           0 :                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5981             :             }
    5982             :         }
    5983             :         else
    5984             :         {
    5985           0 :             BILINEAR_SKIP_ONE_PIXEL ();
    5986           0 :             BILINEAR_SKIP_ONE_PIXEL ();
    5987           0 :             BILINEAR_SKIP_ONE_PIXEL ();
    5988           0 :             BILINEAR_SKIP_ONE_PIXEL ();
    5989             :         }
    5990             : 
    5991           0 :         w -= 4;
    5992           0 :         dst += 4;
    5993           0 :         mask += 4;
    5994             :     }
    5995             : 
    5996           0 :     while (w)
    5997             :     {
    5998             :         uint32_t sa;
    5999             : 
    6000           0 :         m = (uint32_t) *mask++;
    6001             : 
    6002           0 :         if (m)
    6003             :         {
    6004           0 :             BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    6005           0 :             sa = pix1 >> 24;
    6006             : 
    6007           0 :             if (sa == 0xff && m == 0xff)
    6008             :             {
    6009           0 :                 *dst = pix1;
    6010             :             }
    6011             :             else
    6012             :             {
    6013             :                 __m128i ms, md, ma, msa;
    6014             : 
    6015           0 :                 pix2 = *dst;
    6016           0 :                 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    6017           0 :                 ms = unpack_32_1x128 (pix1);
    6018           0 :                 md = unpack_32_1x128 (pix2);
    6019             : 
    6020           0 :                 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    6021             : 
    6022           0 :                 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    6023             :             }
    6024             :         }
    6025             :         else
    6026             :         {
    6027           0 :             BILINEAR_SKIP_ONE_PIXEL ();
    6028             :         }
    6029             : 
    6030           0 :         w--;
    6031           0 :         dst++;
    6032             :     }
    6033             : }
    6034             : 
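/* A hedged scalar sketch (illustration only, not the pixman-combine32.h
 * implementation) of the per-channel "in_over" math used above: the source is
 * first multiplied by the mask alpha (IN), then composited over the
 * destination using the resulting effective alpha (OVER).  Assumes
 * premultiplied components, i.e. s <= sa. */
static force_inline uint8_t
example_in_over_channel (uint8_t s, uint8_t sa, uint8_t ma, uint8_t d)
{
    uint32_t in_s  = (s  * ma + 127) / 255;   /* s IN mask alpha         */
    uint32_t in_sa = (sa * ma + 127) / 255;   /* effective source alpha  */

    return (uint8_t) (in_s + (d * (255 - in_sa) + 127) / 255);  /* OVER d */
}
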
    6035           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
    6036             :                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
    6037             :                                uint32_t, uint8_t, uint32_t,
    6038             :                                COVER, FLAG_HAVE_NON_SOLID_MASK)
    6039           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
    6040             :                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
    6041             :                                uint32_t, uint8_t, uint32_t,
    6042             :                                PAD, FLAG_HAVE_NON_SOLID_MASK)
    6043           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
    6044             :                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
    6045             :                                uint32_t, uint8_t, uint32_t,
    6046             :                                NONE, FLAG_HAVE_NON_SOLID_MASK)
    6047           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
    6048             :                                scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
    6049             :                                uint32_t, uint8_t, uint32_t,
    6050             :                                NORMAL, FLAG_HAVE_NON_SOLID_MASK)
    6051             : 
    6052             : static force_inline void
    6053             : scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
    6054             :                                                 const uint32_t * mask,
    6055             :                                                 const uint32_t * src_top,
    6056             :                                                 const uint32_t * src_bottom,
    6057             :                                                 int32_t          w,
    6058             :                                                 int              wt,
    6059             :                                                 int              wb,
    6060             :                                                 pixman_fixed_t   vx,
    6061             :                                                 pixman_fixed_t   unit_x,
    6062             :                                                 pixman_fixed_t   max_vx,
    6063             :                                                 pixman_bool_t    zero_src)
    6064             : {
    6065           0 :     BILINEAR_DECLARE_VARIABLES;
    6066             :     uint32_t pix1, pix2, pix3, pix4;
    6067             :     __m128i xmm_mask;
    6068             : 
    6069           0 :     if (zero_src || (*mask >> 24) == 0)
    6070           0 :         return;
    6071             : 
    6072           0 :     xmm_mask = create_mask_16_128 (*mask >> 24);
    6073             : 
    6074           0 :     while (w && ((uintptr_t)dst & 15))
    6075             :     {
    6076           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    6077           0 :         if (pix1)
    6078             :         {
    6079           0 :                 uint32_t d = *dst;
    6080             : 
    6081           0 :                 __m128i ms = unpack_32_1x128 (pix1);
    6082           0 :                 __m128i alpha     = expand_alpha_1x128 (ms);
    6083           0 :                 __m128i dest      = xmm_mask;
    6084           0 :                 __m128i alpha_dst = unpack_32_1x128 (d);
    6085             : 
    6086           0 :                 *dst = pack_1x128_32
    6087             :                         (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    6088             :         }
    6089             : 
    6090           0 :         dst++;
    6091           0 :         w--;
    6092             :     }
    6093             : 
    6094           0 :     while (w >= 4)
    6095             :     {
    6096           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    6097           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
    6098           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
    6099           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
    6100             : 
    6101           0 :         if (pix1 | pix2 | pix3 | pix4)
    6102             :         {
    6103             :             __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    6104             :             __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    6105             :             __m128i xmm_alpha_lo, xmm_alpha_hi;
    6106             : 
    6107           0 :             xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
    6108             : 
    6109           0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    6110             : 
    6111             :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    6112             :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    6113           0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    6114             :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    6115             : 
    6116             :             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    6117             :                            &xmm_alpha_lo, &xmm_alpha_hi,
    6118             :                            &xmm_mask, &xmm_mask,
    6119             :                            &xmm_dst_lo, &xmm_dst_hi);
    6120             : 
    6121           0 :             save_128_aligned
    6122             :                 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    6123             :         }
    6124             : 
    6125           0 :         dst += 4;
    6126           0 :         w -= 4;
    6127             :     }
    6128             : 
    6129           0 :     while (w)
    6130             :     {
    6131           0 :         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
    6132           0 :         if (pix1)
    6133             :         {
    6134           0 :                 uint32_t d = *dst;
    6135             : 
    6136           0 :                 __m128i ms = unpack_32_1x128 (pix1);
    6137           0 :                 __m128i alpha     = expand_alpha_1x128 (ms);
    6138           0 :                 __m128i dest      = xmm_mask;
    6139           0 :                 __m128i alpha_dst = unpack_32_1x128 (d);
    6140             : 
    6141           0 :                 *dst = pack_1x128_32
    6142             :                         (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    6143             :         }
    6144             : 
    6145           0 :         dst++;
    6146           0 :         w--;
    6147             :     }
    6148             : }
    6149             : 
    6150           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
    6151             :                                scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
    6152             :                                uint32_t, uint32_t, uint32_t,
    6153             :                                COVER, FLAG_HAVE_SOLID_MASK)
    6154           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
    6155             :                                scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
    6156             :                                uint32_t, uint32_t, uint32_t,
    6157             :                                PAD, FLAG_HAVE_SOLID_MASK)
    6158           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
    6159             :                                scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
    6160             :                                uint32_t, uint32_t, uint32_t,
    6161             :                                NONE, FLAG_HAVE_SOLID_MASK)
    6162           0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
    6163             :                                scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
    6164             :                                uint32_t, uint32_t, uint32_t,
    6165             :                                NORMAL, FLAG_HAVE_SOLID_MASK)
    6166             : 
    6167             : static const pixman_fast_path_t sse2_fast_paths[] =
    6168             : {
    6169             :     /* PIXMAN_OP_OVER */
    6170             :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    6171             :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    6172             :     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    6173             :     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    6174             :     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    6175             :     PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    6176             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    6177             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    6178             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    6179             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    6180             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    6181             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    6182             :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    6183             :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    6184             :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    6185             :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    6186             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    6187             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    6188             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    6189             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    6190             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    6191             :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    6192             :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    6193             :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    6194             :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    6195             :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    6196             :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    6197             :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    6198             :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    6199             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    6200             :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    6201             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    6202             :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    6203             :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    6204             :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    6205             :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    6206             :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    6207             :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    6208             :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    6209             :     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    6210             :     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    6211             :     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    6212             :     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    6213             :     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    6214             :     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    6215             :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    6216             :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    6217             : 
    6218             :     /* PIXMAN_OP_OVER_REVERSE */
    6219             :     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    6220             :     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
    6221             : 
    6222             :     /* PIXMAN_OP_ADD */
    6223             :     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    6224             :     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    6225             :     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    6226             :     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    6227             :     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    6228             :     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    6229             :     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    6230             :     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    6231             :     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    6232             :     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    6233             :     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    6234             :     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    6235             :     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    6236             :     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
    6237             : 
    6238             :     /* PIXMAN_OP_SRC */
    6239             :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    6240             :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    6241             :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    6242             :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    6243             :     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    6244             :     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    6245             :     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    6246             :     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    6247             :     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    6248             :     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    6249             :     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    6250             :     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    6251             :     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    6252             :     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    6253             :     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    6254             :     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    6255             :     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    6256             :     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
    6257             : 
    6258             :     /* PIXMAN_OP_IN */
    6259             :     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    6260             :     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    6261             :     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
    6262             : 
    6263             :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6264             :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6265             :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    6266             :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    6267             :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6268             :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6269             :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    6270             :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    6271             :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6272             :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6273             :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    6274             :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    6275             :     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6276             :     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6277             :     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    6278             :     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    6279             : 
    6280             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    6281             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    6282             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    6283             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    6284             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    6285             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    6286             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    6287             :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    6288             : 
    6289             :     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    6290             :     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6291             :     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6292             :     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    6293             :     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6294             :     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6295             : 
    6296             :     SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    6297             :     SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    6298             :     SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    6299             :     SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    6300             :     SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    6301             :     SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    6302             : 
    6303             :     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    6304             :     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    6305             :     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    6306             :     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    6307             : 
    6308             :     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    6309             :     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    6310             :     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    6311             :     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    6312             : 
    6313             :     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    6314             :     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    6315             :     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    6316             :     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
    6317             : 
    6318             :     /* the entries needed by the two-stage bilinear over_8888_0565 fast path above are added to the fast path table here */
    6319             : 
    6320             :     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
    6321             :     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
    6322             : 
    6323             :     { PIXMAN_OP_NONE },
    6324             : };
    6325             : 
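/* A hedged usage sketch (hypothetical, not part of pixman) of the kind of
 * client composite call the SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,
 * r5g6b5, ...) entries above are meant to catch: a non-identity scale plus a
 * bilinear filter on the source is what steers the fast path lookup towards
 * the scaled bilinear entries rather than the unscaled ones. */
static void
example_trigger_bilinear_over_0565 (pixman_image_t *src_argb32,
                                    pixman_image_t *dst_r5g6b5,
                                    int width, int height)
{
    pixman_transform_t scale;

    pixman_transform_init_scale (&scale,
                                 pixman_double_to_fixed (2.0),
                                 pixman_double_to_fixed (2.0));
    pixman_image_set_transform (src_argb32, &scale);
    pixman_image_set_filter (src_argb32, PIXMAN_FILTER_BILINEAR, NULL, 0);

    pixman_image_composite32 (PIXMAN_OP_OVER, src_argb32, NULL, dst_r5g6b5,
                              0, 0, 0, 0, 0, 0, width, height);
}
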
    6326             : static uint32_t *
    6327           0 : sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
    6328             : {
    6329           0 :     int w = iter->width;
    6330           0 :     __m128i ff000000 = mask_ff000000;
    6331           0 :     uint32_t *dst = iter->buffer;
    6332           0 :     uint32_t *src = (uint32_t *)iter->bits;
    6333             : 
    6334           0 :     iter->bits += iter->stride;
    6335             : 
    6336           0 :     while (w && ((uintptr_t)dst) & 0x0f)
    6337             :     {
    6338           0 :         *dst++ = (*src++) | 0xff000000;
    6339           0 :         w--;
    6340             :     }
    6341             : 
    6342           0 :     while (w >= 4)
    6343             :     {
    6344           0 :         save_128_aligned (
    6345             :             (__m128i *)dst, _mm_or_si128 (
    6346             :                 load_128_unaligned ((__m128i *)src), ff000000));
    6347             : 
    6348           0 :         dst += 4;
    6349           0 :         src += 4;
    6350           0 :         w -= 4;
    6351             :     }
    6352             : 
    6353           0 :     while (w)
    6354             :     {
    6355           0 :         *dst++ = (*src++) | 0xff000000;
    6356           0 :         w--;
    6357             :     }
    6358             : 
    6359           0 :     return iter->buffer;
    6360             : }
    6361             : 
    6362             : static uint32_t *
    6363           0 : sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
    6364             : {
    6365           0 :     int w = iter->width;
    6366           0 :     uint32_t *dst = iter->buffer;
    6367           0 :     uint16_t *src = (uint16_t *)iter->bits;
    6368           0 :     __m128i ff000000 = mask_ff000000;
    6369             : 
    6370           0 :     iter->bits += iter->stride;
    6371             : 
    6372           0 :     while (w && ((uintptr_t)dst) & 0x0f)
    6373             :     {
    6374           0 :         uint16_t s = *src++;
    6375             : 
    6376           0 :         *dst++ = convert_0565_to_8888 (s);
    6377           0 :         w--;
    6378             :     }
    6379             : 
    6380           0 :     while (w >= 8)
    6381             :     {
    6382             :         __m128i lo, hi, s;
    6383             : 
    6384           0 :         s = _mm_loadu_si128 ((__m128i *)src);
    6385             : 
    6386           0 :         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
    6387           0 :         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
    6388             : 
    6389           0 :         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
    6390           0 :         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
    6391             : 
    6392           0 :         dst += 8;
    6393           0 :         src += 8;
    6394           0 :         w -= 8;
    6395             :     }
    6396             : 
    6397           0 :     while (w)
    6398             :     {
    6399           0 :         uint16_t s = *src++;
    6400             : 
    6401           0 :         *dst++ = convert_0565_to_8888 (s);
    6402           0 :         w--;
    6403             :     }
    6404             : 
    6405           0 :     return iter->buffer;
    6406             : }
    6407             : 
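/* A scalar sketch (illustration only, not the pixman-private.h
 * implementation) of the r5g6b5 -> 8888 widening performed by
 * convert_0565_to_8888 and unpack_565_to_8888 above: each component is
 * expanded by replicating its high bits into the new low bits, so that the
 * maximum 5/6-bit value maps to 0xff. */
static force_inline uint32_t
example_expand_0565 (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >>  5) & 0x3f;
    uint32_t b =  s        & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);    /* 5 -> 8 bits */

    return 0xff000000 | (r << 16) | (g << 8) | b;
}
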
    6408             : static uint32_t *
    6409           0 : sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
    6410             : {
    6411           0 :     int w = iter->width;
    6412           0 :     uint32_t *dst = iter->buffer;
    6413           0 :     uint8_t *src = iter->bits;
    6414             :     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
    6415             : 
    6416           0 :     iter->bits += iter->stride;
    6417             : 
    6418           0 :     while (w && (((uintptr_t)dst) & 15))
    6419             :     {
    6420           0 :         *dst++ = *(src++) << 24;
    6421           0 :         w--;
    6422             :     }
    6423             : 
    6424           0 :     while (w >= 16)
    6425             :     {
    6426           0 :         xmm0 = _mm_loadu_si128((__m128i *)src);
    6427             : 
    6428           0 :         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
    6429           0 :         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
    6430           0 :         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
    6431           0 :         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
    6432           0 :         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
    6433           0 :         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
    6434             : 
    6435             :         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
    6436           0 :         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
    6437           0 :         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
    6438           0 :         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
    6439             : 
    6440           0 :         dst += 16;
    6441           0 :         src += 16;
    6442           0 :         w -= 16;
    6443             :     }
    6444             : 
    6445           0 :     while (w)
    6446             :     {
    6447           0 :         *dst++ = *(src++) << 24;
    6448           0 :         w--;
    6449             :     }
    6450             : 
    6451           0 :     return iter->buffer;
    6452             : }
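/* The interleave sequence above is the vector form of the scalar
 * '*dst++ = *src++ << 24' used in the head and tail loops: unpacking a zero
 * byte below each alpha value (epi8) and then a zero word below each 16-bit
 * result (epi16) leaves the alpha in the most significant byte of every
 * 32-bit lane, i.e. an a8r8g8b8 pixel with r = g = b = 0.  A worked example
 * for one lane, purely as an illustration:
 *
 *     source byte               0x7f
 *     after unpacklo_epi8       0x7f00       (alpha in the high byte of a word)
 *     after unpacklo_epi16      0x7f000000   (alpha in the high byte of a dword)
 *
 * which equals (uint32_t) 0x7f << 24.
 */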
    6453             : 
    6454             : typedef struct
    6455             : {
    6456             :     pixman_format_code_t        format;
    6457             :     pixman_iter_get_scanline_t  get_scanline;
    6458             : } fetcher_info_t;
    6459             : 
    6460             : static const fetcher_info_t fetchers[] =
    6461             : {
    6462             :     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
    6463             :     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
    6464             :     { PIXMAN_a8,                sse2_fetch_a8 },
    6465             :     { PIXMAN_null }
    6466             : };
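/* The table above is terminated by the PIXMAN_null entry; sse2_src_iter_init
 * below walks it linearly until it either matches the image's extended format
 * code or reaches that sentinel.
 */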
    6467             : 
    6468             : static pixman_bool_t
    6469           0 : sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
    6470             : {
    6471           0 :     pixman_image_t *image = iter->image;
    6472             : 
    6473             : #define FLAGS                                                           \
    6474             :     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
    6475             :      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
    6476             : 
    6477           0 :     if ((iter->iter_flags & ITER_NARROW)                 &&
    6478           0 :         (iter->image_flags & FLAGS) == FLAGS)
    6479             :     {
    6480             :         const fetcher_info_t *f;
    6481             : 
    6482           0 :         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
    6483             :         {
    6484           0 :             if (image->common.extended_format_code == f->format)
    6485             :             {
    6486           0 :                 uint8_t *b = (uint8_t *)image->bits.bits;
    6487           0 :                 int s = image->bits.rowstride * 4;
    6488             : 
    6489           0 :                 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
    6490           0 :                 iter->stride = s;
    6491             : 
    6492           0 :                 iter->get_scanline = f->get_scanline;
    6493           0 :                 return TRUE;
    6494             :             }
    6495             :         }
    6496             :     }
    6497             : 
    6498           0 :     return FALSE;
    6499             : }
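/* Worked example of the scanline-pointer arithmetic above, with made-up
 * numbers purely for illustration: for a PIXMAN_a8r8g8b8 image whose
 * rowstride is 256 (in uint32_t units), s = 256 * 4 = 1024 bytes per row, so
 * an iterator starting at x = 10, y = 3 fetches from
 *
 *     iter->bits = bits + 1024 * 3 + 10 * 32 / 8 = bits + 3112 bytes,
 *
 * and each get_scanline call then advances iter->bits by iter->stride = 1024.
 */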
    6500             : 
    6501             : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
    6502             : __attribute__((__force_align_arg_pointer__))
    6503             : #endif
    6504             : pixman_implementation_t *
    6505           1 : _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
    6506             : {
    6507           1 :     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
    6508             : 
    6509             :     /* SSE2 constants */
    6510           1 :     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    6511           1 :     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    6512           1 :     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    6513           1 :     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    6514           1 :     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    6515           1 :     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    6516           1 :     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    6517           1 :     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    6518           1 :     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    6519           1 :     mask_0080 = create_mask_16_128 (0x0080);
    6520           1 :     mask_00ff = create_mask_16_128 (0x00ff);
    6521           1 :     mask_0101 = create_mask_16_128 (0x0101);
    6522           1 :     mask_ffff = create_mask_16_128 (0xffff);
    6523           1 :     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    6524           1 :     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    6525           1 :     mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    6526           1 :     mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
    6527             : 
    6528             :     /* Set up function pointers */
    6529           1 :     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    6530           1 :     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    6531           1 :     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    6532           1 :     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    6533           1 :     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    6534           1 :     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    6535           1 :     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    6536           1 :     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    6537           1 :     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    6538           1 :     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
    6539             : 
    6540           1 :     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
    6541             : 
    6542           1 :     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    6543           1 :     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    6544           1 :     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    6545           1 :     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    6546           1 :     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    6547           1 :     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    6548           1 :     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    6549           1 :     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    6550           1 :     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    6551           1 :     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    6552           1 :     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
    6553             : 
    6554           1 :     imp->blt = sse2_blt;
    6555           1 :     imp->fill = sse2_fill;
    6556             : 
    6557           1 :     imp->src_iter_init = sse2_src_iter_init;
    6558             : 
    6559           1 :     return imp;
    6560             : }
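/* A hedged sketch of how this constructor is typically stacked on top of a
 * generic fallback at startup.  The real dispatch lives elsewhere in pixman
 * (pixman-x86.c) and uses CPUID-based feature detection; the
 * have_sse2_sketch / create_implementation_sketch names below are
 * illustrative stand-ins, not actual pixman functions.
 */
static pixman_bool_t
have_sse2_sketch (void)
{
    /* Stand-in for the real CPU feature test; unconditionally TRUE here
     * purely for illustration. */
    return TRUE;
}

static pixman_implementation_t *
create_implementation_sketch (void)
{
    /* Start from the generic C implementation, layer the fast-path
     * implementation on top, then the SSE2 one if the CPU supports it.
     * Unhandled operations fall back down the chain. */
    pixman_implementation_t *imp = _pixman_implementation_create_general ();

    imp = _pixman_implementation_create_fast_path (imp);

    if (have_sse2_sketch ())
        imp = _pixman_implementation_create_sse2 (imp);

    return imp;
}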

Generated by: LCOV version 1.13