Line data Source code
1 : /*
2 : * Copyright 2016 Google Inc.
3 : *
4 : * Use of this source code is governed by a BSD-style license that can be
5 : * found in the LICENSE file.
6 : */
7 :
8 : /*
9 : ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
10 : */
11 :
12 : #ifndef SkBlend_opts_DEFINED
13 : #define SkBlend_opts_DEFINED
14 :
15 : #include "SkNx.h"
16 : #include "SkPM4fPriv.h"
17 :
18 : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
19 : #include <immintrin.h>
20 : #endif
21 :
22 : namespace SK_OPTS_NS {
23 :
24 0 : static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
25 0 : if (src >= 0xFF000000) {
26 0 : *dst = src;
27 0 : return;
28 : }
29 0 : auto d = Sk4f_fromS32(*dst),
30 0 : s = Sk4f_fromS32( src);
31 0 : *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
32 : }
33 :
34 0 : static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
35 0 : srcover_srgb_srgb_1(dst++, *src++);
36 0 : srcover_srgb_srgb_1(dst++, *src++);
37 0 : srcover_srgb_srgb_1(dst++, *src++);
38 0 : srcover_srgb_srgb_1(dst , *src );
39 0 : }
40 :
41 : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
42 :
static inline __m128i load(const uint32_t* px) {
    // Unaligned 16-byte load of four packed 32-bit pixels.
    const __m128i* v = reinterpret_cast<const __m128i*>(px);
    return _mm_loadu_si128(v);
}
46 :
static inline void store(uint32_t* px, __m128i v) {
    // Unaligned 16-byte store of four packed 32-bit pixels.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(px), v);
}
50 :
51 : #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
52 :
static void srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
    // SSE4.1 srcover of sRGB source over sRGB destination. The nsrc-pixel
    // source buffer is tiled repeatedly across the ndst-pixel destination
    // (src resets to srcStart on each pass of the outer loop). 4-pixel
    // groups are classified by their alpha bytes with ptest intrinsics so
    // the all-opaque and all-transparent cases can skip the blend math.
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint32_t* src = srcStart;
        // end marks the last whole group of four; the 0-3 remainder is
        // handled by the scalar tail below.
        const uint32_t* end = dst + (count & ~3);
        // src and dst advance in lockstep inside this pass, so the source
        // pixel matching any dst position is always dst + delta.
        ptrdiff_t delta = src - dst;

        while (dst < end) {
            __m128i pixels = load(src);
            // testc: (~pixels & alphaMask) == 0, i.e. every alpha byte is
            // 0xFF -- all four pixels opaque, copy them straight through.
            if (_mm_testc_si128(pixels, alphaMask)) {
                uint32_t* start = dst;
                do {
                    store(dst, pixels);
                    dst += 4;
                } while (dst < end
                    && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
                src += dst - start;
            // testz: (pixels & alphaMask) == 0 -- all four pixels fully
            // transparent, so the destination is left untouched.
            } else if (_mm_testz_si128(pixels, alphaMask)) {
                do {
                    dst += 4;
                    src += 4;
                } while (dst < end
                    && _mm_testz_si128(pixels = load(src), alphaMask));
            } else {
                // Mixed alphas (testnzc): run the full per-pixel blend on
                // the group of four.
                uint32_t* start = dst;
                do {
                    srcover_srgb_srgb_4(dst, dst + delta);
                    dst += 4;
                } while (dst < end
                    && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
                src += dst - start;
            }
        }

        // Scalar tail: blend the final count % 4 pixels one at a time.
        count = count & 3;
        while (count-- > 0) {
            srcover_srgb_srgb_1(dst++, *src++);
        }
    }
}
96 : #else
97 : // SSE2 versions
98 :
99 : // Note: In the next three comparisons a group of 4 pixels is converted to a group of
100 : // "signed" pixels because the sse2 does not have an unsigned comparison.
101 : // Make it so that we can use the signed comparison operators by biasing
102 : // 0x00xxxxxx to 0x80xxxxxxx which is the smallest values and biasing 0xffxxxxxx to
103 : // 0x7fxxxxxx which is the largest set of values.
static inline bool check_opaque_alphas(__m128i pixels) {
    // Flip the sign bit of each 32-bit pixel so that the unsigned ordering
    // of the alpha bytes survives SSE2's signed comparison.
    const __m128i biased = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
    // After biasing, an opaque pixel (alpha 0xFF) sits at 0x7Fxxxxxx, which
    // is never less than 0x7F000000; any smaller alpha compares less.
    const __m128i notOpaque =
        _mm_cmplt_epi32(biased, _mm_set1_epi32(0x7F000000));
    // All four pixels are opaque exactly when no lane set the mask.
    return _mm_movemask_epi8(notOpaque) == 0;
}
111 :
static inline bool check_transparent_alphas(__m128i pixels) {
    // Flip the sign bit of each 32-bit pixel so that the unsigned ordering
    // of the alpha bytes survives SSE2's signed comparison.
    const __m128i biased = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
    // After biasing, a transparent pixel (alpha 0x00) sits in the range
    // 0x80000000..0x80FFFFFF; anything above 0x80FFFFFF has alpha > 0.
    const __m128i notTransparent =
        _mm_cmpgt_epi32(biased, _mm_set1_epi32(0x80FFFFFF));
    // All four pixels are transparent exactly when no lane set the mask.
    return _mm_movemask_epi8(notTransparent) == 0;
}
119 :
static inline bool check_partial_alphas(__m128i pixels) {
    // Bias as in the other checks so signed compares honor unsigned order.
    const __m128i biased = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
    const __m128i notOpaque =
        _mm_cmplt_epi32(biased, _mm_set1_epi32(0x7F000000));
    const __m128i notTransparent =
        _mm_cmpgt_epi32(biased, _mm_set1_epi32(0x80FFFFFF));
    // A lane holds a partial alpha when it is simultaneously not-opaque and
    // not-transparent, i.e. the two lane masks agree; their xor is zero for
    // every lane exactly when all four pixels are partial.
    const __m128i disagree = _mm_xor_si128(notOpaque, notTransparent);
    return _mm_movemask_epi8(disagree) == 0;
}
127 :
128 0 : static void srcover_srgb_srgb(
129 : uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
130 0 : while (ndst > 0) {
131 0 : int count = SkTMin(ndst, nsrc);
132 0 : ndst -= count;
133 0 : const uint32_t* src = srcStart;
134 0 : const uint32_t* end = dst + (count & ~3);
135 0 : const ptrdiff_t delta = src - dst;
136 :
137 0 : __m128i pixels = load(src);
138 0 : do {
139 0 : if (check_opaque_alphas(pixels)) {
140 0 : uint32_t* start = dst;
141 0 : do {
142 0 : store(dst, pixels);
143 0 : dst += 4;
144 0 : } while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
145 0 : src += dst - start;
146 0 : } else if (check_transparent_alphas(pixels)) {
147 0 : const uint32_t* start = dst;
148 0 : do {
149 0 : dst += 4;
150 0 : } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
151 0 : src += dst - start;
152 : } else {
153 0 : const uint32_t* start = dst;
154 0 : do {
155 0 : srcover_srgb_srgb_4(dst, dst + delta);
156 0 : dst += 4;
157 0 : } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
158 0 : src += dst - start;
159 : }
160 0 : } while (dst < end);
161 :
162 0 : count = count & 3;
163 0 : while (count-- > 0) {
164 0 : srcover_srgb_srgb_1(dst++, *src++);
165 : }
166 : }
167 0 : }
168 : #endif
169 : #else
170 :
171 : static void srcover_srgb_srgb(
172 : uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
173 : while (ndst > 0) {
174 : int n = SkTMin(ndst, nsrc);
175 :
176 : for (int i = 0; i < n; i++) {
177 : srcover_srgb_srgb_1(dst++, src[i]);
178 : }
179 : ndst -= n;
180 : }
181 : }
182 :
183 : #endif
184 :
185 : } // namespace SK_OPTS_NS
186 :
187 : #endif//SkBlend_opts_DEFINED
|