/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include "SkJumper_misc.h"     // SI, unaligned_load(), bit_cast(), C(), operator"" _i and _f.
#include "SkJumper_vectors.h"  // F, I32, U32, U16, U8, cast(), expand()

// Our fundamental vector depth is our pixel stride.
static const size_t kStride = sizeof(F) / sizeof(float);

// A reminder:
// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).

// Another reminder:
// You can't generally use constants in this file except via C() or operator"" _i/_f.
// Not all constants can be generated using C() or _i/_f. Stages read the rest from this struct.
using K = const SkJumper_constants;


// Let's start first with the mechanisms we use to build Stages.

// Our program is an array of void*, either
// - 1 void* per stage with no context pointer, the next stage;
// - 2 void* per stage with a context pointer, first the context pointer, then the next stage.

// load_and_inc() steps the program forward by 1 void*, returning that pointer.
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // If program is in %rsi (we try to make this likely) then this is a single instruction.
    void* rax;
    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
#else
    // On ARM *program++ compiles into pretty ideal code without any handholding.
    return *program++;
#endif
}
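
// For example (purely illustrative, with hypothetical names), a program whose first two
// stages carry context pointers and whose last does not might be laid out as
//
//     void* program[] = { stage0_fn, stage0_ctx, stage1_fn, stage1_ctx, stage2_fn };
//
// Successive load_and_inc() calls simply hand these pointers back left to right.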

// LazyCtx doesn't do anything unless you call operator T*(), encapsulating the logic
// from above that stages without a context pointer are represented by just 1 void*.
struct LazyCtx {
    void* ptr;
    void**& program;

    explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}

    template <typename T>
    operator T*() {
        if (!ptr) { ptr = load_and_inc(program); }
        return (T*)ptr;
    }
};
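
// For example, inside a stage written with the STAGE macro below, `ctx` is a LazyCtx:
// writing `auto c = (const float*)ctx;` is what triggers operator T*() and consumes the
// context slot, while a stage that never touches ctx leaves program pointing straight at
// the next stage's function pointer.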

// A little wrapper macro to name Stages differently depending on the instruction set.
// That lets us link together several options.
#if !defined(JUMPER)
    #define WRAP(name) sk_##name
#elif defined(__aarch64__)
    #define WRAP(name) sk_##name##_aarch64
#elif defined(__arm__)
    #define WRAP(name) sk_##name##_vfp4
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw
#elif defined(__AVX__)
    #define WRAP(name) sk_##name##_avx
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2
#endif
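
// For example, when this file is compiled by Clang with AVX2 enabled (__AVX2__ defined),
// WRAP(load_8888) expands to sk_load_8888_hsw, while the plain C++ build gets sk_load_8888.
// That is what lets several instruction-set-specific copies of every stage be linked into
// one binary and selected between at runtime.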

// We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX.

#if defined(JUMPER) && defined(__AVX__)
// There's a big cost to switch between SSE and AVX, so we do a little
// extra work to handle even the jagged <kStride tail in AVX mode.
// Compared to normal stages, we maintain an extra tail register:
//    tail == 0 ~~> work on a full kStride pixels
//    tail != 0 ~~> work on only the first tail pixels
// tail is always < kStride.
using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);

#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
    F v{};
    auto start = (Stage*)load_and_inc(program);
    while (x + kStride <= limit) {
        start(x,program,k,0,    v,v,v,v, v,v,v,v);
        x += kStride;
    }
    if (size_t tail = limit - x) {
        start(x,program,k,tail, v,v,v,v, v,v,v,v);
    }
    return limit;
}
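
// A quick worked example of the tail handling above (illustrative numbers only): with
// kStride == 8 (AVX) and limit == 21, the while loop runs the pipeline at x = 0 and x = 8
// with tail == 0 (full 8-pixel strides), stops at x == 16 because 16 + 8 > 21, and then
// makes one final call with tail == 21 - 16 == 5 so stages touch only the first 5 lanes.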

#define STAGE(name)                                                          \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
    extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,  \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) { \
        LazyCtx ctx(program);                                                \
        name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);                        \
        auto next = (Stage*)load_and_inc(program);                           \
        next(x,program,k,tail, r,g,b,a, dr,dg,db,da);                        \
    }                                                                        \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

#else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
// pipelines cheaply, which frees us to always assume tail==0.

// Stages tail call between each other by following program as described above.
// x is our induction variable, stepping forward kStride at a time.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);

// On Windows, start_pipeline() has a normal Windows ABI, and then the rest is System V.
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
    F v{};
    auto start = (Stage*)load_and_inc(program);
    while (x + kStride <= limit) {
        start(x,program,k, v,v,v,v, v,v,v,v);
        x += kStride;
    }
    return x;
}
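
// Note that in this mode start_pipeline() returns x, the first pixel it did not process.
// Any remaining tail of fewer than kStride pixels is presumably handled by the caller
// re-running those last pixels through a narrower (ultimately scalar, kStride == 1)
// variant of the same pipeline, which is why stages here may assume tail == 0.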

// This STAGE macro makes it easier to write stages, handling all the Stage chaining for you.
#define STAGE(name)                                                          \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
    extern "C" void WRAP(name)(size_t x, void** program, K* k,               \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) { \
        LazyCtx ctx(program);                                                \
        name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);                           \
        auto next = (Stage*)load_and_inc(program);                           \
        next(x,program,k, r,g,b,a, dr,dg,db,da);                             \
    }                                                                        \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif

// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
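
// Putting the pieces together, a caller might run a tiny pipeline something like the
// sketch below.  The context types and values are illustrative only; the real program
// is assembled elsewhere in SkJumper:
//
//     int   y       = 0;
//     float rgba[4] = {1,0,0,1};
//     void* dst;                      // set to point at the destination row
//
//     void* program[] = {
//         (void*)WRAP(seed_shader),    (void*)&y,
//         (void*)WRAP(constant_color), (void*)rgba,
//         (void*)WRAP(store_8888),     (void*)&dst,
//         (void*)WRAP(just_return),
//     };
//     WRAP(start_pipeline)(0, program, &constants, width);
//
// Each stage calls load_and_inc() once for its context (if any) and once for the next
// stage, so control tail-calls straight down this array until just_return() unwinds.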


// We could start defining normal Stages now. But first, some helper functions.

// These load() and store() methods are tail-aware,
// but focus mainly on keeping the at-stride tail==0 case fast.

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        switch (tail-1) {
            case 6: v[6] = src[6];
            case 5: v[5] = src[5];
            case 4: v[4] = src[4];
            case 3: v[3] = src[3];
            case 2: v[2] = src[2];
            case 1: v[1] = src[1];
            case 0: v[0] = src[0];
        }
        return v;
    }
#endif
    return unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        switch (tail-1) {
            case 6: dst[6] = v[6];
            case 5: dst[5] = v[5];
            case 4: dst[4] = v[4];
            case 3: dst[3] = v[3];
            case 2: dst[2] = v[2];
            case 1: dst[1] = v[1];
            case 0: dst[0] = v[0];
        }
        return;
    }
#endif
    memcpy(dst, &v, sizeof(v));
}
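
// For instance (illustrative values), with kStride == 8 a call like load<U32>(src, 3)
// copies src[0..2] into lanes 0-2 and leaves lanes 3-7 zeroed, while load<U32>(src, 0)
// means "a full stride" and does a plain 8-lane unaligned load.  The deliberate
// fall-through in the switches above is what copies exactly `tail` elements.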

// This doesn't look strictly necessary, but without it Clang would generate load() using
// compiler-generated constants that we can't support.  This version doesn't need constants.
#if defined(JUMPER) && defined(__AVX__)
template <>
inline U8 load(const uint8_t* src, size_t tail) {
    if (__builtin_expect(tail, 0)) {
        uint64_t v = 0;
        size_t shift = 0;
        #pragma nounroll
        while (tail --> 0) {
            v |= (uint64_t)*src++ << shift;
            shift += 8;
        }
        return unaligned_load<U8>(&v);
    }
    return unaligned_load<U8>(src);
}
#endif

// AVX2 adds some mask loads and stores that make for shorter, faster code.
#if defined(JUMPER) && defined(__AVX2__)
SI U32 mask(size_t tail) {
    // We go a little out of our way to avoid needing large constant values here.

    // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
    // Start fully on, then shift away lanes from the top until we've got our mask.
    uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);

    // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
    return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
}
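
// Worked example (illustrative): with kStride == 8 and tail == 3, the shift above is
// 8*(8-3) == 40 bits, so mask == 0x0000000000ffffff -- bytes 0, 1 and 2 are 0xff and the
// rest are 0x00.  After sign-extension the returned U32 is
// { 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0, 0 },
// exactly the lanes _mm256_maskload_epi32 / _mm256_maskstore_epi32 should touch.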

template <>
inline U32 load(const uint32_t* src, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        return _mm256_maskload_epi32((const int*)src, mask(tail));
    }
    return unaligned_load<U32>(src);
}

template <>
inline void store(uint32_t* dst, U32 v, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
    }
    memcpy(dst, &v, sizeof(v));
}
#endif

SI F from_byte(U8 b) {
    return cast(expand(b)) * C(1/255.0f);
}
SI void from_565(U16 _565, F* r, F* g, F* b) {
    U32 wide = expand(_565);
    *r = cast(wide & C(31<<11)) * C(1.0f / (31<<11));
    *g = cast(wide & C(63<< 5)) * C(1.0f / (63<< 5));
    *b = cast(wide & C(31<< 0)) * C(1.0f / (31<< 0));
}
SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
    U32 wide = expand(_4444);
    *r = cast(wide & C(15<<12)) * C(1.0f / (15<<12));
    *g = cast(wide & C(15<< 8)) * C(1.0f / (15<< 8));
    *b = cast(wide & C(15<< 4)) * C(1.0f / (15<< 4));
    *a = cast(wide & C(15<< 0)) * C(1.0f / (15<< 0));
}
SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
    *r = cast((_8888      ) & 0xff_i) * C(1/255.0f);
    *g = cast((_8888 >>  8) & 0xff_i) * C(1/255.0f);
    *b = cast((_8888 >> 16) & 0xff_i) * C(1/255.0f);
    *a = cast((_8888 >> 24)         ) * C(1/255.0f);
}
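
// A concrete example of these unpackers (illustrative): given a packed 8888 value of
// 0x80ff4020 -- R in the low byte, A in the high byte -- from_8888() produces
// r = 0x20/255 ~= 0.125, g = 0x40/255 ~= 0.251, b = 0xff/255 = 1.0 and a = 0x80/255 ~= 0.502.
// from_565() and from_4444() work the same way, just with different field widths and
// per-field scales (e.g. 1/31 and 1/63 for 565).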

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

// Now finally, normal Stages!

STAGE(seed_shader) {
    auto y = *(const int*)ctx;

    // It's important for speed to explicitly cast(x) and cast(y),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
    r = cast(x) + 0.5_f + unaligned_load<F>(k->iota);
    g = cast(y) + 0.5_f;
    b = 1.0_f;
    a = 0;
    dr = dg = db = da = 0;
}
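
// For example (illustrative, kStride == 4): at x == 8 with k->iota == {0,1,2,3},
// seed_shader leaves r == {8.5, 9.5, 10.5, 11.5} and g == y + 0.5 splatted across all
// lanes, i.e. the pixel-center coordinates of the four pixels this loop iteration covers.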

STAGE(constant_color) {
    auto rgba = (const float*)ctx;
    r = rgba[0];
    g = rgba[1];
    b = rgba[2];
    a = rgba[3];
}

// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)
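
// So, for example, BLEND_MODE(srcover) below expands (roughly) to
//
//     SI F srcover_channel(F s, F d, F sa, F da);
//     STAGE(srcover) {
//         r = srcover_channel(r,dr,a,da);
//         ...
//     }
//     SI F srcover_channel(F s, F d, F sa, F da) { return mad(d, inv(sa), s); }
//
// i.e. each BLEND_MODE(...) line defines both the stage and its per-channel formula.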

SI F inv(F x) { return 1.0_f - x; }
SI F two(F x) { return x + x; }

BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)  { return mad(s, inv(da), d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_)    { return s + d; }
BLEND_MODE(screen)   { return s + d - s*d; }
BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }
#undef BLEND_MODE

// Most other blend modes apply the same logic to colors, and srcover to alpha.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = mad(da, inv(a), a);                \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

BLEND_MODE(darken)     { return s + d -     max(s*da, d*sa); }
BLEND_MODE(lighten)    { return s + d -     min(s*da, d*sa); }
BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
BLEND_MODE(exclusion)  { return s + d - two(s*d); }

BLEND_MODE(colorburn) {
    return if_then_else(d == da, d + s*inv(da),
           if_then_else(s ==  0, s + d*inv(sa),
                        sa*(da - min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(colordodge) {
    return if_then_else(d ==  0, d + s*inv(da),
           if_then_else(s == sa, s + d*inv(sa),
                        sa*min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(hardlight) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
}
BLEND_MODE(overlay) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
}

BLEND_MODE(softlight) {
    F m  = if_then_else(da > 0, d / da, 0),
      s2 = two(s),
      m4 = two(two(m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    F darkSrc = d*(sa + (s2 - sa)*(1.0_f - m)),     // Used in case 1.
      darkDst = (m4*m4 + m4)*(m - 1.0_f) + 7.0_f*m, // Used in case 2.
      liteDst = rcp(rsqrt(m)) - m,                  // Used in case 3.
      liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst);  // 2 or 3?
    return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc);              // 1 or (2 or 3)?
}
#undef BLEND_MODE

STAGE(clamp_0) {
    r = max(r, 0);
    g = max(g, 0);
    b = max(b, 0);
    a = max(a, 0);
}

STAGE(clamp_1) {
    r = min(r, 1.0_f);
    g = min(g, 1.0_f);
    b = min(b, 1.0_f);
    a = min(a, 1.0_f);
}

STAGE(clamp_a) {
    a = min(a, 1.0_f);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}
STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}
STAGE(unpremul) {
    auto scale = if_then_else(a == 0, 0, 1.0_f / a);
    r = r * scale;
    g = g * scale;
    b = b * scale;
}

STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * C(1/12.92f);
        auto hi = mad(s*s, mad(s, 0.3000_f, 0.6975_f), 0.0025_f);
        return if_then_else(s < 0.055_f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
STAGE(to_srgb) {
    auto fn = [&](F l) {
        F sqrt = rcp  (rsqrt(l)),
          ftrt = rsqrt(rsqrt(l));
        auto lo = l * 12.46_f;
        auto hi = min(1.0_f, mad(0.411192_f, ftrt,
                                 mad(0.689206_f, sqrt, -0.0988_f)));
        return if_then_else(l < 0.0043_f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(from_2dot2) {
    auto fn = [](F x) {
        // x^(141/64) = x^(2.20312) is a great approximation of the true value, x^(2.2).
        // (note: x^(35/16) = x^(2.1875) is an okay one as well and would be quicker)
        F x16 = rsqrt(rsqrt(rsqrt(rsqrt(x)))),  // x^(1/16) = x^(4/64);
          x64 = rsqrt(rsqrt(x16));              // x^(1/64)

        // 141/64 = 128/64 + 12/64 + 1/64
        return max((x*x) * (x16*x16*x16) * x64, 0);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
STAGE(to_2dot2) {
    auto fn = [](F x) {
        // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
        F x2  = rsqrt(x),                       // x^(-1/2)
          x32 = rsqrt(rsqrt(rsqrt(rsqrt(x2)))), // x^(-1/32)
          x64 = rsqrt(x32);                     // x^(+1/64)

        // 29/64 = 32/64 - 2/64 - 1/64
        return max(rcp(x2) * x32 * rcp(x64), 0);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(rgb_to_hsl) {
    F mx = max(max(r,g), b),
      mn = min(min(r,g), b),
      d = mx - mn,
      d_rcp = 1.0_f / d;

    F h = C(1/6.0f) *
          if_then_else(mx == mn, 0,
          if_then_else(mx ==  r, (g-b)*d_rcp + if_then_else(g < b, 6.0_f, 0),
          if_then_else(mx ==  g, (b-r)*d_rcp + 2.0_f,
                                 (r-g)*d_rcp + 4.0_f)));

    F l = (mx + mn) * 0.5_f;
    F s = if_then_else(mx == mn, 0,
                       d / if_then_else(l > 0.5_f, 2.0_f-mx-mn, mx+mn));

    r = h;
    g = s;
    b = l;
}
STAGE(hsl_to_rgb) {
    F h = r,
      s = g,
      l = b;

    F q = if_then_else(l < 0.5_f, l*(1.0_f + s), l + s - l*s),
      p = 2.0_f*l - q;

    auto hue_to_rgb = [&](F t) {
        F t2 = if_then_else(t < 0.0_f, t + 1.0_f,
               if_then_else(t > 1.0_f, t - 1.0_f,
                                       t));

        return if_then_else(t2 < C(1/6.0f), p + (q-p)*6.0_f*t,
               if_then_else(t2 < C(3/6.0f), q,
               if_then_else(t2 < C(4/6.0f), p + (q-p)*6.0_f*(C(4/6.0f) - t2),
                                            p)));
    };

    r = if_then_else(s == 0, l, hue_to_rgb(h + C(1/3.0f)));
    g = if_then_else(s == 0, l, hue_to_rgb(h            ));
    b = if_then_else(s == 0, l, hue_to_rgb(h - C(1/3.0f)));
}

STAGE(scale_1_float) {
    auto c = *(const float*)ctx;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = from_byte(scales);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

SI F lerp(F from, F to, F t) {
    return mad(to-from, t, from);
}

STAGE(lerp_1_float) {
    auto c = *(const float*)ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = from_byte(scales);

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    F cr,cg,cb;
    from_565(load<U16>(ptr, tail), &cr, &cg, &cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = 1.0_f;
}

STAGE(load_tables) {
    struct Ctx {
        const uint32_t* src;
        const float *r, *g, *b;
    };
    auto c = (const Ctx*)ctx;

    auto px = load<U32>(c->src + x, tail);
    r = gather(c->r, (px      ) & 0xff_i);
    g = gather(c->g, (px >>  8) & 0xff_i);
    b = gather(c->b, (px >> 16) & 0xff_i);
    a = cast(        (px >> 24)) * C(1/255.0f);
}

STAGE(byte_tables) {
    struct Tables { const uint8_t *r, *g, *b, *a; };
    auto tables = (const Tables*)ctx;

    r = from_byte(gather(tables->r, round(r, 255.0_f)));
    g = from_byte(gather(tables->g, round(g, 255.0_f)));
    b = from_byte(gather(tables->b, round(b, 255.0_f)));
    a = from_byte(gather(tables->a, round(a, 255.0_f)));
}

STAGE(byte_tables_rgb) {
    struct Tables { const uint8_t *r, *g, *b; int n; };
    auto tables = (const Tables*)ctx;

    F scale = tables->n - 1;
    r = from_byte(gather(tables->r, round(r, scale)));
    g = from_byte(gather(tables->g, round(g, scale)));
    b = from_byte(gather(tables->b, round(b, scale)));
}

STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = 0.0f;
    a = from_byte(load<U8>(ptr, tail));
}
STAGE(gather_a8) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    r = g = b = 0.0f;
    a = from_byte(gather(ptr, ix));
}
STAGE(store_a8) {
    auto ptr = *(uint8_t**)ctx + x;

    U8 packed = pack(pack(round(a, 255.0_f)));
    store(ptr, packed, tail);
}

STAGE(load_g8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = from_byte(load<U8>(ptr, tail));
    a = 1.0_f;
}
STAGE(gather_g8) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    r = g = b = from_byte(gather(ptr, ix));
    a = 1.0_f;
}

STAGE(gather_i8) {
    auto c = (const SkJumper_GatherCtx*)ctx;
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    ix = expand(gather(ptr, ix));
    from_8888(gather(c->ctable, ix), &r,&g,&b,&a);
}

STAGE(load_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    from_565(load<U16>(ptr, tail), &r,&g,&b);
    a = 1.0_f;
}
STAGE(gather_565) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_565(gather(ptr, ix), &r,&g,&b);
    a = 1.0_f;
}
STAGE(store_565) {
    auto ptr = *(uint16_t**)ctx + x;

    U16 px = pack( round(r, 31.0_f) << 11
                 | round(g, 63.0_f) <<  5
                 | round(b, 31.0_f)       );
    store(ptr, px, tail);
}

STAGE(load_4444) {
    auto ptr = *(const uint16_t**)ctx + x;
    from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
}
STAGE(gather_4444) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_4444(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_4444) {
    auto ptr = *(uint16_t**)ctx + x;
    U16 px = pack( round(r, 15.0_f) << 12
                 | round(g, 15.0_f) <<  8
                 | round(b, 15.0_f) <<  4
                 | round(a, 15.0_f)       );
    store(ptr, px, tail);
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;
    from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(gather_8888) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_8888(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    U32 px = round(r, 255.0_f)
           | round(g, 255.0_f) <<  8
           | round(b, 255.0_f) << 16
           | round(a, 255.0_f) << 24;
    store(ptr, px, tail);
}

STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;

    U16 R,G,B,A;
    load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
    r = from_half(R);
    g = from_half(G);
    b = from_half(B);
    a = from_half(A);
}
STAGE(gather_f16) {
    const uint64_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    auto px = gather(ptr, ix);

    U16 R,G,B,A;
    load4((const uint16_t*)&px,0, &R,&G,&B,&A);
    r = from_half(R);
    g = from_half(G);
    b = from_half(B);
    a = from_half(A);
}
STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;
    store4((uint16_t*)ptr,tail, to_half(r)
                              , to_half(g)
                              , to_half(b)
                              , to_half(a));
}

STAGE(load_u16_be) {
    auto ptr = *(const uint64_t**)ctx + x;

    U16 R,G,B,A;
    load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);

    r = C(1/65535.0f) * cast(expand(bswap(R)));
    g = C(1/65535.0f) * cast(expand(bswap(G)));
    b = C(1/65535.0f) * cast(expand(bswap(B)));
    a = C(1/65535.0f) * cast(expand(bswap(A)));
}
STAGE(store_u16_be) {
    auto ptr = *(uint64_t**)ctx + x;

    U16 R = bswap(pack(round(r, 65535.0_f))),
        G = bswap(pack(round(g, 65535.0_f))),
        B = bswap(pack(round(b, 65535.0_f))),
        A = bswap(pack(round(a, 65535.0_f)));

    store4((uint16_t*)ptr,tail, R,G,B,A);
}

STAGE(load_f32) {
    auto ptr = *(const float**)ctx + 4*x;
    load4(ptr,tail, &r,&g,&b,&a);
}
STAGE(store_f32) {
    auto ptr = *(float**)ctx + 4*x;
    store4(ptr,tail, r,g,b,a);
}

SI F ulp_before(F v) {
    return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff));
}
SI F clamp(F v, float limit) {
    v = max(0, v);
    return min(v, ulp_before(limit));
}
SI F repeat(F v, float limit) {
    v = v - floor_(v/limit)*limit;
    return min(v, ulp_before(limit));
}
SI F mirror(F v, float limit) {
    v = abs_( (v-limit) - (limit+limit)*floor_((v-limit)/(limit+limit)) - limit );
    return min(v, ulp_before(limit));
}
STAGE(clamp_x)  { r = clamp (r, *(const float*)ctx); }
STAGE(clamp_y)  { g = clamp (g, *(const float*)ctx); }
STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); }
STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); }
STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); }
STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); }
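
// A quick sanity check of the tiling math (illustrative numbers): with limit == 4,
// repeat(5.25, 4) == 5.25 - floor(5.25/4)*4 == 1.25, and mirror(5.25, 4) == 2.75, i.e.
// the coordinate bounces back from the edge.  ulp_before(limit) keeps coordinates
// strictly below limit so trunc_() never produces an out-of-bounds pixel index.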

STAGE(luminance_to_alpha) {
    a = r*0.2126_f + g*0.7152_f + b*0.0722_f;
    r = g = b = 0;
}

STAGE(matrix_2x3) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[2], m[4])),
         G = mad(r,m[1], mad(g,m[3], m[5]));
    r = R;
    g = G;
}
STAGE(matrix_3x4) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
         G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
         B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
    r = R;
    g = G;
    b = B;
}
STAGE(matrix_4x5) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[4], mad(b,m[ 8], mad(a,m[12], m[16])))),
         G = mad(r,m[1], mad(g,m[5], mad(b,m[ 9], mad(a,m[13], m[17])))),
         B = mad(r,m[2], mad(g,m[6], mad(b,m[10], mad(a,m[14], m[18])))),
         A = mad(r,m[3], mad(g,m[7], mad(b,m[11], mad(a,m[15], m[19]))));
    r = R;
    g = G;
    b = B;
    a = A;
}
STAGE(matrix_perspective) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[1], m[2])),
         G = mad(r,m[3], mad(g,m[4], m[5])),
         Z = mad(r,m[6], mad(g,m[7], m[8]));
    r = R * rcp(Z);
    g = G * rcp(Z);
}

STAGE(linear_gradient) {
    struct Stop { float pos; float f[4], b[4]; };
    struct Ctx { size_t n; Stop *stops; float start[4]; };

    auto c = (const Ctx*)ctx;
    F fr = 0, fg = 0, fb = 0, fa = 0;
    F br = c->start[0],
      bg = c->start[1],
      bb = c->start[2],
      ba = c->start[3];
    auto t = r;
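    // Each stop appears to carry a per-channel scale f[] and bias b[] for the span that
    // begins at its pos; the scan below keeps the f/b of the last stop whose pos <= t
    // (falling back to c->start before the first stop), so the color computed afterwards
    // is the piecewise-linear value f*t + b for whichever span t landed in.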
    for (size_t i = 0; i < c->n; i++) {
        fr = if_then_else(t < c->stops[i].pos, fr, c->stops[i].f[0]);
        fg = if_then_else(t < c->stops[i].pos, fg, c->stops[i].f[1]);
        fb = if_then_else(t < c->stops[i].pos, fb, c->stops[i].f[2]);
        fa = if_then_else(t < c->stops[i].pos, fa, c->stops[i].f[3]);
        br = if_then_else(t < c->stops[i].pos, br, c->stops[i].b[0]);
        bg = if_then_else(t < c->stops[i].pos, bg, c->stops[i].b[1]);
        bb = if_then_else(t < c->stops[i].pos, bb, c->stops[i].b[2]);
        ba = if_then_else(t < c->stops[i].pos, ba, c->stops[i].b[3]);
    }

    r = mad(t, fr, br);
    g = mad(t, fg, bg);
    b = mad(t, fb, bb);
    a = mad(t, fa, ba);
}

STAGE(linear_gradient_2stops) {
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = r;
    r = mad(t, c->f[0], c->b[0]);
    g = mad(t, c->f[1], c->b[1]);
    b = mad(t, c->f[2], c->b[2]);
    a = mad(t, c->f[3], c->b[3]);
}

STAGE(save_xy) {
    auto c = (SkJumper_SamplerCtx*)ctx;

    // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
    // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
    // surrounding (x,y) at (0.5,0.5) off-center.
    auto fract = [](F v) { return v - floor_(v); };
    F fx = fract(r + 0.5_f),
      fy = fract(g + 0.5_f);

    // Samplers will need to load x and fx, or y and fy.
    memcpy(c->x,  &r,  sizeof(F));
    memcpy(c->y,  &g,  sizeof(F));
    memcpy(c->fx, &fx, sizeof(F));
    memcpy(c->fy, &fy, sizeof(F));
}
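
// For example (illustrative): a sample coordinate r == 3.2 gives fx == fract(3.7) == 0.7,
// meaning the sample point sits 0.7 of the way between the centers of pixels 2 and 3 in x;
// fx is what the bilinear/bicubic weights below are computed from.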

STAGE(accumulate) {
    auto c = (const SkJumper_SamplerCtx*)ctx;

    // Bilinear and bicubic filters are both separable, so we produce independent contributions
    // from x and y, multiplying them together here to get each pixel's total scale factor.
    auto scale = unaligned_load<F>(c->scalex)
               * unaligned_load<F>(c->scaley);
    dr = mad(scale, r, dr);
    dg = mad(scale, g, dg);
    db = mad(scale, b, db);
    da = mad(scale, a, da);
}

// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
// are combined in direct proportion to their area overlapping that logical query pixel.
// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
// The y-axis is symmetric.

template <int kScale>
SI void bilinear_x(SkJumper_SamplerCtx* ctx, F* x) {
    *x = unaligned_load<F>(ctx->x) + C(kScale * 0.5f);
    F fx = unaligned_load<F>(ctx->fx);

    F scalex;
    if (kScale == -1) { scalex = 1.0_f - fx; }
    if (kScale == +1) { scalex =         fx; }
    memcpy(ctx->scalex, &scalex, sizeof(F));
}
template <int kScale>
SI void bilinear_y(SkJumper_SamplerCtx* ctx, F* y) {
    *y = unaligned_load<F>(ctx->y) + C(kScale * 0.5f);
    F fy = unaligned_load<F>(ctx->fy);

    F scaley;
    if (kScale == -1) { scaley = 1.0_f - fy; }
    if (kScale == +1) { scaley =         fy; }
    memcpy(ctx->scaley, &scaley, sizeof(F));
}

STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }
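
// So a bilinear sample is presumably built as a little sub-pipeline: save_xy, then for
// each of the four neighbors a pair like bilinear_nx + bilinear_ny, a load/gather stage,
// and accumulate.  With fx == 0.7 and fy == 0.25 (illustrative), the four weights are
// 0.3*0.75, 0.7*0.75, 0.3*0.25 and 0.7*0.25, which sum to 1.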


// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
//
// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
// See GrCubicEffect for details of this particular filter.

SI F bicubic_near(F t) {
    // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
    return mad(t, mad(t, mad(C(-21/18.0f), t, C(27/18.0f)), C(9/18.0f)), C(1/18.0f));
}
SI F bicubic_far(F t) {
    // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
    return (t*t)*mad(C(7/18.0f), t, C(-6/18.0f));
}

template <int kScale>
SI void bicubic_x(SkJumper_SamplerCtx* ctx, F* x) {
    *x = unaligned_load<F>(ctx->x) + C(kScale * 0.5f);
    F fx = unaligned_load<F>(ctx->fx);

    F scalex;
    if (kScale == -3) { scalex = bicubic_far (1.0_f - fx); }
    if (kScale == -1) { scalex = bicubic_near(1.0_f - fx); }
    if (kScale == +1) { scalex = bicubic_near(        fx); }
    if (kScale == +3) { scalex = bicubic_far (        fx); }
    memcpy(ctx->scalex, &scalex, sizeof(F));
}
template <int kScale>
SI void bicubic_y(SkJumper_SamplerCtx* ctx, F* y) {
    *y = unaligned_load<F>(ctx->y) + C(kScale * 0.5f);
    F fy = unaligned_load<F>(ctx->fy);

    F scaley;
    if (kScale == -3) { scaley = bicubic_far (1.0_f - fy); }
    if (kScale == -1) { scaley = bicubic_near(1.0_f - fy); }
    if (kScale == +1) { scaley = bicubic_near(        fy); }
    if (kScale == +3) { scaley = bicubic_far (        fy); }
    memcpy(ctx->scaley, &scaley, sizeof(F));
}

STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }

STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }