/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include "SkJumper_misc.h"     // SI, unaligned_load(), bit_cast(), C(), operator"" _i and _f.
#include "SkJumper_vectors.h"  // F, I32, U32, U16, U8, cast(), expand()

// Our fundamental vector depth is our pixel stride.
static const size_t kStride = sizeof(F) / sizeof(float);

// A reminder:
// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).

// Another reminder:
// You can't generally use constants in this file except via C() or operator"" _i/_f.
// Not all constants can be generated using C() or _i/_f. Stages read the rest from this struct.
using K = const SkJumper_constants;


// Let's start first with the mechanisms we use to build Stages.

// Our program is an array of void*, either
// - 1 void* per stage with no context pointer, the next stage;
// - 2 void* per stage with a context pointer, first the context pointer, then the next stage.

// load_and_inc() steps the program forward by 1 void*, returning that pointer.
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // If program is in %rsi (we try to make this likely) then this is a single instruction.
    void* rax;
    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
#else
    // On ARM *program++ compiles into pretty ideal code without any handholding.
    return *program++;
#endif
}
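
// For example (purely illustrative, with hypothetical names), a program whose first two
// stages carry context pointers and whose last does not might be laid out as
//
//     void* program[] = { stage0_fn, stage0_ctx, stage1_fn, stage1_ctx, stage2_fn };
//
// Successive load_and_inc() calls simply hand these pointers back left to right.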

// LazyCtx doesn't do anything unless you call operator T*(), encapsulating the logic
// from above that stages without a context pointer are represented by just 1 void*.
struct LazyCtx {
    void* ptr;
    void**& program;

    explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}

    template <typename T>
    operator T*() {
        if (!ptr) { ptr = load_and_inc(program); }
        return (T*)ptr;
    }
};
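
// For example, inside a stage written with the STAGE macro below, `ctx` is a LazyCtx:
// writing `auto c = (const float*)ctx;` is what triggers operator T*() and consumes the
// context slot, while a stage that never touches ctx leaves program pointing straight at
// the next stage's function pointer.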

// A little wrapper macro to name Stages differently depending on the instruction set.
// That lets us link together several options.
#if !defined(JUMPER)
    #define WRAP(name) sk_##name
#elif defined(__aarch64__)
    #define WRAP(name) sk_##name##_aarch64
#elif defined(__arm__)
    #define WRAP(name) sk_##name##_vfp4
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw
#elif defined(__AVX__)
    #define WRAP(name) sk_##name##_avx
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2
#endif
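
// For example, when this file is compiled by Clang with AVX2 enabled (__AVX2__ defined),
// WRAP(load_8888) expands to sk_load_8888_hsw, while the plain C++ build gets sk_load_8888.
// That is what lets several instruction-set-specific copies of every stage be linked into
// one binary and selected between at runtime.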

// We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX.

#if defined(JUMPER) && defined(__AVX__)
// There's a big cost to switch between SSE and AVX, so we do a little
// extra work to handle even the jagged <kStride tail in AVX mode.
// Compared to normal stages, we maintain an extra tail register:
//    tail == 0 ~~> work on a full kStride pixels
//    tail != 0 ~~> work on only the first tail pixels
// tail is always < kStride.
using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);

#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
    F v{};
    auto start = (Stage*)load_and_inc(program);
    while (x + kStride <= limit) {
        start(x,program,k,0,    v,v,v,v, v,v,v,v);
        x += kStride;
    }
    if (size_t tail = limit - x) {
        start(x,program,k,tail, v,v,v,v, v,v,v,v);
    }
    return limit;
}
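
// A quick worked example of the tail handling above (illustrative numbers only): with
// kStride == 8 (AVX) and limit == 21, the while loop runs the pipeline at x = 0 and x = 8
// with tail == 0 (full 8-pixel strides), stops at x == 16 because 16 + 8 > 21, and then
// makes one final call with tail == 21 - 16 == 5 so stages touch only the first 5 lanes.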

#define STAGE(name)                                                          \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
    extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,  \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) { \
        LazyCtx ctx(program);                                                \
        name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);                        \
        auto next = (Stage*)load_and_inc(program);                           \
        next(x,program,k,tail, r,g,b,a, dr,dg,db,da);                        \
    }                                                                        \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

#else
// Other instruction sets (SSE, NEON, portable) can fall back on narrower
// pipelines cheaply, which frees us to always assume tail==0.

// Stages tail call between each other by following program as described above.
// x is our induction variable, stepping forward kStride at a time.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);

// On Windows, start_pipeline() has a normal Windows ABI, and then the rest is System V.
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
    F v{};
    auto start = (Stage*)load_and_inc(program);
    while (x + kStride <= limit) {
        start(x,program,k, v,v,v,v, v,v,v,v);
        x += kStride;
    }
    return x;
}
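
// Note that in this mode start_pipeline() returns x, the first pixel it did not process.
// Any remaining tail of fewer than kStride pixels is presumably handled by the caller
// re-running those last pixels through a narrower (ultimately scalar, kStride == 1)
// variant of the same pipeline, which is why stages here may assume tail == 0.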

// This STAGE macro makes it easier to write stages, handling all the Stage chaining for you.
#define STAGE(name)                                                          \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
    extern "C" void WRAP(name)(size_t x, void** program, K* k,               \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) { \
        LazyCtx ctx(program);                                                \
        name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);                           \
        auto next = (Stage*)load_and_inc(program);                           \
        next(x,program,k, r,g,b,a, dr,dg,db,da);                             \
    }                                                                        \
    SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif

// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
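
// Putting the pieces together, a caller might run a tiny pipeline something like the
// sketch below.  The context types and values are illustrative only; the real program
// is assembled elsewhere in SkJumper:
//
//     int   y       = 0;
//     float rgba[4] = {1,0,0,1};
//     void* dst;                      // set to point at the destination row
//
//     void* program[] = {
//         (void*)WRAP(seed_shader),    (void*)&y,
//         (void*)WRAP(constant_color), (void*)rgba,
//         (void*)WRAP(store_8888),     (void*)&dst,
//         (void*)WRAP(just_return),
//     };
//     WRAP(start_pipeline)(0, program, &constants, width);
//
// Each stage calls load_and_inc() once for its context (if any) and once for the next
// stage, so control tail-calls straight down this array until just_return() unwinds.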


// We could start defining normal Stages now. But first, some helper functions.

// These load() and store() methods are tail-aware,
// but focus mainly on keeping the at-stride tail==0 case fast.

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        switch (tail-1) {
            case 6: v[6] = src[6];
            case 5: v[5] = src[5];
            case 4: v[4] = src[4];
            case 3: v[3] = src[3];
            case 2: v[2] = src[2];
            case 1: v[1] = src[1];
            case 0: v[0] = src[0];
        }
        return v;
    }
#endif
    return unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        switch (tail-1) {
            case 6: dst[6] = v[6];
            case 5: dst[5] = v[5];
            case 4: dst[4] = v[4];
            case 3: dst[3] = v[3];
            case 2: dst[2] = v[2];
            case 1: dst[1] = v[1];
            case 0: dst[0] = v[0];
        }
        return;
    }
#endif
    memcpy(dst, &v, sizeof(v));
}
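
// For instance (illustrative values), with kStride == 8 a call like load<U32>(src, 3)
// copies src[0..2] into lanes 0-2 and leaves lanes 3-7 zeroed, while load<U32>(src, 0)
// means "a full stride" and does a plain 8-lane unaligned load.  The deliberate
// fall-through in the switches above is what copies exactly `tail` elements.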

// This doesn't look strictly necessary, but without it Clang would generate load() using
// compiler-generated constants that we can't support.  This version doesn't need constants.
#if defined(JUMPER) && defined(__AVX__)
template <>
inline U8 load(const uint8_t* src, size_t tail) {
    if (__builtin_expect(tail, 0)) {
        uint64_t v = 0;
        size_t shift = 0;
        #pragma nounroll
        while (tail --> 0) {
            v |= (uint64_t)*src++ << shift;
            shift += 8;
        }
        return unaligned_load<U8>(&v);
    }
    return unaligned_load<U8>(src);
}
#endif

// AVX2 adds some mask loads and stores that make for shorter, faster code.
#if defined(JUMPER) && defined(__AVX2__)
SI U32 mask(size_t tail) {
    // We go a little out of our way to avoid needing large constant values here.

    // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
    // Start fully on, then shift away lanes from the top until we've got our mask.
    uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);

    // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
    return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
}
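
// Worked example (illustrative): with kStride == 8 and tail == 3, the shift above is
// 8*(8-3) == 40 bits, so mask == 0x0000000000ffffff -- bytes 0, 1 and 2 are 0xff and the
// rest are 0x00.  After sign-extension the returned U32 is
// { 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0, 0 },
// exactly the lanes _mm256_maskload_epi32 / _mm256_maskstore_epi32 should touch.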

template <>
inline U32 load(const uint32_t* src, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        return _mm256_maskload_epi32((const int*)src, mask(tail));
    }
    return unaligned_load<U32>(src);
}

template <>
inline void store(uint32_t* dst, U32 v, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
    }
    memcpy(dst, &v, sizeof(v));
}
#endif

SI F from_byte(U8 b) {
    return cast(expand(b)) * C(1/255.0f);
}
SI void from_565(U16 _565, F* r, F* g, F* b) {
    U32 wide = expand(_565);
    *r = cast(wide & C(31<<11)) * C(1.0f / (31<<11));
    *g = cast(wide & C(63<< 5)) * C(1.0f / (63<< 5));
    *b = cast(wide & C(31<< 0)) * C(1.0f / (31<< 0));
}
SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
    U32 wide = expand(_4444);
    *r = cast(wide & C(15<<12)) * C(1.0f / (15<<12));
    *g = cast(wide & C(15<< 8)) * C(1.0f / (15<< 8));
    *b = cast(wide & C(15<< 4)) * C(1.0f / (15<< 4));
    *a = cast(wide & C(15<< 0)) * C(1.0f / (15<< 0));
}
SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
    *r = cast((_8888      ) & 0xff_i) * C(1/255.0f);
    *g = cast((_8888 >>  8) & 0xff_i) * C(1/255.0f);
    *b = cast((_8888 >> 16) & 0xff_i) * C(1/255.0f);
    *a = cast((_8888 >> 24)         ) * C(1/255.0f);
}
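
// A concrete example of these unpackers (illustrative): given a packed 8888 value of
// 0x80ff4020 -- R in the low byte, A in the high byte -- from_8888() produces
// r = 0x20/255 ~= 0.125, g = 0x40/255 ~= 0.251, b = 0xff/255 = 1.0 and a = 0x80/255 ~= 0.502.
// from_565() and from_4444() work the same way, just with different field widths and
// per-field scales (e.g. 1/31 and 1/63 for 565).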

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

// Now finally, normal Stages!

STAGE(seed_shader) {
    auto y = *(const int*)ctx;

    // It's important for speed to explicitly cast(x) and cast(y),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
    r = cast(x) + 0.5_f + unaligned_load<F>(k->iota);
    g = cast(y) + 0.5_f;
    b = 1.0_f;
    a = 0;
    dr = dg = db = da = 0;
}
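
// For example (illustrative, kStride == 4): at x == 8 with k->iota == {0,1,2,3},
// seed_shader leaves r == {8.5, 9.5, 10.5, 11.5} and g == y + 0.5 splatted across all
// lanes, i.e. the pixel-center coordinates of the four pixels this loop iteration covers.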

STAGE(constant_color) {
    auto rgba = (const float*)ctx;
    r = rgba[0];
    g = rgba[1];
    b = rgba[2];
    a = rgba[3];
}

// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)
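
// So, for example, BLEND_MODE(srcover) below expands (roughly) to
//
//     SI F srcover_channel(F s, F d, F sa, F da);
//     STAGE(srcover) {
//         r = srcover_channel(r,dr,a,da);
//         ...
//     }
//     SI F srcover_channel(F s, F d, F sa, F da) { return mad(d, inv(sa), s); }
//
// i.e. each BLEND_MODE(...) line defines both the stage and its per-channel formula.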

SI F inv(F x) { return 1.0_f - x; }
SI F two(F x) { return x + x; }

BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)  { return mad(s, inv(da), d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_)    { return s + d; }
BLEND_MODE(screen)   { return s + d - s*d; }
BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }
#undef BLEND_MODE

// Most other blend modes apply the same logic to colors, and srcover to alpha.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = mad(da, inv(a), a);                \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

BLEND_MODE(darken)     { return s + d -     max(s*da, d*sa); }
BLEND_MODE(lighten)    { return s + d -     min(s*da, d*sa); }
BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
BLEND_MODE(exclusion)  { return s + d - two(s*d); }

BLEND_MODE(colorburn) {
    return if_then_else(d == da, d + s*inv(da),
           if_then_else(s ==  0, s + d*inv(sa),
                        sa*(da - min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(colordodge) {
    return if_then_else(d ==  0, d + s*inv(da),
           if_then_else(s == sa, s + d*inv(sa),
                        sa*min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(hardlight) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
}
BLEND_MODE(overlay) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
}

BLEND_MODE(softlight) {
    F m  = if_then_else(da > 0, d / da, 0),
      s2 = two(s),
      m4 = two(two(m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    F darkSrc = d*(sa + (s2 - sa)*(1.0_f - m)),     // Used in case 1.
      darkDst = (m4*m4 + m4)*(m - 1.0_f) + 7.0_f*m, // Used in case 2.
      liteDst = rcp(rsqrt(m)) - m,                  // Used in case 3.
      liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst);  // 2 or 3?
    return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc);              // 1 or (2 or 3)?
}
#undef BLEND_MODE

STAGE(clamp_0) {
    r = max(r, 0);
    g = max(g, 0);
    b = max(b, 0);
    a = max(a, 0);
}

STAGE(clamp_1) {
    r = min(r, 1.0_f);
    g = min(g, 1.0_f);
    b = min(b, 1.0_f);
    a = min(a, 1.0_f);
}

STAGE(clamp_a) {
    a = min(a, 1.0_f);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}
STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}
STAGE(unpremul) {
    auto scale = if_then_else(a == 0, 0, 1.0_f / a);
    r = r * scale;
    g = g * scale;
    b = b * scale;
}

STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * C(1/12.92f);
        auto hi = mad(s*s, mad(s, 0.3000_f, 0.6975_f), 0.0025_f);
        return if_then_else(s < 0.055_f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
STAGE(to_srgb) {
    auto fn = [&](F l) {
        F sqrt = rcp  (rsqrt(l)),
          ftrt = rsqrt(rsqrt(l));
        auto lo = l * 12.46_f;
        auto hi = min(1.0_f, mad(0.411192_f, ftrt,
                                 mad(0.689206_f, sqrt, -0.0988_f)));
        return if_then_else(l < 0.0043_f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(from_2dot2) {
    auto fn = [](F x) {
        // x^(141/64) = x^(2.20312) is a great approximation of the true value, x^(2.2).
        // (note: x^(35/16) = x^(2.1875) is an okay one as well and would be quicker)
        F x16 = rsqrt(rsqrt(rsqrt(rsqrt(x)))),  // x^(1/16) = x^(4/64);
          x64 = rsqrt(rsqrt(x16));              // x^(1/64)

        // 141/64 = 128/64 + 12/64 + 1/64
        return max((x*x) * (x16*x16*x16) * x64, 0);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
STAGE(to_2dot2) {
    auto fn = [](F x) {
        // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
        F x2  = rsqrt(x),                       // x^(-1/2)
          x32 = rsqrt(rsqrt(rsqrt(rsqrt(x2)))), // x^(-1/32)
          x64 = rsqrt(x32);                     // x^(+1/64)

        // 29/64 = 32/64 - 2/64 - 1/64
        return max(rcp(x2) * x32 * rcp(x64), 0);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(rgb_to_hsl) {
    F mx = max(max(r,g), b),
      mn = min(min(r,g), b),
      d = mx - mn,
      d_rcp = 1.0_f / d;

    F h = C(1/6.0f) *
          if_then_else(mx == mn, 0,
          if_then_else(mx ==  r, (g-b)*d_rcp + if_then_else(g < b, 6.0_f, 0),
          if_then_else(mx ==  g, (b-r)*d_rcp + 2.0_f,
                                 (r-g)*d_rcp + 4.0_f)));

    F l = (mx + mn) * 0.5_f;
    F s = if_then_else(mx == mn, 0,
                       d / if_then_else(l > 0.5_f, 2.0_f-mx-mn, mx+mn));

    r = h;
    g = s;
    b = l;
}
STAGE(hsl_to_rgb) {
    F h = r,
      s = g,
      l = b;

    F q = if_then_else(l < 0.5_f, l*(1.0_f + s), l + s - l*s),
      p = 2.0_f*l - q;

    auto hue_to_rgb = [&](F t) {
        F t2 = if_then_else(t < 0.0_f, t + 1.0_f,
               if_then_else(t > 1.0_f, t - 1.0_f,
                                       t));

        return if_then_else(t2 < C(1/6.0f), p + (q-p)*6.0_f*t,
               if_then_else(t2 < C(3/6.0f), q,
               if_then_else(t2 < C(4/6.0f), p + (q-p)*6.0_f*(C(4/6.0f) - t2),
                                            p)));
    };

    r = if_then_else(s == 0, l, hue_to_rgb(h + C(1/3.0f)));
    g = if_then_else(s == 0, l, hue_to_rgb(h            ));
    b = if_then_else(s == 0, l, hue_to_rgb(h - C(1/3.0f)));
}

STAGE(scale_1_float) {
    auto c = *(const float*)ctx;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = from_byte(scales);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

SI F lerp(F from, F to, F t) {
    return mad(to-from, t, from);
}

STAGE(lerp_1_float) {
    auto c = *(const float*)ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = from_byte(scales);

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    F cr,cg,cb;
    from_565(load<U16>(ptr, tail), &cr, &cg, &cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = 1.0_f;
}

STAGE(load_tables) {
    struct Ctx {
        const uint32_t* src;
        const float *r, *g, *b;
    };
    auto c = (const Ctx*)ctx;

    auto px = load<U32>(c->src + x, tail);
    r = gather(c->r, (px      ) & 0xff_i);
    g = gather(c->g, (px >>  8) & 0xff_i);
    b = gather(c->b, (px >> 16) & 0xff_i);
    a = cast(        (px >> 24)) * C(1/255.0f);
}

STAGE(byte_tables) {
    struct Tables { const uint8_t *r, *g, *b, *a; };
    auto tables = (const Tables*)ctx;

    r = from_byte(gather(tables->r, round(r, 255.0_f)));
    g = from_byte(gather(tables->g, round(g, 255.0_f)));
    b = from_byte(gather(tables->b, round(b, 255.0_f)));
    a = from_byte(gather(tables->a, round(a, 255.0_f)));
}

STAGE(byte_tables_rgb) {
    struct Tables { const uint8_t *r, *g, *b; int n; };
    auto tables = (const Tables*)ctx;

    F scale = tables->n - 1;
    r = from_byte(gather(tables->r, round(r, scale)));
    g = from_byte(gather(tables->g, round(g, scale)));
    b = from_byte(gather(tables->b, round(b, scale)));
}

STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = 0.0f;
    a = from_byte(load<U8>(ptr, tail));
}
STAGE(gather_a8) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    r = g = b = 0.0f;
    a = from_byte(gather(ptr, ix));
}
STAGE(store_a8) {
    auto ptr = *(uint8_t**)ctx + x;

    U8 packed = pack(pack(round(a, 255.0_f)));
    store(ptr, packed, tail);
}

STAGE(load_g8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = from_byte(load<U8>(ptr, tail));
    a = 1.0_f;
}
STAGE(gather_g8) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    r = g = b = from_byte(gather(ptr, ix));
    a = 1.0_f;
}

STAGE(gather_i8) {
    auto c = (const SkJumper_GatherCtx*)ctx;
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    ix = expand(gather(ptr, ix));
    from_8888(gather(c->ctable, ix), &r,&g,&b,&a);
}

STAGE(load_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    from_565(load<U16>(ptr, tail), &r,&g,&b);
    a = 1.0_f;
}
STAGE(gather_565) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_565(gather(ptr, ix), &r,&g,&b);
    a = 1.0_f;
}
STAGE(store_565) {
    auto ptr = *(uint16_t**)ctx + x;

    U16 px = pack( round(r, 31.0_f) << 11
                 | round(g, 63.0_f) <<  5
                 | round(b, 31.0_f)       );
    store(ptr, px, tail);
}

STAGE(load_4444) {
    auto ptr = *(const uint16_t**)ctx + x;
    from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
}
STAGE(gather_4444) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_4444(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_4444) {
    auto ptr = *(uint16_t**)ctx + x;
    U16 px = pack( round(r, 15.0_f) << 12
                 | round(g, 15.0_f) <<  8
                 | round(b, 15.0_f) <<  4
                 | round(a, 15.0_f)       );
    store(ptr, px, tail);
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;
    from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(gather_8888) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_8888(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    U32 px = round(r, 255.0_f)
           | round(g, 255.0_f) <<  8
           | round(b, 255.0_f) << 16
           | round(a, 255.0_f) << 24;
    store(ptr, px, tail);
}

STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;

    U16 R,G,B,A;
    load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
    r = from_half(R);
    g = from_half(G);
    b = from_half(B);
    a = from_half(A);
}
STAGE(gather_f16) {
    const uint64_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    auto px = gather(ptr, ix);

    U16 R,G,B,A;
    load4((const uint16_t*)&px,0, &R,&G,&B,&A);
    r = from_half(R);
    g = from_half(G);
    b = from_half(B);
    a = from_half(A);
}
STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;
    store4((uint16_t*)ptr,tail, to_half(r)
                              , to_half(g)
                              , to_half(b)
                              , to_half(a));
}

STAGE(load_u16_be) {
    auto ptr = *(const uint64_t**)ctx + x;

    U16 R,G,B,A;
    load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);

    r = C(1/65535.0f) * cast(expand(bswap(R)));
    g = C(1/65535.0f) * cast(expand(bswap(G)));
    b = C(1/65535.0f) * cast(expand(bswap(B)));
    a = C(1/65535.0f) * cast(expand(bswap(A)));
}
STAGE(store_u16_be) {
    auto ptr = *(uint64_t**)ctx + x;

    U16 R = bswap(pack(round(r, 65535.0_f))),
        G = bswap(pack(round(g, 65535.0_f))),
        B = bswap(pack(round(b, 65535.0_f))),
        A = bswap(pack(round(a, 65535.0_f)));

    store4((uint16_t*)ptr,tail, R,G,B,A);
}

STAGE(load_f32) {
    auto ptr = *(const float**)ctx + 4*x;
    load4(ptr,tail, &r,&g,&b,&a);
}
STAGE(store_f32) {
    auto ptr = *(float**)ctx + 4*x;
    store4(ptr,tail, r,g,b,a);
}

SI F ulp_before(F v) {
    return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff));
}
SI F clamp(F v, float limit) {
    v = max(0, v);
    return min(v, ulp_before(limit));
}
SI F repeat(F v, float limit) {
    v = v - floor_(v/limit)*limit;
    return min(v, ulp_before(limit));
}
SI F mirror(F v, float limit) {
    v = abs_( (v-limit) - (limit+limit)*floor_((v-limit)/(limit+limit)) - limit );
    return min(v, ulp_before(limit));
}
STAGE(clamp_x)  { r = clamp (r, *(const float*)ctx); }
STAGE(clamp_y)  { g = clamp (g, *(const float*)ctx); }
STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); }
STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); }
STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); }
STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); }
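
// A quick sanity check of the tiling math (illustrative numbers): with limit == 4,
// repeat(5.25, 4) == 5.25 - floor(5.25/4)*4 == 1.25, and mirror(5.25, 4) == 2.75, i.e.
// the coordinate bounces back from the edge.  ulp_before(limit) keeps coordinates
// strictly below limit so trunc_() never produces an out-of-bounds pixel index.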

STAGE(luminance_to_alpha) {
    a = r*0.2126_f + g*0.7152_f + b*0.0722_f;
    r = g = b = 0;
}

STAGE(matrix_2x3) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[2], m[4])),
         G = mad(r,m[1], mad(g,m[3], m[5]));
    r = R;
    g = G;
}
STAGE(matrix_3x4) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
         G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
         B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
    r = R;
    g = G;
    b = B;
}
STAGE(matrix_4x5) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[4], mad(b,m[ 8], mad(a,m[12], m[16])))),
         G = mad(r,m[1], mad(g,m[5], mad(b,m[ 9], mad(a,m[13], m[17])))),
         B = mad(r,m[2], mad(g,m[6], mad(b,m[10], mad(a,m[14], m[18])))),
         A = mad(r,m[3], mad(g,m[7], mad(b,m[11], mad(a,m[15], m[19]))));
    r = R;
    g = G;
    b = B;
    a = A;
}
STAGE(matrix_perspective) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[1], m[2])),
         G = mad(r,m[3], mad(g,m[4], m[5])),
         Z = mad(r,m[6], mad(g,m[7], m[8]));
    r = R * rcp(Z);
    g = G * rcp(Z);
}

STAGE(linear_gradient) {
    struct Stop { float pos; float f[4], b[4]; };
    struct Ctx { size_t n; Stop *stops; float start[4]; };

    auto c = (const Ctx*)ctx;
    F fr = 0, fg = 0, fb = 0, fa = 0;
    F br = c->start[0],
      bg = c->start[1],
      bb = c->start[2],
      ba = c->start[3];
    auto t = r;
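    // Each stop appears to carry a per-channel scale f[] and bias b[] for the span that
    // begins at its pos; the scan below keeps the f/b of the last stop whose pos <= t
    // (falling back to c->start before the first stop), so the color computed afterwards
    // is the piecewise-linear value f*t + b for whichever span t landed in.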
    for (size_t i = 0; i < c->n; i++) {
        fr = if_then_else(t < c->stops[i].pos, fr, c->stops[i].f[0]);
        fg = if_then_else(t < c->stops[i].pos, fg, c->stops[i].f[1]);
        fb = if_then_else(t < c->stops[i].pos, fb, c->stops[i].f[2]);
        fa = if_then_else(t < c->stops[i].pos, fa, c->stops[i].f[3]);
        br = if_then_else(t < c->stops[i].pos, br, c->stops[i].b[0]);
        bg = if_then_else(t < c->stops[i].pos, bg, c->stops[i].b[1]);
        bb = if_then_else(t < c->stops[i].pos, bb, c->stops[i].b[2]);
        ba = if_then_else(t < c->stops[i].pos, ba, c->stops[i].b[3]);
    }

    r = mad(t, fr, br);
    g = mad(t, fg, bg);
    b = mad(t, fb, bb);
    a = mad(t, fa, ba);
}

STAGE(linear_gradient_2stops) {
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = r;
    r = mad(t, c->f[0], c->b[0]);
    g = mad(t, c->f[1], c->b[1]);
    b = mad(t, c->f[2], c->b[2]);
    a = mad(t, c->f[3], c->b[3]);
}

STAGE(save_xy) {
    auto c = (SkJumper_SamplerCtx*)ctx;

    // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
    // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
    // surrounding (x,y) at (0.5,0.5) off-center.
    auto fract = [](F v) { return v - floor_(v); };
    F fx = fract(r + 0.5_f),
      fy = fract(g + 0.5_f);

    // Samplers will need to load x and fx, or y and fy.
    memcpy(c->x,  &r,  sizeof(F));
    memcpy(c->y,  &g,  sizeof(F));
    memcpy(c->fx, &fx, sizeof(F));
    memcpy(c->fy, &fy, sizeof(F));
}
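
// For example (illustrative): a sample coordinate r == 3.2 gives fx == fract(3.7) == 0.7,
// meaning the sample point sits 0.7 of the way between the centers of pixels 2 and 3 in x;
// fx is what the bilinear/bicubic weights below are computed from.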

STAGE(accumulate) {
    auto c = (const SkJumper_SamplerCtx*)ctx;

    // Bilinear and bicubic filters are both separable, so we produce independent contributions
    // from x and y, multiplying them together here to get each pixel's total scale factor.
    auto scale = unaligned_load<F>(c->scalex)
               * unaligned_load<F>(c->scaley);
    dr = mad(scale, r, dr);
    dg = mad(scale, g, dg);
    db = mad(scale, b, db);
    da = mad(scale, a, da);
}

// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
// are combined in direct proportion to their area overlapping that logical query pixel.
// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
// The y-axis is symmetric.

template <int kScale>
SI void bilinear_x(SkJumper_SamplerCtx* ctx, F* x) {
    *x = unaligned_load<F>(ctx->x) + C(kScale * 0.5f);
    F fx = unaligned_load<F>(ctx->fx);

    F scalex;
    if (kScale == -1) { scalex = 1.0_f - fx; }
    if (kScale == +1) { scalex =         fx; }
    memcpy(ctx->scalex, &scalex, sizeof(F));
}
template <int kScale>
SI void bilinear_y(SkJumper_SamplerCtx* ctx, F* y) {
    *y = unaligned_load<F>(ctx->y) + C(kScale * 0.5f);
    F fy = unaligned_load<F>(ctx->fy);

    F scaley;
    if (kScale == -1) { scaley = 1.0_f - fy; }
    if (kScale == +1) { scaley =         fy; }
    memcpy(ctx->scaley, &scaley, sizeof(F));
}

STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }
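
// So a bilinear sample is presumably built as a little sub-pipeline: save_xy, then for
// each of the four neighbors a pair like bilinear_nx + bilinear_ny, a load/gather stage,
// and accumulate.  With fx == 0.7 and fy == 0.25 (illustrative), the four weights are
// 0.3*0.75, 0.7*0.75, 0.3*0.25 and 0.7*0.25, which sum to 1.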


// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
//
// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
// See GrCubicEffect for details of this particular filter.

SI F bicubic_near(F t) {
    // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
    return mad(t, mad(t, mad(C(-21/18.0f), t, C(27/18.0f)), C(9/18.0f)), C(1/18.0f));
}
SI F bicubic_far(F t) {
    // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
    return (t*t)*mad(C(7/18.0f), t, C(-6/18.0f));
}

template <int kScale>
SI void bicubic_x(SkJumper_SamplerCtx* ctx, F* x) {
    *x = unaligned_load<F>(ctx->x) + C(kScale * 0.5f);
    F fx = unaligned_load<F>(ctx->fx);

    F scalex;
    if (kScale == -3) { scalex = bicubic_far (1.0_f - fx); }
    if (kScale == -1) { scalex = bicubic_near(1.0_f - fx); }
    if (kScale == +1) { scalex = bicubic_near(        fx); }
    if (kScale == +3) { scalex = bicubic_far (        fx); }
    memcpy(ctx->scalex, &scalex, sizeof(F));
}
template <int kScale>
SI void bicubic_y(SkJumper_SamplerCtx* ctx, F* y) {
    *y = unaligned_load<F>(ctx->y) + C(kScale * 0.5f);
    F fy = unaligned_load<F>(ctx->fy);

    F scaley;
    if (kScale == -3) { scaley = bicubic_far (1.0_f - fy); }
    if (kScale == -1) { scaley = bicubic_near(1.0_f - fy); }
    if (kScale == +1) { scaley = bicubic_near(        fy); }
    if (kScale == +3) { scaley = bicubic_far (        fy); }
    memcpy(ctx->scaley, &scaley, sizeof(F));
}

STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }

STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }