LCOV - code coverage report
Current view: top level - gfx/skia/skia/src/opts - SkRasterPipeline_opts.h (source / functions)
Test:  output.info
Date:  2017-07-14 16:53:18

                 Hit    Total    Coverage
Lines:             0      656       0.0 %
Functions:         0      181       0.0 %

Legend:  Lines:  hit  |  not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright 2016 Google Inc.
       3             :  *
       4             :  * Use of this source code is governed by a BSD-style license that can be
       5             :  * found in the LICENSE file.
       6             :  */
       7             : 
       8             : #ifndef SkRasterPipeline_opts_DEFINED
       9             : #define SkRasterPipeline_opts_DEFINED
      10             : 
      11             : #include "SkColorPriv.h"
      12             : #include "SkColorLookUpTable.h"
      13             : #include "SkColorSpaceXform_A2B.h"
      14             : #include "SkColorSpaceXformPriv.h"
      15             : #include "SkHalf.h"
      16             : #include "SkMSAN.h"
      17             : #include "SkPM4f.h"
      18             : #include "SkPM4fPriv.h"
      19             : #include "SkRasterPipeline.h"
      20             : #include "SkShader.h"
      21             : #include "SkSRGB.h"
      22             : #include "../jumper/SkJumper.h"
      23             : 
      24             : namespace {
      25             : 
      26             :     static constexpr int N = 4;
      27             : 
      28             :     using SkNf = SkNx<N, float>;
      29             :     using SkNi = SkNx<N, int32_t>;
      30             :     using SkNu = SkNx<N, uint32_t>;
      31             :     using SkNh = SkNx<N, uint16_t>;
      32             :     using SkNb = SkNx<N, uint8_t>;
      33             : 
      34             :     using Fn = void(SK_VECTORCALL *)(size_t x_tail, void** p, SkNf,SkNf,SkNf,SkNf,
      35             :                                                               SkNf,SkNf,SkNf,SkNf);
      36             :     // x_tail encodes two values x and tail as x*N+tail, where 0 <= tail < N.
      37             :     // x is the induction variable we're walking along, incrementing by N each step.
      38             :     // tail == 0 means work with a full N pixels; otherwise use only the low tail pixels.
      39             :     //
       40             :     // p is our program, a sequence of Fn to call interleaved with any void* context pointers.  E.g.
      41             :     //    &load_8888
      42             :     //    (src ptr)
      43             :     //    &from_srgb
      44             :     //    &move_src_dst
      45             :     //    &load_f16
      46             :     //    (dst ptr)
      47             :     //    &swap
      48             :     //    &srcover
      49             :     //    &store_f16
      50             :     //    (dst ptr)
      51             :     //    &just_return
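                      :     //
                      :     // For example, a driver that runs a program over n pixels starting at pixel x
                      :     // might encode x_tail like this (a sketch; `start` and `n` are hypothetical):
                      :     //
                      :     //    while (n >= N) { start(x*N + 0, p, ...); x += N; n -= N; }
                      :     //    if (n >  0)    { start(x*N + n, p, ...); }   // tail == n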
      52             : 
      53             : }  // namespace
      54             : 
      55             : #define SI static inline
      56             : 
      57             : // Basically, return *(*ptr)++, maybe faster than the compiler can do it.
      58           0 : SI void* load_and_increment(void*** ptr) {
      59             :     // We do this often enough that it's worth hyper-optimizing.
      60             :     // x86 can do this in one instruction if ptr is in rsi.
      61             :     // (This is why p is the second argument to Fn: it's passed in rsi.)
      62             : #if defined(__GNUC__) && defined(__x86_64__)
      63             :     void* rax;
      64           0 :     __asm__("lodsq" : "=a"(rax), "+S"(*ptr));
      65           0 :     return rax;
      66             : #else
      67             :     return *(*ptr)++;
      68             : #endif
      69             : }
      70             : 
      71             : // Stages are logically a pipeline, and physically are contiguous in an array.
      72             : // To get to the next stage, we just increment our pointer to the next array element.
      73           0 : SI void SK_VECTORCALL next(size_t x_tail, void** p, SkNf  r, SkNf  g, SkNf  b, SkNf  a,
      74             :                                                     SkNf dr, SkNf dg, SkNf db, SkNf da) {
      75           0 :     auto next = (Fn)load_and_increment(&p);
      76           0 :     next(x_tail,p, r,g,b,a, dr,dg,db,da);
      77           0 : }
      78             : 
      79             : // Stages defined below always call next.
      80             : // This is always the last stage, a backstop that actually returns to the caller when done.
      81           0 : SI void SK_VECTORCALL just_return(size_t, void**, SkNf, SkNf, SkNf, SkNf,
      82           0 :                                                   SkNf, SkNf, SkNf, SkNf) {}
      83             : 
      84             : #define STAGE(name)                                                                      \
      85             :     static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail,                    \
      86             :                                                SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,   \
      87             :                                                SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);  \
      88             :     SI void SK_VECTORCALL name(size_t x_tail, void** p,                                  \
      89             :                                SkNf  r, SkNf  g, SkNf  b, SkNf  a,                       \
      90             :                                SkNf dr, SkNf dg, SkNf db, SkNf da) {                     \
      91             :         name##_kernel(x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da);                         \
      92             :         next(x_tail,p, r,g,b,a, dr,dg,db,da);                                            \
      93             :     }                                                                                    \
      94             :     static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail,                    \
      95             :                                                SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,   \
      96             :                                                SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
      97             : 
      98             : #define STAGE_CTX(name, Ctx)                                                             \
      99             :     static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail,           \
     100             :                                                SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,   \
     101             :                                                SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);  \
     102             :     SI void SK_VECTORCALL name(size_t x_tail, void** p,                                  \
     103             :                                SkNf  r, SkNf  g, SkNf  b, SkNf  a,                       \
     104             :                                SkNf dr, SkNf dg, SkNf db, SkNf da) {                     \
     105             :         auto ctx = (Ctx)load_and_increment(&p);                                          \
     106             :         name##_kernel(ctx, x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da);                    \
     107             :         next(x_tail,p, r,g,b,a, dr,dg,db,da);                                            \
     108             :     }                                                                                    \
     109             :     static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail,           \
     110             :                                                SkNf&  r, SkNf&  g, SkNf&  b, SkNf&  a,   \
     111             :                                                SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
     112             : 
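                      : // So STAGE(name) defines name(), which splits x_tail back into (x, tail), runs
                      : // name##_kernel on the channel registers, then tail-calls next().  STAGE_CTX does
                      : // the same after first popping a context pointer off the program.  A sketch of
                      : // what STAGE(premul) (defined further below) expands to:
                      : //
                      : //    void premul(size_t x_tail, void** p, SkNf r, ..., SkNf da) {
                      : //        premul_kernel(x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da);
                      : //        next(x_tail,p, r,g,b,a, dr,dg,db,da);
                      : //    }
                      : 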
     113             : // Many xfermodes apply the same logic to each channel.
     114             : #define RGBA_XFERMODE(name)                                                     \
     115             :     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,   \
     116             :                                                const SkNf& d, const SkNf& da);  \
     117             :     SI void SK_VECTORCALL name(size_t x_tail, void** p,                         \
     118             :                                SkNf  r, SkNf  g, SkNf  b, SkNf  a,              \
     119             :                                SkNf dr, SkNf dg, SkNf db, SkNf da) {            \
     120             :         r = name##_kernel(r,a,dr,da);                                           \
     121             :         g = name##_kernel(g,a,dg,da);                                           \
     122             :         b = name##_kernel(b,a,db,da);                                           \
     123             :         a = name##_kernel(a,a,da,da);                                           \
     124             :         next(x_tail,p, r,g,b,a, dr,dg,db,da);                                   \
     125             :     }                                                                           \
     126             :     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,   \
     127             :                                                const SkNf& d, const SkNf& da)
     128             : 
     129             : // Most of the rest apply the same logic to color channels and use srcover's alpha logic.
     130             : #define RGB_XFERMODE(name)                                                      \
     131             :     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,   \
     132             :                                                const SkNf& d, const SkNf& da);  \
     133             :     SI void SK_VECTORCALL name(size_t x_tail, void** p,                         \
     134             :                                SkNf  r, SkNf  g, SkNf  b, SkNf  a,              \
     135             :                                SkNf dr, SkNf dg, SkNf db, SkNf da) {            \
     136             :         r = name##_kernel(r,a,dr,da);                                           \
     137             :         g = name##_kernel(g,a,dg,da);                                           \
     138             :         b = name##_kernel(b,a,db,da);                                           \
     139             :         a = a + (da * (1.0f-a));                                                \
     140             :         next(x_tail,p, r,g,b,a, dr,dg,db,da);                                   \
     141             :     }                                                                           \
     142             :     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,   \
     143             :                                                const SkNf& d, const SkNf& da)
     144             : 
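                      : // E.g. RGB_XFERMODE(darken), used below, defines darken(): darken_kernel applies
                      : // to r, g, and b, while alpha always gets srcover's  a + da*(1-a).
                      : 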
     145             : template <typename T>
     146           0 : SI SkNx<N,T> load(size_t tail, const T* src) {
     147           0 :     if (tail) {
     148             :         T buf[8];
     149           0 :         memset(buf, 0, 8*sizeof(T));
     150           0 :         switch (tail & (N-1)) {
     151           0 :             case 7: buf[6] = src[6];
     152           0 :             case 6: buf[5] = src[5];
     153           0 :             case 5: buf[4] = src[4];
     154           0 :             case 4: buf[3] = src[3];
     155           0 :             case 3: buf[2] = src[2];
     156           0 :             case 2: buf[1] = src[1];
     157             :         }
     158           0 :         buf[0] = src[0];
     159             :         return SkNx<N,T>::Load(buf);
     160             :     }
     161           0 :     return SkNx<N,T>::Load(src);
     162             : }
     163             : template <typename T>
     164           0 : SI SkNx<N,T> gather(size_t tail, const T* src, const SkNi& offset) {
     165           0 :     if (tail) {
     166           0 :         T buf[8] = {0};
     167           0 :         switch (tail & (N-1)) {
     168           0 :             case 7: buf[6] = src[offset[6]];
     169           0 :             case 6: buf[5] = src[offset[5]];
     170           0 :             case 5: buf[4] = src[offset[4]];
     171           0 :             case 4: buf[3] = src[offset[3]];
     172           0 :             case 3: buf[2] = src[offset[2]];
     173           0 :             case 2: buf[1] = src[offset[1]];
     174             :         }
     175           0 :         buf[0] = src[offset[0]];
     176             :         return SkNx<N,T>::Load(buf);
     177             :     }
     178             :     T buf[8];
     179           0 :     for (size_t i = 0; i < N; i++) {
     180           0 :         buf[i] = src[offset[i]];
     181             :     }
     182           0 :     return SkNx<N,T>::Load(buf);
     183             : }
     184             : template <typename T>
     185           0 : SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
     186           0 :     if (tail) {
     187           0 :         switch (tail & (N-1)) {
     188           0 :             case 7: dst[6] = v[6];
     189           0 :             case 6: dst[5] = v[5];
     190           0 :             case 5: dst[4] = v[4];
     191           0 :             case 4: dst[3] = v[3];
     192           0 :             case 3: dst[2] = v[2];
     193           0 :             case 2: dst[1] = v[1];
     194             :         }
     195           0 :         dst[0] = v[0];
     196           0 :         return;
     197             :     }
     198             :     v.store(dst);
     199             : }
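                      : // E.g. with N == 4 and tail == 3, load() reads src[0..2] and zero-fills lane 3,
                      : // while store() writes dst[0..2] and leaves dst[3] untouched.  The case labels
                      : // fall through deliberately; cases 7..4 are dead at N == 4 but keep these
                      : // helpers correct for wider (8-lane) configurations.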
     200             : 
     201           0 : SI SkNf SkNf_fma(const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }
     202             : 
     203           0 : SI SkNi SkNf_round(const SkNf& x, const SkNf& scale) {
     204             :     // Every time I try, _mm_cvtps_epi32 benches as slower than using FMA and _mm_cvttps_epi32.  :/
     205           0 :     return SkNx_cast<int>(SkNf_fma(x,scale, 0.5f));
     206             : }
     207             : 
     208           0 : SI SkNf SkNf_from_byte(const SkNi& x) {
     209             :     // Same trick as in store_8888: 0x470000BB == 32768.0f + BB/256.0f for all bytes BB.
     210           0 :     auto v = 0x47000000 | x;
     211             :     // Read this as (pun_float(v) - 32768.0f) * (256/255.0f), redistributed to be an FMA.
     212           0 :     return SkNf_fma(SkNf::Load(&v), 256/255.0f, -32768*256/255.0f);
     213             : }
     214           0 : SI SkNf SkNf_from_byte(const SkNu& x) { return SkNf_from_byte(SkNi::Load(&x)); }
     215           0 : SI SkNf SkNf_from_byte(const SkNb& x) { return SkNf_from_byte(SkNx_cast<int>(x)); }
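                      : // Scalar sketch of the trick above: 0x47000000 is the bit pattern of 32768.0f,
                      : // and floats near 32768 are spaced exactly 1/256 apart, so OR-ing a byte BB into
                      : // the low mantissa bits produces 32768.0f + BB/256.0f:
                      : //
                      : //    uint32_t bits = 0x47000000 | BB;
                      : //    float f;  memcpy(&f, &bits, sizeof(f));      // f == 32768.0f + BB/256.0f
                      : //    float unit = (f - 32768.0f) * (256/255.0f);  // == BB/255.0f, in [0,1]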
     216             : 
     217           0 : SI void from_8888(const SkNu& _8888, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
     218           0 :     *r = SkNf_from_byte((_8888      ) & 0xff);
     219           0 :     *g = SkNf_from_byte((_8888 >>  8) & 0xff);
     220           0 :     *b = SkNf_from_byte((_8888 >> 16) & 0xff);
     221           0 :     *a = SkNf_from_byte((_8888 >> 24)       );
     222           0 : }
     223           0 : SI void from_4444(const SkNh& _4444, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
     224           0 :     auto _32_bit = SkNx_cast<int>(_4444);
     225             : 
     226           0 :     *r = SkNx_cast<float>(_32_bit & (0xF << SK_R4444_SHIFT)) * (1.0f / (0xF << SK_R4444_SHIFT));
     227           0 :     *g = SkNx_cast<float>(_32_bit & (0xF << SK_G4444_SHIFT)) * (1.0f / (0xF << SK_G4444_SHIFT));
     228           0 :     *b = SkNx_cast<float>(_32_bit & (0xF << SK_B4444_SHIFT)) * (1.0f / (0xF << SK_B4444_SHIFT));
     229           0 :     *a = SkNx_cast<float>(_32_bit & (0xF << SK_A4444_SHIFT)) * (1.0f / (0xF << SK_A4444_SHIFT));
     230           0 : }
     231           0 : SI void from_565(const SkNh& _565, SkNf* r, SkNf* g, SkNf* b) {
     232           0 :     auto _32_bit = SkNx_cast<int>(_565);
     233             : 
     234           0 :     *r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
     235           0 :     *g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
     236           0 :     *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
     237           0 : }
     238           0 : SI void from_f16(const void* px, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
     239             :     SkNh rh, gh, bh, ah;
     240             :     SkNh::Load4(px, &rh, &gh, &bh, &ah);
     241             : 
     242           0 :     *r = SkHalfToFloat_finite_ftz(rh);
     243           0 :     *g = SkHalfToFloat_finite_ftz(gh);
     244           0 :     *b = SkHalfToFloat_finite_ftz(bh);
     245           0 :     *a = SkHalfToFloat_finite_ftz(ah);
     246           0 : }
     247             : 
     248           0 : STAGE(clamp_0) {
     249           0 :     a = SkNf::Max(a, 0.0f);
     250           0 :     r = SkNf::Max(r, 0.0f);
     251           0 :     g = SkNf::Max(g, 0.0f);
     252           0 :     b = SkNf::Max(b, 0.0f);
     253             : }
     254           0 : STAGE(clamp_1) {
     255           0 :     a = SkNf::Min(a, 1.0f);
     256           0 :     r = SkNf::Min(r, 1.0f);
     257           0 :     g = SkNf::Min(g, 1.0f);
     258           0 :     b = SkNf::Min(b, 1.0f);
     259             : }
     260           0 : STAGE(clamp_a) {
     261           0 :     a = SkNf::Min(a, 1.0f);
     262           0 :     r = SkNf::Min(r, a);
     263           0 :     g = SkNf::Min(g, a);
     264           0 :     b = SkNf::Min(b, a);
     265             : }
     266             : 
     267           0 : STAGE(unpremul) {
     268           0 :     auto scale = (a == 0.0f).thenElse(0.0f, 1.0f/a);
     269             :     r *= scale;
     270             :     g *= scale;
     271             :     b *= scale;
     272             : }
     273           0 : STAGE(premul) {
     274             :     r *= a;
     275             :     g *= a;
     276             :     b *= a;
     277             : }
     278             : 
     279           0 : STAGE_CTX(set_rgb, const float*) {
     280           0 :     r = ctx[0];
     281           0 :     g = ctx[1];
     282           0 :     b = ctx[2];
     283             : }
     284           0 : STAGE(swap_rb) { SkTSwap(r,b); }
     285             : 
     286           0 : STAGE(move_src_dst) {
     287           0 :     dr = r;
     288           0 :     dg = g;
     289           0 :     db = b;
     290           0 :     da = a;
     291             : }
     292           0 : STAGE(move_dst_src) {
     293           0 :     r = dr;
     294           0 :     g = dg;
     295           0 :     b = db;
     296           0 :     a = da;
     297             : }
     298           0 : STAGE(swap) {
     299           0 :     SkTSwap(r,dr);
     300           0 :     SkTSwap(g,dg);
     301           0 :     SkTSwap(b,db);
     302           0 :     SkTSwap(a,da);
     303             : }
     304             : 
     305           0 : STAGE(from_srgb) {
     306           0 :     r = sk_linear_from_srgb_math(r);
     307           0 :     g = sk_linear_from_srgb_math(g);
     308           0 :     b = sk_linear_from_srgb_math(b);
     309             : }
     310           0 : STAGE(to_srgb) {
     311           0 :     r = sk_linear_to_srgb_needs_round(r);
     312           0 :     g = sk_linear_to_srgb_needs_round(g);
     313           0 :     b = sk_linear_to_srgb_needs_round(b);
     314             : }
     315             : 
     316           0 : STAGE(from_2dot2) {
     317           0 :     auto from_2dot2 = [](const SkNf& x) {
     318             :         // x^(141/64) = x^(2.20312) is a great approximation of the true value, x^(2.2).
     319             :         // (note: x^(35/16) = x^(2.1875) is an okay one as well and would be quicker)
     320           0 :         auto x16 = x.rsqrt().rsqrt().rsqrt().rsqrt();   // x^(1/16) = x^(4/64);
     321           0 :         auto x64 = x16.rsqrt().rsqrt();                 // x^(1/64)
     322             : 
     323             :         // x^(141/64) = x^(128/64) * x^(12/64) * x^(1/64)
     324           0 :         return SkNf::Max((x*x) * (x16*x16*x16) * (x64), 0.0f);
     325             :     };
     326             : 
     327           0 :     r = from_2dot2(r);
     328           0 :     g = from_2dot2(g);
     329           0 :     b = from_2dot2(b);
     330             : }
     331           0 : STAGE(to_2dot2) {
     332           0 :     auto to_2dot2 = [](const SkNf& x) {
     333             :         // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
     334           0 :         auto x2  = x.rsqrt(),                            // x^(-1/2)
     335           0 :              x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(),   // x^(-1/32)
     336           0 :              x64 = x32.rsqrt();                          // x^(+1/64)
     337             : 
     338             :         // 29 = 32 - 2 - 1
     339           0 :         return SkNf::Max(x2.invert() * x32 * x64.invert(), 0.0f); // Watch out for NaN.
     340             :     };
     341             : 
     342           0 :     r = to_2dot2(r);
     343           0 :     g = to_2dot2(g);
     344           0 :     b = to_2dot2(b);
     345             : }
     346             : 
     347             : // The default shader produces a constant color (from the SkPaint).
     348           0 : STAGE_CTX(constant_color, const SkPM4f*) {
     349           0 :     r = ctx->r();
     350           0 :     g = ctx->g();
     351           0 :     b = ctx->b();
     352           0 :     a = ctx->a();
     353             : }
     354             : 
     355             : // Set up registers with values relevant to shaders.
     356           0 : STAGE_CTX(seed_shader, const int*) {
     357           0 :     int y = *ctx;
     358             : 
     359             :     static const float dx[] = { 0,1,2,3,4,5,6,7 };
     360           0 :     r = x + 0.5f + SkNf::Load(dx);  // dst pixel center x coordinates
     361           0 :     g = y + 0.5f;                   // dst pixel center y coordinate(s)
     362           0 :     b = 1.0f;
     363           0 :     a = 0.0f;
     364           0 :     dr = dg = db = da = 0.0f;
     365             : }
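                      : // For example, a simple bitmap shader's program might then continue
                      : //    seed_shader -> matrix_2x3 -> clamp_x -> clamp_y -> gather_8888 -> ...
                      : // (one illustrative ordering, not the only possibility), with r and g carrying
                      : // x and y coordinates until the gather turns them into colors.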
     366             : 
     367             : // s' = sc for a scalar c.
     368           0 : STAGE_CTX(scale_1_float, const float*) {
     369           0 :     SkNf c = *ctx;
     370             : 
     371             :     r *= c;
     372             :     g *= c;
     373             :     b *= c;
     374             :     a *= c;
     375             : }
     376             : // s' = sc for 8-bit c.
     377           0 : STAGE_CTX(scale_u8, const uint8_t**) {
     378           0 :     auto ptr = *ctx + x;
     379           0 :     SkNf c = SkNf_from_byte(load(tail, ptr));
     380             : 
     381           0 :     r = r*c;
     382           0 :     g = g*c;
     383           0 :     b = b*c;
     384           0 :     a = a*c;
     385             : }
     386             : 
     387           0 : SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {
     388           0 :     return SkNf_fma(to-from, cov, from);
     389             : }
     390             : 
     391             : // s' = d(1-c) + sc, for a scalar c.
     392           0 : STAGE_CTX(lerp_1_float, const float*) {
     393           0 :     SkNf c = *ctx;
     394             : 
     395           0 :     r = lerp(dr, r, c);
     396           0 :     g = lerp(dg, g, c);
     397           0 :     b = lerp(db, b, c);
     398           0 :     a = lerp(da, a, c);
     399             : }
     400             : 
     401             : // s' = d(1-c) + sc for 8-bit c.
     402           0 : STAGE_CTX(lerp_u8, const uint8_t**) {
     403           0 :     auto ptr = *ctx + x;
     404           0 :     SkNf c = SkNf_from_byte(load(tail, ptr));
     405             : 
     406           0 :     r = lerp(dr, r, c);
     407           0 :     g = lerp(dg, g, c);
     408           0 :     b = lerp(db, b, c);
     409           0 :     a = lerp(da, a, c);
     410             : }
     411             : 
     412             : // s' = d(1-c) + sc for 565 c.
     413           0 : STAGE_CTX(lerp_565, const uint16_t**) {
     414           0 :     auto ptr = *ctx + x;
     415             :     SkNf cr, cg, cb;
     416           0 :     from_565(load(tail, ptr), &cr, &cg, &cb);
     417             : 
     418           0 :     r = lerp(dr, r, cr);
     419           0 :     g = lerp(dg, g, cg);
     420           0 :     b = lerp(db, b, cb);
     421           0 :     a = 1.0f;
     422             : }
     423             : 
     424           0 : STAGE_CTX(load_a8, const uint8_t**) {
     425           0 :     auto ptr = *ctx + x;
     426           0 :     r = g = b = 0.0f;
     427           0 :     a = SkNf_from_byte(load(tail, ptr));
     428             : }
     429           0 : STAGE_CTX(store_a8, uint8_t**) {
     430           0 :     auto ptr = *ctx + x;
     431           0 :     store(tail, SkNx_cast<uint8_t>(SkNf_round(255.0f, a)), ptr);
     432             : }
     433             : 
     434           0 : STAGE_CTX(load_g8, const uint8_t**) {
     435           0 :     auto ptr = *ctx + x;
     436           0 :     r = g = b = SkNf_from_byte(load(tail, ptr));
     437           0 :     a = 1.0f;
     438             : }
     439             : 
     440           0 : STAGE_CTX(load_565, const uint16_t**) {
     441           0 :     auto ptr = *ctx + x;
     442           0 :     from_565(load(tail, ptr), &r,&g,&b);
     443           0 :     a = 1.0f;
     444             : }
     445           0 : STAGE_CTX(store_565, uint16_t**) {
     446           0 :     auto ptr = *ctx + x;
     447           0 :     store(tail, SkNx_cast<uint16_t>( SkNf_round(r, SK_R16_MASK) << SK_R16_SHIFT
     448           0 :                                    | SkNf_round(g, SK_G16_MASK) << SK_G16_SHIFT
     449           0 :                                    | SkNf_round(b, SK_B16_MASK) << SK_B16_SHIFT), ptr);
     450             : }
     451             : 
     452           0 : STAGE_CTX(load_4444, const uint16_t**) {
     453           0 :     auto ptr = *ctx + x;
     454           0 :     from_4444(load(tail, ptr), &r,&g,&b,&a);
     455             : }
     456           0 : STAGE_CTX(store_4444, uint16_t**) {
     457           0 :     auto ptr = *ctx + x;
     458           0 :     store(tail, SkNx_cast<uint16_t>( SkNf_round(r, 0xF) << SK_R4444_SHIFT
     459           0 :                                    | SkNf_round(g, 0xF) << SK_G4444_SHIFT
     460           0 :                                    | SkNf_round(b, 0xF) << SK_B4444_SHIFT
     461           0 :                                    | SkNf_round(a, 0xF) << SK_A4444_SHIFT), ptr);
     462             : }
     463             : 
     464           0 : STAGE_CTX(load_f16, const uint64_t**) {
     465           0 :     auto ptr = *ctx + x;
     466             : 
     467           0 :     const void* src = ptr;
     468             :     SkNx<N, uint64_t> px;
     469           0 :     if (tail) {
     470           0 :         px = load(tail, ptr);
     471           0 :         src = &px;
     472             :     }
     473           0 :     from_f16(src, &r, &g, &b, &a);
     474             : }
     475           0 : STAGE_CTX(store_f16, uint64_t**) {
     476           0 :     auto ptr = *ctx + x;
     477             : 
     478             :     SkNx<N, uint64_t> px;
     479           0 :     SkNh::Store4(tail ? (void*)&px : (void*)ptr, SkFloatToHalf_finite_ftz(r),
     480           0 :                                                  SkFloatToHalf_finite_ftz(g),
     481           0 :                                                  SkFloatToHalf_finite_ftz(b),
     482           0 :                                                  SkFloatToHalf_finite_ftz(a));
     483           0 :     if (tail) {
     484           0 :         store(tail, px, ptr);
     485             :     }
     486             : }
     487             : 
     488           0 : STAGE_CTX(load_f32, const SkPM4f**) {
     489           0 :     auto ptr = *ctx + x;
     490             : 
     491           0 :     const void* src = ptr;
     492             :     SkNx<N, SkPM4f> px;
     493           0 :     if (tail) {
     494           0 :         px = load(tail, ptr);
     495           0 :         src = &px;
     496             :     }
     497             :     SkNf::Load4(src, &r, &g, &b, &a);
     498             : }
     499           0 : STAGE_CTX(store_f32, SkPM4f**) {
     500           0 :     auto ptr = *ctx + x;
     501             : 
     502             :     SkNx<N, SkPM4f> px;
     503           0 :     SkNf::Store4(tail ? (void*)&px : (void*)ptr, r,g,b,a);
     504           0 :     if (tail) {
     505           0 :         store(tail, px, ptr);
     506             :     }
     507             : }
     508             : 
     509             : 
     510           0 : STAGE_CTX(load_8888, const uint32_t**) {
     511           0 :     auto ptr = *ctx + x;
     512           0 :     from_8888(load(tail, ptr), &r, &g, &b, &a);
     513             : }
     514           0 : STAGE_CTX(store_8888, uint32_t**) {
     515           0 :     auto byte = [](const SkNf& x, int ix) {
     516             :         // Here's a neat trick: 0x47000000 == 32768.0f, and 0x470000ff == 32768.0f + (255/256.0f).
     517           0 :         auto v = SkNf_fma(255/256.0f, x, 32768.0f);
     518           0 :         switch (ix) {
     519           0 :             case 0: return SkNi::Load(&v) & 0xff;  // R
     520           0 :             case 3: return SkNi::Load(&v) << 24;   // A
     521             :         }
     522           0 :         return (SkNi::Load(&v) & 0xff) << (8*ix);  // B or G
     523             :     };
     524             : 
     525           0 :     auto ptr = *ctx + x;
     526           0 :     store(tail, byte(r,0)|byte(g,1)|byte(b,2)|byte(a,3), (int*)ptr);
     527             : }
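                      : // Scalar sketch of byte() above: for x in [0,1], 255/256.0f*x + 32768.0f rounds
                      : // to the nearest representable float, 32768 + k/256 with k ~= round(255*x), so
                      : // the low 8 bits of the result's bit pattern are exactly that rounded byte:
                      : //
                      : //    float f = 255/256.0f * x + 32768.0f;
                      : //    uint32_t bits;  memcpy(&bits, &f, sizeof(bits));
                      : //    uint8_t byte = bits & 0xff;                   // ~= round(255*x)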
     528             : 
     529           0 : STAGE_CTX(load_u16_be, const uint64_t**) {
     530           0 :     auto ptr = *ctx + x;
     531           0 :     const void* src = ptr;
     532             :     SkNx<N, uint64_t> px;
     533           0 :     if (tail) {
     534           0 :         px = load(tail, ptr);
     535           0 :         src = &px;
     536             :     }
     537             : 
     538             :     SkNh rh, gh, bh, ah;
     539             :     SkNh::Load4(src, &rh, &gh, &bh, &ah);
     540           0 :     r = (1.0f / 65535.0f) * SkNx_cast<float>((rh << 8) | (rh >> 8));
     541           0 :     g = (1.0f / 65535.0f) * SkNx_cast<float>((gh << 8) | (gh >> 8));
     542           0 :     b = (1.0f / 65535.0f) * SkNx_cast<float>((bh << 8) | (bh >> 8));
     543           0 :     a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
     544             : }
     545             : 
     546           0 : STAGE_CTX(load_rgb_u16_be, const uint16_t**) {
     547           0 :     auto ptr = *ctx + 3*x;
     548           0 :     const void* src = ptr;
     549           0 :     uint16_t buf[N*3] = {0};
     550           0 :     if (tail) {
     551           0 :         memcpy(buf, src, tail*3*sizeof(uint16_t));
     552           0 :         src = buf;
     553             :     }
     554             : 
     555             :     SkNh rh, gh, bh;
     556             :     SkNh::Load3(src, &rh, &gh, &bh);
     557           0 :     r = (1.0f / 65535.0f) * SkNx_cast<float>((rh << 8) | (rh >> 8));
     558           0 :     g = (1.0f / 65535.0f) * SkNx_cast<float>((gh << 8) | (gh >> 8));
     559           0 :     b = (1.0f / 65535.0f) * SkNx_cast<float>((bh << 8) | (bh >> 8));
     560           0 :     a = 1.0f;
     561             : }
     562             : 
     563           0 : STAGE_CTX(store_u16_be, uint64_t**) {
     564           0 :     auto to_u16_be = [](const SkNf& x) {
     565           0 :         SkNh x16 = SkNx_cast<uint16_t>(65535.0f * x);
     566           0 :         return (x16 << 8) | (x16 >> 8);
     567             :     };
     568             : 
     569           0 :     auto ptr = *ctx + x;
     570             :     SkNx<N, uint64_t> px;
     571           0 :     SkNh::Store4(tail ? (void*)&px : (void*)ptr, to_u16_be(r),
     572           0 :                                                  to_u16_be(g),
     573           0 :                                                  to_u16_be(b),
     574           0 :                                                  to_u16_be(a));
     575           0 :     if (tail) {
     576           0 :         store(tail, px, ptr);
     577             :     }
     578             : }
     579             : 
     580           0 : STAGE_CTX(load_tables, const LoadTablesContext*) {
     581           0 :     auto ptr = (const uint32_t*)ctx->fSrc + x;
     582             : 
     583           0 :     SkNu rgba = load(tail, ptr);
     584           0 :     auto to_int = [](const SkNu& v) { return SkNi::Load(&v); };
     585           0 :     r = gather(tail, ctx->fR, to_int((rgba >>  0) & 0xff));
     586           0 :     g = gather(tail, ctx->fG, to_int((rgba >>  8) & 0xff));
     587           0 :     b = gather(tail, ctx->fB, to_int((rgba >> 16) & 0xff));
     588           0 :     a = SkNf_from_byte(rgba >> 24);
     589             : }
     590             : 
     591           0 : STAGE_CTX(load_tables_u16_be, const LoadTablesContext*) {
     592           0 :     auto ptr = (const uint64_t*)ctx->fSrc + x;
     593           0 :     const void* src = ptr;
     594             :     SkNx<N, uint64_t> px;
     595           0 :     if (tail) {
     596           0 :         px = load(tail, ptr);
     597           0 :         src = &px;
     598             :     }
     599             : 
     600             :     SkNh rh, gh, bh, ah;
     601             :     SkNh::Load4(src, &rh, &gh, &bh, &ah);
     602             : 
     603             :     // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component.
     604           0 :     r = gather(tail, ctx->fR, SkNx_cast<int>(rh & 0xff));
     605           0 :     g = gather(tail, ctx->fG, SkNx_cast<int>(gh & 0xff));
     606           0 :     b = gather(tail, ctx->fB, SkNx_cast<int>(bh & 0xff));
     607           0 :     a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
     608             : }
     609             : 
     610           0 : STAGE_CTX(load_tables_rgb_u16_be, const LoadTablesContext*) {
     611           0 :     auto ptr = (const uint16_t*)ctx->fSrc + 3*x;
     612           0 :     const void* src = ptr;
     613           0 :     uint16_t buf[N*3] = {0};
     614           0 :     if (tail) {
     615           0 :         memcpy(buf, src, tail*3*sizeof(uint16_t));
     616           0 :         src = buf;
     617             :     }
     618             : 
     619             :     SkNh rh, gh, bh;
     620             :     SkNh::Load3(src, &rh, &gh, &bh);
     621             : 
     622             :     // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component.
     623           0 :     r = gather(tail, ctx->fR, SkNx_cast<int>(rh & 0xff));
     624           0 :     g = gather(tail, ctx->fG, SkNx_cast<int>(gh & 0xff));
     625           0 :     b = gather(tail, ctx->fB, SkNx_cast<int>(bh & 0xff));
     626           0 :     a = 1.0f;
     627             : }
     628             : 
     629           0 : SI SkNf inv(const SkNf& x) { return 1.0f - x; }
     630             : 
     631           0 : RGBA_XFERMODE(clear)    { return 0.0f; }
     632           0 : RGBA_XFERMODE(srcatop)  { return s*da + d*inv(sa); }
     633           0 : RGBA_XFERMODE(srcin)    { return s * da; }
     634           0 : RGBA_XFERMODE(srcout)   { return s * inv(da); }
     635           0 : RGBA_XFERMODE(srcover)  { return SkNf_fma(d, inv(sa), s); }
     636           0 : RGBA_XFERMODE(dstatop)  { return srcatop_kernel(d,da,s,sa); }
     637           0 : RGBA_XFERMODE(dstin)    { return srcin_kernel  (d,da,s,sa); }
     638           0 : RGBA_XFERMODE(dstout)   { return srcout_kernel (d,da,s,sa); }
     639           0 : RGBA_XFERMODE(dstover)  { return srcover_kernel(d,da,s,sa); }
     640             : 
     641           0 : RGBA_XFERMODE(modulate) { return s*d; }
     642           0 : RGBA_XFERMODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
     643           0 : RGBA_XFERMODE(plus_)    { return s + d; }
     644           0 : RGBA_XFERMODE(screen)   { return s + d - s*d; }
     645           0 : RGBA_XFERMODE(xor_)     { return s*inv(da) + d*inv(sa); }
     646             : 
     647           0 : RGB_XFERMODE(colorburn) {
     648           0 :     return (d == da  ).thenElse(d + s*inv(da),
     649           0 :            (s == 0.0f).thenElse(s + d*inv(sa),
     650           0 :                                 sa*(da - SkNf::Min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
     651             : }
     652           0 : RGB_XFERMODE(colordodge) {
     653           0 :     return (d == 0.0f).thenElse(d + s*inv(da),
     654           0 :            (s == sa  ).thenElse(s + d*inv(sa),
     655           0 :                                 sa*SkNf::Min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
     656             : }
     657           0 : RGB_XFERMODE(darken)     { return s + d - SkNf::Max(s*da, d*sa); }
     658           0 : RGB_XFERMODE(difference) { return s + d - 2.0f*SkNf::Min(s*da,d*sa); }
     659           0 : RGB_XFERMODE(exclusion)  { return s + d - 2.0f*s*d; }
     660           0 : RGB_XFERMODE(hardlight) {
     661           0 :     return s*inv(da) + d*inv(sa)
     662           0 :          + (2.0f*s <= sa).thenElse(2.0f*s*d, sa*da - 2.0f*(da-d)*(sa-s));
     663             : }
     664           0 : RGB_XFERMODE(lighten) { return s + d - SkNf::Min(s*da, d*sa); }
     665           0 : RGB_XFERMODE(overlay) { return hardlight_kernel(d,da,s,sa); }
     666           0 : RGB_XFERMODE(softlight) {
     667           0 :     SkNf m  = (da > 0.0f).thenElse(d / da, 0.0f),
     668           0 :          s2 = 2.0f*s,
     669           0 :          m4 = 4.0f*m;
     670             : 
     671             :     // The logic forks three ways:
     672             :     //    1. dark src?
     673             :     //    2. light src, dark dst?
     674             :     //    3. light src, light dst?
     675           0 :     SkNf darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
     676           0 :          darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
     677           0 :          liteDst = m.rsqrt().invert() - m,            // Used in case 3.
     678           0 :          liteSrc = d*sa + da*(s2 - sa) * (4.0f*d <= da).thenElse(darkDst, liteDst);  // 2 or 3?
     679           0 :     return s*inv(da) + d*inv(sa) + (s2 <= sa).thenElse(darkSrc, liteSrc);  // 1 or (2 or 3)?
     680             : }
     681             : 
     682           0 : STAGE(luminance_to_alpha) {
     683           0 :     a = SK_LUM_COEFF_R*r + SK_LUM_COEFF_G*g + SK_LUM_COEFF_B*b;
     684           0 :     r = g = b = 0;
     685             : }
     686             : 
     687           0 : STAGE(rgb_to_hsl) {
     688           0 :     auto max = SkNf::Max(SkNf::Max(r, g), b);
     689           0 :     auto min = SkNf::Min(SkNf::Min(r, g), b);
     690           0 :     auto l = 0.5f * (max + min);
     691             : 
     692           0 :     auto d = max - min;
     693           0 :     auto d_inv = 1.0f/d;
     694           0 :     auto s = (max == min).thenElse(0.0f,
     695           0 :         d/(l > 0.5f).thenElse(2.0f - max - min, max + min));
     696           0 :     SkNf h = (max != r).thenElse(0.0f,
     697           0 :         (g - b)*d_inv + (g < b).thenElse(6.0f, 0.0f));
     698           0 :     h = (max == g).thenElse((b - r)*d_inv + 2.0f, h);
     699           0 :     h = (max == b).thenElse((r - g)*d_inv + 4.0f, h);
     700             :     h *= (1/6.0f);
     701             : 
     702           0 :     h = (max == min).thenElse(0.0f, h);
     703             : 
     704           0 :     r = h;
     705           0 :     g = s;
     706           0 :     b = l;
     707             : }
     708             : 
     709           0 : STAGE(hsl_to_rgb) {
     710           0 :     auto h = r;
     711           0 :     auto s = g;
     712           0 :     auto l = b;
     713           0 :     auto q = (l < 0.5f).thenElse(l*(1.0f + s), l + s - l*s);
     714           0 :     auto p = 2.0f*l - q;
     715             : 
     716           0 :     auto hue_to_rgb = [](const SkNf& p, const SkNf& q, const SkNf& t) {
     717           0 :         auto t2 = (t < 0.0f).thenElse(t + 1.0f, (t > 1.0f).thenElse(t - 1.0f, t));
     718           0 :         return (t2 < (1/6.0f)).thenElse(
     719           0 :             p + (q - p)*6.0f*t, (t2 < (3/6.0f)).thenElse(
     720           0 :                 q, (t2 < (4/6.0f)).thenElse(
     721           0 :                     p + (q - p)*((4/6.0f) - t2)*6.0f, p)));
     722             :     };
     723             : 
     724           0 :     r = (s == 0.f).thenElse(l, hue_to_rgb(p, q, h + (1/3.0f)));
     725           0 :     g = (s == 0.f).thenElse(l, hue_to_rgb(p, q, h));
     726           0 :     b = (s == 0.f).thenElse(l, hue_to_rgb(p, q, h - (1/3.0f)));
     727             : }
     728             : 
     729           0 : STAGE_CTX(matrix_2x3, const float*) {
     730           0 :     auto m = ctx;
     731             : 
     732           0 :     auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[2], m[4])),
     733           0 :          G = SkNf_fma(r,m[1], SkNf_fma(g,m[3], m[5]));
     734           0 :     r = R;
     735           0 :     g = G;
     736             : }
     737           0 : STAGE_CTX(matrix_3x4, const float*) {
     738           0 :     auto m = ctx;
     739             : 
     740           0 :     auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[3], SkNf_fma(b,m[6], m[ 9]))),
     741           0 :          G = SkNf_fma(r,m[1], SkNf_fma(g,m[4], SkNf_fma(b,m[7], m[10]))),
     742           0 :          B = SkNf_fma(r,m[2], SkNf_fma(g,m[5], SkNf_fma(b,m[8], m[11])));
     743           0 :     r = R;
     744           0 :     g = G;
     745           0 :     b = B;
     746             : }
     747           0 : STAGE_CTX(matrix_4x5, const float*) {
     748           0 :     auto m = ctx;
     749             : 
     750           0 :     auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[4], SkNf_fma(b,m[ 8], SkNf_fma(a,m[12], m[16])))),
     751           0 :          G = SkNf_fma(r,m[1], SkNf_fma(g,m[5], SkNf_fma(b,m[ 9], SkNf_fma(a,m[13], m[17])))),
     752           0 :          B = SkNf_fma(r,m[2], SkNf_fma(g,m[6], SkNf_fma(b,m[10], SkNf_fma(a,m[14], m[18])))),
     753           0 :          A = SkNf_fma(r,m[3], SkNf_fma(g,m[7], SkNf_fma(b,m[11], SkNf_fma(a,m[15], m[19]))));
     754           0 :     r = R;
     755           0 :     g = G;
     756           0 :     b = B;
     757           0 :     a = A;
     758             : }
     759           0 : STAGE_CTX(matrix_perspective, const float*) {
     760             :     // N.B. unlike the matrix_NxM stages, this takes a row-major matrix.
     761           0 :     auto m = ctx;
     762             : 
     763           0 :     auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[1], m[2])),
     764           0 :          G = SkNf_fma(r,m[3], SkNf_fma(g,m[4], m[5])),
     765           0 :          Z = SkNf_fma(r,m[6], SkNf_fma(g,m[7], m[8]));
     766           0 :     r = R * Z.invert();
     767           0 :     g = G * Z.invert();
     768             : }
     769             : 
     770           0 : SI SkNf parametric(const SkNf& v, const SkColorSpaceTransferFn& p) {
     771             :     float result[N];   // Unconstrained powf() doesn't vectorize well...
     772           0 :     for (int i = 0; i < N; i++) {
     773           0 :         float s = v[i];
     774           0 :         result[i] = (s <= p.fD) ? p.fC * s + p.fF
     775           0 :                                 : powf(s * p.fA + p.fB, p.fG) + p.fE;
     776             :     }
     777             :     // Clamp the output to [0, 1].
     778             :     // Max(NaN, 0) = 0, but Max(0, NaN) = NaN, so we want this exact order to ensure NaN => 0
     779           0 :     return SkNf::Min(SkNf::Max(SkNf::Load(result), 0.0f), 1.0f);
     780             : }
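                      : // (With SSE max/min semantics, Max(a, b) yields b when a lane of a is NaN, so a
                      : //  NaN from powf() of a negative base becomes 0 before the Min above.)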
     781           0 : STAGE_CTX(parametric_r, const SkColorSpaceTransferFn*) { r = parametric(r, *ctx); }
     782           0 : STAGE_CTX(parametric_g, const SkColorSpaceTransferFn*) { g = parametric(g, *ctx); }
     783           0 : STAGE_CTX(parametric_b, const SkColorSpaceTransferFn*) { b = parametric(b, *ctx); }
     784           0 : STAGE_CTX(parametric_a, const SkColorSpaceTransferFn*) { a = parametric(a, *ctx); }
     785             : 
     786           0 : SI SkNf table(const SkNf& v, const SkTableTransferFn& table) {
     787             :     float result[N];
     788           0 :     for (int i = 0; i < N; i++) {
     789           0 :         result[i] = interp_lut(v[i], table.fData, table.fSize);
     790             :     }
     791             :     // no need to clamp - tables are by-design [0,1] -> [0,1]
     792           0 :     return SkNf::Load(result);
     793             : }
     794           0 : STAGE_CTX(table_r, const SkTableTransferFn*) { r = table(r, *ctx); }
     795           0 : STAGE_CTX(table_g, const SkTableTransferFn*) { g = table(g, *ctx); }
     796           0 : STAGE_CTX(table_b, const SkTableTransferFn*) { b = table(b, *ctx); }
     797           0 : STAGE_CTX(table_a, const SkTableTransferFn*) { a = table(a, *ctx); }
     798             : 
     799           0 : STAGE_CTX(color_lookup_table, const SkColorLookUpTable*) {
     800           0 :     const SkColorLookUpTable* colorLUT = ctx;
     801           0 :     SkASSERT(3 == colorLUT->inputChannels() || 4 == colorLUT->inputChannels());
     802           0 :     SkASSERT(3 == colorLUT->outputChannels());
     803             :     float result[3][N];
     804           0 :     for (int i = 0; i < N; ++i) {
     805           0 :         const float in[4] = { r[i], g[i], b[i], a[i] };
     806             :         float out[3];
     807           0 :         colorLUT->interp(out, in);
     808           0 :         for (int j = 0; j < colorLUT->outputChannels(); ++j) {
     809           0 :             result[j][i] = out[j];
     810             :         }
     811             :     }
     812           0 :     r = SkNf::Load(result[0]);
     813           0 :     g = SkNf::Load(result[1]);
     814           0 :     b = SkNf::Load(result[2]);
     815           0 :     if (4 == colorLUT->inputChannels()) {
     816             :         // we must set the pixel to opaque, as the alpha channel was used
     817             :         // as input before this.
     818           0 :         a = 1.f;
     819             :     }
     820             : }
     821             : 
     822           0 : STAGE(lab_to_xyz) {
     823           0 :     const auto lab_l = r * 100.0f;
     824           0 :     const auto lab_a = g * 255.0f - 128.0f;
     825           0 :     const auto lab_b = b * 255.0f - 128.0f;
     826           0 :     auto Y = (lab_l + 16.0f) * (1/116.0f);
     827           0 :     auto X = lab_a * (1/500.0f) + Y;
     828           0 :     auto Z = Y - (lab_b * (1/200.0f));
     829             : 
     830           0 :     const auto X3 = X*X*X;
     831           0 :     X = (X3 > 0.008856f).thenElse(X3, (X - (16/116.0f)) * (1/7.787f));
     832           0 :     const auto Y3 = Y*Y*Y;
     833           0 :     Y = (Y3 > 0.008856f).thenElse(Y3, (Y - (16/116.0f)) * (1/7.787f));
     834           0 :     const auto Z3 = Z*Z*Z;
     835           0 :     Z = (Z3 > 0.008856f).thenElse(Z3, (Z - (16/116.0f)) * (1/7.787f));
     836             : 
     837             :     // adjust to D50 illuminant
     838             :     X *= 0.96422f;
     839             :     Y *= 1.00000f;
     840             :     Z *= 0.82521f;
     841             : 
     842           0 :     r = X;
     843           0 :     g = Y;
     844           0 :     b = Z;
     845             : }
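                      : // Sanity check: Lab white (L=100, a=b=0) arrives here as r=1, g=b=128/255, which
                      : // gives X=Y=Z=1 before the illuminant scaling, i.e. exactly the D50 white point
                      : // (0.96422, 1.00000, 0.82521).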
     846             : 
     847           0 : SI SkNf assert_in_tile(const SkNf& v, float limit) {
     848           0 :     for (int i = 0; i < N; i++) {
     849           0 :         SkASSERT(0 <= v[i] && v[i] < limit);
     850             :     }
     851           0 :     return v;
     852             : }
     853             : 
     854           0 : SI SkNf ulp_before(float v) {
     855           0 :     SkASSERT(v > 0);
     856             :     SkNf vs(v);
     857           0 :     SkNu uvs = SkNu::Load(&vs) - 1;
     858           0 :     return SkNf::Load(&uvs);
     859             : }
     860             : 
     861           0 : SI SkNf clamp(const SkNf& v, float limit) {
     862           0 :     SkNf result = SkNf::Max(0, SkNf::Min(v, ulp_before(limit)));
     863           0 :     return assert_in_tile(result, limit);
     864             : }
     865           0 : SI SkNf repeat(const SkNf& v, float limit) {
     866           0 :     SkNf result = v - (v/limit).floor()*limit;
     867             :     // For small negative v, (v/limit).floor()*limit can dominate v in the subtraction,
     868             :     // which leaves result == limit.  We want result < limit, so clamp it one ULP.
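                      :     // E.g. with limit == 100.0f and v == -1e-6f, floor(v/limit) == -1, so
                      :     // v - (-1)*100 == 100 - 1e-6, which rounds to exactly 100.0f in float.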
     869           0 :     result = SkNf::Min(result, ulp_before(limit));
     870           0 :     return assert_in_tile(result, limit);
     871             : }
     872           0 : SI SkNf mirror(const SkNf& v, float l/*imit*/) {
     873           0 :     SkNf result = ((v - l) - ((v - l) / (2*l)).floor()*(2*l) - l).abs();
     874             :     // Same deal as repeat.
     875           0 :     result = SkNf::Min(result, ulp_before(l));
     876           0 :     return assert_in_tile(result, l);
     877             : }
     878           0 : STAGE_CTX( clamp_x, const float*) { r = clamp (r, *ctx); }
     879           0 : STAGE_CTX(repeat_x, const float*) { r = repeat(r, *ctx); }
     880           0 : STAGE_CTX(mirror_x, const float*) { r = mirror(r, *ctx); }
     881           0 : STAGE_CTX( clamp_y, const float*) { g = clamp (g, *ctx); }
     882           0 : STAGE_CTX(repeat_y, const float*) { g = repeat(g, *ctx); }
     883           0 : STAGE_CTX(mirror_y, const float*) { g = mirror(g, *ctx); }
     884             : 
     885           0 : STAGE_CTX(save_xy, SkJumper_SamplerCtx*) {
     886           0 :     r.store(ctx->x);
     887           0 :     g.store(ctx->y);
     888             : 
     889             :     // Whether bilinear or bicubic, all sample points have the same fractional offset (fx,fy).
     890             :     // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
     891             :     // surrounding (x,y), all (0.5,0.5) off-center.
     892           0 :     auto fract = [](const SkNf& v) { return v - v.floor(); };
     893           0 :     fract(r + 0.5f).store(ctx->fx);
     894           0 :     fract(g + 0.5f).store(ctx->fy);
     895             : }
     896             : 
     897           0 : STAGE_CTX(accumulate, const SkJumper_SamplerCtx*) {
     898             :     // Bilinear and bicubic filtering are both separable, so we'll end up with independent
     899             :     // scale contributions in x and y that we multiply together to get each pixel's scale factor.
     900           0 :     auto scale = SkNf::Load(ctx->scalex) * SkNf::Load(ctx->scaley);
     901           0 :     dr = SkNf_fma(scale, r, dr);
     902           0 :     dg = SkNf_fma(scale, g, dg);
     903           0 :     db = SkNf_fma(scale, b, db);
     904           0 :     da = SkNf_fma(scale, a, da);
     905             : }
     906             : 
     907             : // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
     908             : // are combined in direct proportion to their area overlapping that logical query pixel.
     909             : // At positive offsets, the x-axis contribution to that rectangular area is fx; (1-fx)
     910             : // at negative x offsets.  The y-axis is treated symmetrically.
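                      : // E.g. the four logical weights are (1-fx)(1-fy), fx(1-fy), (1-fx)fy, and fx*fy,
                      : // which sum to 1, so the four accumulate() passes reconstitute full coverage.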
     911             : template <int Scale>
     912           0 : SI void bilinear_x(SkJumper_SamplerCtx* ctx, SkNf* x) {
     913           0 :     *x = SkNf::Load(ctx->x) + Scale*0.5f;
     914           0 :     auto fx = SkNf::Load(ctx->fx);
     915           0 :     (Scale > 0 ? fx : (1.0f - fx)).store(ctx->scalex);
     916           0 : }
     917             : template <int Scale>
     918           0 : SI void bilinear_y(SkJumper_SamplerCtx* ctx, SkNf* y) {
     919           0 :     *y = SkNf::Load(ctx->y) + Scale*0.5f;
     920           0 :     auto fy = SkNf::Load(ctx->fy);
     921           0 :     (Scale > 0 ? fy : (1.0f - fy)).store(ctx->scaley);
     922           0 : }
     923           0 : STAGE_CTX(bilinear_nx, SkJumper_SamplerCtx*) { bilinear_x<-1>(ctx, &r); }
     924           0 : STAGE_CTX(bilinear_px, SkJumper_SamplerCtx*) { bilinear_x<+1>(ctx, &r); }
     925           0 : STAGE_CTX(bilinear_ny, SkJumper_SamplerCtx*) { bilinear_y<-1>(ctx, &g); }
     926           0 : STAGE_CTX(bilinear_py, SkJumper_SamplerCtx*) { bilinear_y<+1>(ctx, &g); }
     927             : 
     928             : 
      929             : // In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
     930             : // pixel center are combined with a non-uniform cubic filter, with high filter values near
     931             : // the center and lower values farther away.
     932             : //
     933             : // We break this filter function into two parts, one for near +/- 0.5 offsets,
     934             : // and one for far +/- 1.5 offsets.
     935             : //
     936             : // See GrBicubicEffect for details about this particular Mitchell-Netravali filter.
     937           0 : SI SkNf bicubic_near(const SkNf& t) {
     938             :     // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
     939           0 :     return SkNf_fma(t, SkNf_fma(t, SkNf_fma(-21/18.0f, t, 27/18.0f), 9/18.0f), 1/18.0f);
     940             : }
     941           0 : SI SkNf bicubic_far(const SkNf& t) {
     942             :     // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
     943           0 :     return (t*t)*SkNf_fma(7/18.0f, t, -6/18.0f);
     944             : }
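                      : // Sanity check: for any t in [0,1] the four per-axis weights sum to 1:
                      : //    bicubic_far(1-t) + bicubic_near(1-t) + bicubic_near(t) + bicubic_far(t) == 1.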
     945             : 
     946             : template <int Scale>
     947           0 : SI void bicubic_x(SkJumper_SamplerCtx* ctx, SkNf* x) {
     948           0 :     *x = SkNf::Load(ctx->x) + Scale*0.5f;
     949           0 :     auto fx = SkNf::Load(ctx->fx);
     950           0 :     if (Scale == -3) { return bicubic_far (1.0f - fx).store(ctx->scalex); }
     951           0 :     if (Scale == -1) { return bicubic_near(1.0f - fx).store(ctx->scalex); }
     952           0 :     if (Scale == +1) { return bicubic_near(       fx).store(ctx->scalex); }
     953           0 :     if (Scale == +3) { return bicubic_far (       fx).store(ctx->scalex); }
     954             :     SkDEBUGFAIL("unreachable");
     955             : }
     956             : template <int Scale>
     957           0 : SI void bicubic_y(SkJumper_SamplerCtx* ctx, SkNf* y) {
     958           0 :     *y = SkNf::Load(ctx->y) + Scale*0.5f;
     959           0 :     auto fy = SkNf::Load(ctx->fy);
     960           0 :     if (Scale == -3) { return bicubic_far (1.0f - fy).store(ctx->scaley); }
     961           0 :     if (Scale == -1) { return bicubic_near(1.0f - fy).store(ctx->scaley); }
     962           0 :     if (Scale == +1) { return bicubic_near(       fy).store(ctx->scaley); }
     963           0 :     if (Scale == +3) { return bicubic_far (       fy).store(ctx->scaley); }
     964             :     SkDEBUGFAIL("unreachable");
     965             : }
     966           0 : STAGE_CTX(bicubic_n3x, SkJumper_SamplerCtx*) { bicubic_x<-3>(ctx, &r); }
     967           0 : STAGE_CTX(bicubic_n1x, SkJumper_SamplerCtx*) { bicubic_x<-1>(ctx, &r); }
     968           0 : STAGE_CTX(bicubic_p1x, SkJumper_SamplerCtx*) { bicubic_x<+1>(ctx, &r); }
     969           0 : STAGE_CTX(bicubic_p3x, SkJumper_SamplerCtx*) { bicubic_x<+3>(ctx, &r); }
     970             : 
     971           0 : STAGE_CTX(bicubic_n3y, SkJumper_SamplerCtx*) { bicubic_y<-3>(ctx, &g); }
     972           0 : STAGE_CTX(bicubic_n1y, SkJumper_SamplerCtx*) { bicubic_y<-1>(ctx, &g); }
     973           0 : STAGE_CTX(bicubic_p1y, SkJumper_SamplerCtx*) { bicubic_y<+1>(ctx, &g); }
     974           0 : STAGE_CTX(bicubic_p3y, SkJumper_SamplerCtx*) { bicubic_y<+3>(ctx, &g); }
     975             : 
     976             : 
     977             : template <typename T>
     978           0 : SI SkNi offset_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, const SkNf& x, const SkNf& y) {
     979           0 :     SkNi ix = SkNx_cast<int>(x),
     980           0 :          iy = SkNx_cast<int>(y);
     981           0 :     SkNi offset = iy*ctx->stride + ix;
     982             : 
     983           0 :     *ptr = (const T*)ctx->pixels;
     984           0 :     return offset;
     985             : }
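
In other words, each lane's (x,y) is truncated to integers and flattened into
a row-major index, with ctx->stride measured in pixels; the gather stages
below then read that index off ctx->pixels lane by lane.  One lane's address
computation in scalar form (a sketch with hypothetical parameters):

    // stride is in pixels, not bytes, so the base pointer is cast to T* first.
    template <typename T>
    static const T* lane_ptr(const void* pixels, int stride, float x, float y) {
        int ix = (int)x,   // SkNx_cast<int> truncates toward zero
            iy = (int)y;
        return (const T*)pixels + iy*stride + ix;
    }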
     986             : 
     987           0 : STAGE_CTX(gather_a8, const SkJumper_GatherCtx*) {
     988             :     const uint8_t* p;
     989           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
     990             : 
     991           0 :     r = g = b = 0.0f;
     992           0 :     a = SkNf_from_byte(gather(tail, p, offset));
     993             : }
     994           0 : STAGE_CTX(gather_i8, const SkJumper_GatherCtx*) {
     995             :     const uint8_t* p;
     996           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
     997             : 
     998           0 :     SkNi ix = SkNx_cast<int>(gather(tail, p, offset));
     999           0 :     from_8888(gather(tail, ctx->ctable, ix), &r, &g, &b, &a);
    1000             : }
    1001           0 : STAGE_CTX(gather_g8, const SkJumper_GatherCtx*) {
    1002             :     const uint8_t* p;
    1003           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
    1004             : 
    1005           0 :     r = g = b = SkNf_from_byte(gather(tail, p, offset));
    1006           0 :     a = 1.0f;
    1007             : }
    1008           0 : STAGE_CTX(gather_565, const SkJumper_GatherCtx*) {
    1009             :     const uint16_t* p;
    1010           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
    1011             : 
    1012           0 :     from_565(gather(tail, p, offset), &r, &g, &b);
    1013           0 :     a = 1.0f;
    1014             : }
    1015           0 : STAGE_CTX(gather_4444, const SkJumper_GatherCtx*) {
    1016             :     const uint16_t* p;
    1017           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
    1018             : 
    1019           0 :     from_4444(gather(tail, p, offset), &r, &g, &b, &a);
    1020             : }
    1021           0 : STAGE_CTX(gather_8888, const SkJumper_GatherCtx*) {
    1022             :     const uint32_t* p;
    1023           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
    1024             : 
    1025           0 :     from_8888(gather(tail, p, offset), &r, &g, &b, &a);
    1026             : }
    1027           0 : STAGE_CTX(gather_f16, const SkJumper_GatherCtx*) {
    1028             :     const uint64_t* p;
    1029           0 :     SkNi offset = offset_and_ptr(&p, ctx, r, g);
    1030             : 
    1031           0 :     auto px = gather(tail, p, offset);
    1032           0 :     from_f16(&px, &r, &g, &b, &a);
    1033             : }
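
All of these stages share one shape: gather N pixels by index, then decode the
pixel format into float channels.  A scalar model of the 8888 case (a sketch;
it assumes the byte order with red in the low byte, as in Skia's RGBA_8888):

    #include <cstdint>
    static void gather_8888_lane(const uint32_t* pixels, int32_t offset,
                                 float* r, float* g, float* b, float* a) {
        uint32_t px = pixels[offset];            // the indexed load ("gather")
        *r = ((px >>  0) & 0xff) * (1/255.0f);   // from_8888: unpack, scale to [0,1]
        *g = ((px >>  8) & 0xff) * (1/255.0f);
        *b = ((px >> 16) & 0xff) * (1/255.0f);
        *a = ((px >> 24) & 0xff) * (1/255.0f);
    }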
    1034             : 
    1035           0 : STAGE_CTX(linear_gradient, const SkPM4f*) {
    1036             :     struct Stop { float pos; float f[4], b[4]; };
    1037             :     struct Ctx { size_t n; Stop *stops; float start[4]; };
    1038             : 
    1039           0 :     auto c = (const Ctx*)ctx;
    1040             :     SkNf fr = 0, fg = 0, fb = 0, fa = 0;
    1041           0 :     SkNf br = c->start[0],
    1042           0 :          bg = c->start[1],
    1043           0 :          bb = c->start[2],
    1044           0 :          ba = c->start[3];
    1045           0 :     auto t = r;
    1046           0 :     for (size_t i = 0; i < c->n; i++) {
    1047           0 :         fr = (t < c->stops[i].pos).thenElse(fr, c->stops[i].f[0]);
    1048           0 :         fg = (t < c->stops[i].pos).thenElse(fg, c->stops[i].f[1]);
    1049           0 :         fb = (t < c->stops[i].pos).thenElse(fb, c->stops[i].f[2]);
    1050           0 :         fa = (t < c->stops[i].pos).thenElse(fa, c->stops[i].f[3]);
    1051           0 :         br = (t < c->stops[i].pos).thenElse(br, c->stops[i].b[0]);
    1052           0 :         bg = (t < c->stops[i].pos).thenElse(bg, c->stops[i].b[1]);
    1053           0 :         bb = (t < c->stops[i].pos).thenElse(bb, c->stops[i].b[2]);
    1054           0 :         ba = (t < c->stops[i].pos).thenElse(ba, c->stops[i].b[3]);
    1055             :     }
    1056             : 
    1057           0 :     r = SkNf_fma(t, fr, br);
    1058           0 :     g = SkNf_fma(t, fg, bg);
    1059           0 :     b = SkNf_fma(t, fb, bb);
    1060           0 :     a = SkNf_fma(t, fa, ba);
    1061             : }
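
The loop is a branch-free search over sorted stops: every stop whose position
is not past t overwrites the per-lane coefficients, so the last such stop
wins, and each channel is then evaluated as f*t + b.  The same logic for one
channel in scalar form (a sketch; as in the stage, f starts at 0 and b at the
start color, so any t before the first stop yields a constant):

    struct GradStop { float pos, f, b; };   // hypothetical: slope f, intercept b per segment
    static float eval_channel(const GradStop* stops, int n, float start, float t) {
        float f = 0, b = start;
        for (int i = 0; i < n; i++) {
            if (!(t < stops[i].pos)) {       // same predicate as the thenElse above
                f = stops[i].f;
                b = stops[i].b;
            }
        }
        return f*t + b;                      // SkNf_fma(t, f, b)
    }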
    1062             : 
    1063           0 : STAGE_CTX(linear_gradient_2stops, const SkPM4f*) {
    1064           0 :     auto t = r;
    1065           0 :     SkPM4f c0 = ctx[0],
    1066           0 :            dc = ctx[1];
    1067             : 
    1068           0 :     r = SkNf_fma(t, dc.r(), c0.r());
    1069           0 :     g = SkNf_fma(t, dc.g(), c0.g());
    1070           0 :     b = SkNf_fma(t, dc.b(), c0.b());
    1071           0 :     a = SkNf_fma(t, dc.a(), c0.a());
    1072             : }
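
The two-stop fast path needs no search: ctx[0] holds the start color c0 and
ctx[1] the per-unit slope dc, so at t = 0.25, for instance, the red channel is
simply c0.r() + 0.25*dc.r(), one fused multiply-add per channel.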
    1073             : 
    1074           0 : STAGE_CTX(byte_tables, const void*) {
    1075             :     struct Tables { const uint8_t *r, *g, *b, *a; };
    1076           0 :     auto tables = (const Tables*)ctx;
    1077             : 
    1078           0 :     r = SkNf_from_byte(gather(tail, tables->r, SkNf_round(255.0f, r)));
    1079           0 :     g = SkNf_from_byte(gather(tail, tables->g, SkNf_round(255.0f, g)));
    1080           0 :     b = SkNf_from_byte(gather(tail, tables->b, SkNf_round(255.0f, b)));
    1081           0 :     a = SkNf_from_byte(gather(tail, tables->a, SkNf_round(255.0f, a)));
    1082             : }
    1083             : 
    1084           0 : STAGE_CTX(byte_tables_rgb, const void*) {
    1085             :     struct Tables { const uint8_t *r, *g, *b; int n; };
    1086           0 :     auto tables = (const Tables*)ctx;
    1087             : 
    1088           0 :     float scale = tables->n - 1;
    1089           0 :     r = SkNf_from_byte(gather(tail, tables->r, SkNf_round(scale, r)));
    1090           0 :     g = SkNf_from_byte(gather(tail, tables->g, SkNf_round(scale, g)));
    1091           0 :     b = SkNf_from_byte(gather(tail, tables->b, SkNf_round(scale, b)));
    1092             : }
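
Both table stages follow the same per-channel recipe: scale the float value to
a table index, round, look it up, and rescale the byte result to [0,1].  In
scalar form (a sketch, assuming SkNf_round rounds to nearest and
SkNf_from_byte multiplies by 1/255):

    #include <cstdint>
    #include <cmath>
    static float apply_table(const uint8_t* table, int n, float v) {
        int ix = (int)std::lround(v * (n - 1));   // SkNf_round; n == 256 for byte_tables
        return table[ix] * (1/255.0f);            // SkNf_from_byte
    }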
    1093             : 
    1094           0 : STAGE_CTX(shader_adapter, SkShader::Context*) {
    1095             :     SkPM4f buf[N];
    1096             :     static_assert(sizeof(buf) == sizeof(r) + sizeof(g) + sizeof(b) + sizeof(a), "");
    1097           0 :     ctx->shadeSpan4f(x, (int)g[0], buf, N);  // g still carries the y coordinate here
    1098             :     SkNf::Load4(buf, &r, &g, &b, &a);
    1099             : }
    1100             : 
    1101           0 : SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
    1102           0 :     switch (st) {
    1103             :     #define M(stage) case SkRasterPipeline::stage: return stage;
    1104           0 :         SK_RASTER_PIPELINE_STAGES(M)
    1105             :     #undef M
    1106             :     }
    1107           0 :     SkASSERT(false);
    1108           0 :     return just_return;
    1109             : }
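
enum_to_Fn is the classic X-macro idiom: SK_RASTER_PIPELINE_STAGES(M) expands
M once per stage name, so the switch can never fall out of sync with the enum.
A self-contained miniature of the same pattern (hypothetical names, not
Skia's):

    #define MY_STAGES(M) M(load) M(blend) M(store)

    enum class Stage {
    #define M(stage) stage,
        MY_STAGES(M)
    #undef M
    };

    static const char* stage_name(Stage st) {
        switch (st) {
        #define M(stage) case Stage::stage: return #stage;
            MY_STAGES(M)
        #undef M
        }
        return "?";
    }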
    1110             : 
    1111             : namespace {
    1112             : 
    1113           0 :     static void build_program(void** program, const SkRasterPipeline::Stage* stages, int nstages) {
    1114           0 :         for (int i = 0; i < nstages; i++) {
    1115           0 :             *program++ = (void*)enum_to_Fn(stages[i].stage);
    1116           0 :             if (stages[i].ctx) {
    1117           0 :                 *program++ = stages[i].ctx;
    1118             :             }
    1119             :         }
    1120           0 :         *program++ = (void*)just_return;
    1121           0 :     }
    1122             : 
    1123           0 :     static void run_program(void** program, size_t x, size_t n) {
    1124             :         SkNf u;  // fastest to start uninitialized.
    1125             : 
    1126           0 :         auto start = (Fn)load_and_increment(&program);
    1127           0 :         while (n >= N) {
    1128           0 :             start(x*N, program, u,u,u,u, u,u,u,u);
    1129           0 :             x += N;
    1130           0 :             n -= N;
    1131             :         }
    1132           0 :         if (n) {
    1133           0 :             start(x*N+n, program, u,u,u,u, u,u,u,u);
    1134             :         }
    1135           0 :     }
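
run_program is where the x*N+tail encoding described at the top of the file is
produced: full batches pass x*N (so tail == 0), and the final partial batch
passes x*N + n with 0 < n < N.  A stage can recover both halves like so (a
sketch of the convention, with N == 4):

    #include <cstddef>
    static void decode_x_tail(size_t x_tail, size_t* x, size_t* tail) {
        *x    = x_tail / 4;   // the current x coordinate
        *tail = x_tail % 4;   // 0 means a full batch; otherwise use only `tail` pixels
    }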
    1136             : 
    1137             :     // Compiled manages its memory manually because it's not safe to use
    1138             :     // std::vector, SkTDArray, etc without setting us up for big ODR violations.
    1139             :     struct Compiled {
    1140           0 :         Compiled(const SkRasterPipeline::Stage* stages, int nstages) {
    1141           0 :             int slots = nstages + 1;  // One extra for just_return.
    1142           0 :             for (int i = 0; i < nstages; i++) {
    1143           0 :                 if (stages[i].ctx) {
    1144           0 :                     slots++;
    1145             :                 }
    1146             :             }
    1147           0 :             fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
    1148           0 :             build_program(fProgram, stages, nstages);
    1149           0 :         }
    1150           0 :         ~Compiled() { sk_free(fProgram); }
    1151             : 
    1152             :         Compiled(const Compiled& o) {
    1153             :             int slots = 0;
    1154             :             while (o.fProgram[slots++] != (void*)just_return);  // count slots, including just_return
    1155             : 
    1156             :             fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
    1157             :             memcpy(fProgram, o.fProgram, slots * sizeof(void*));
    1158             :         }
    1159             : 
    1160           0 :         void operator()(size_t x, size_t n) {
    1161           0 :             run_program(fProgram, x, n);
    1162           0 :         }
    1163             : 
    1164             :         void** fProgram;
    1165             :     };
    1166             : }  // namespace
    1167             : 
    1168             : namespace SK_OPTS_NS {
    1169             : 
    1170           0 :     SI void run_pipeline(size_t x, size_t n,
    1171             :                          const SkRasterPipeline::Stage* stages, int nstages) {
    1172             :         static const int kStackMax = 256;
    1173             :         // Worst case is nstages stages with nstages context pointers, and just_return.
    1174           0 :         if (2*nstages+1 <= kStackMax) {
    1175             :             void* program[kStackMax];
    1176           0 :             build_program(program, stages, nstages);
    1177           0 :             run_program(program, x,n);
    1178             :         } else {
    1179           0 :             Compiled{stages,nstages}(x,n);
    1180             :         }
    1181           0 :     }
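
For a sense of scale: 127 stages that each carry a context pointer need
2*127+1 = 255 slots and still fit the stack buffer, while 128 such stages need
2*128+1 = 257 > kStackMax = 256 and fall back to the heap-allocated Compiled
path.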
    1182             : 
    1183             : }  // namespace SK_OPTS_NS
    1184             : 
    1185             : #undef SI
    1186             : #undef STAGE
    1187             : #undef STAGE_CTX
    1188             : #undef RGBA_XFERMODE
    1189             : #undef RGB_XFERMODE
    1190             : 
    1191             : #endif  // SkRasterPipeline_opts_DEFINED

Generated by: LCOV version 1.13