// Every function in this file should be marked static and inline using SI.
#if defined(__clang__) || defined(__GNUC__)
    #define SI __attribute__((always_inline)) static inline
#else
    #define SI static inline
#endif

#if defined(__clang__)
    template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));
#elif defined(__GNUC__)
    // Fall back to the scalar path if this GCC cannot convert between vector types.
    #ifndef __has_builtin
        #define SKRP_CPU_SCALAR
    #elif !__has_builtin(__builtin_convertvector)
        #define SKRP_CPU_SCALAR
    #endif

    // Unfortunately, GCC does not allow us to omit the struct. This will not compile:
    //   template <int N, typename T> using Vec = T __attribute__((vector_size(N*sizeof(T))));
    template <int N, typename T> struct VecHelper {
        typedef T __attribute__((vector_size(N * sizeof(T)))) V;
    };
    template <int N, typename T> using Vec = typename VecHelper<N, T>::V;
#endif
// Notes:
// * rcp_fast and rcp_precise both produce a reciprocal, but rcp_fast is an estimate with at least
//   12 bits of precision while rcp_precise should be accurate for float size. For ARM rcp_precise
//   requires 2 Newton-Raphson refinement steps because its estimate has 8 bit precision, and for
//   Intel this requires one additional step because its estimate has 12 bit precision.
//
// * Don't call rcp_approx or rsqrt_approx directly; only use rcp_fast and rsqrt.
namespace SK_OPTS_NS { #ifdefined(SKRP_CPU_SCALAR) // This path should lead to portable scalar code. using F = float ; using I32 = int32_t; using U64 = uint64_t; using U32 = uint32_t; using U16 = uint16_t; using U8 = uint8_t ;
// Scalar versions of the basic math building blocks.
// float min/max defer to fminf/fmaxf so NaN handling matches libm.
SI F   min(F x, F y) { return fminf(x, y); }
SI I32 min(I32 x, I32 y) { return y < x ? y : x; }
SI U32 min(U32 x, U32 y) { return y < x ? y : x; }

SI F   max(F x, F y) { return fmaxf(x, y); }
SI I32 max(I32 x, I32 y) { return y > x ? y : x; }
SI U32 max(U32 x, U32 y) { return y > x ? y : x; }

// mad(f,m,a) = f*m + a; nmad(f,m,a) = a - f*m.
SI F mad (F f, F m, F a) { return f*m + a; }
SI F nmad(F f, F m, F a) { return a - f*m; }

SI F   abs_ (F v)   { return fabsf(v); }
SI I32 abs_ (I32 v) { return v < 0 ? -v : v; }

SI F floor_(F v) { return floorf(v); }
SI F ceil_ (F v) { return ceilf(v); }
// Reciprocal / square-root helpers.  In scalar mode the "approx" flavors are already
// exact, so rcp_approx and rcp_precise are the same expression here.
SI F rcp_approx(F v) { return 1.0f / v; } // use rcp_fast instead
SI F rsqrt_approx(F v) { return 1.0f / sqrtf(v); }
SI F sqrt_ (F v) { return sqrtf(v); }
SI F rcp_precise (F v) { return 1.0f / v; }

// Rounding: add 0.5 then truncate.  NOTE(review): this rounds half-up for
// non-negative inputs; negative inputs behave differently from the
// round-to-nearest conversions used on the SIMD paths -- presumably inputs
// are non-negative here.
SI I32 iround(F v) { return (I32)(v + 0.5f); }
SI U32 round(F v) { return (U32)(v + 0.5f); }
SI U32 round(F v, F scale) { return (U32)(v*scale + 0.5f); }

// Narrowing packs are plain truncating casts in scalar mode.
SI U16 pack(U32 v) { return (U16)v; }
SI U8 pack(U16 v) { return (U8)v; }

// Select t when c is non-zero, else e.
SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
SI I32 if_then_else(I32 c, I32 t, I32 e) { return c ? t : e; }

// With only one lane, any() and all() are the same test.
SI bool any(I32 c) { return c != 0; }
SI bool all(I32 c) { return c != 0; }

// Indexed load of a single element.
template <typename T>
SI T gather(const T* p, U32 ix) { return p[ix]; }
// Load four consecutive floats as the r,g,b,a channels.
// (Fixes fused token: 'constfloat' -> 'const float'.)
SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
    *r = ptr[0];
    *g = ptr[1];
    *b = ptr[2];
    *a = ptr[3];
}

// Store the four channels back to consecutive floats.
SI void store4(float* ptr, F r, F g, F b, F a) {
    ptr[0] = r;
    ptr[1] = g;
    ptr[2] = b;
    ptr[3] = a;
}
#elifdefined(SKRP_CPU_NEON) template <typename T> using V = Vec<4, T>; using F = V<float >; using I32 = V< int32_t>; using U64 = V<uint64_t>; using U32 = V<uint32_t>; using U16 = V<uint16_t>; using U8 = V<uint8_t >;
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
// Lane-wise min/max via the NEON q-register intrinsics.
SI F min(F a, F b) { return vminq_f32(a,b); }
SI I32 min(I32 a, I32 b) { return vminq_s32(a,b); }
SI U32 min(U32 a, U32 b) { return vminq_u32(a,b); }
SI F max(F a, F b) { return vmaxq_f32(a,b); }
SI I32 max(I32 a, I32 b) { return vmaxq_s32(a,b); }
SI U32 max(U32 a, U32 b) { return vmaxq_u32(a,b); }
SI F abs_ (F v) { return vabsq_f32(v); }
SI I32 abs_ (I32 v) { return vabsq_s32(v); }
// Reciprocal estimate plus Newton-Raphson refinement steps
// (see the rcp_fast/rcp_precise notes at the top of the file).
SI F rcp_approx(F v) { auto e = vrecpeq_f32(v); return vrecpsq_f32 (v,e ) * e; }
SI F rcp_precise(F v) { auto e = rcp_approx(v); return vrecpsq_f32 (v,e ) * e; }
SI F rsqrt_approx(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
// Lane-by-lane narrowing conversions.
SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
// Bitwise lane select; c is expected to be an all-bits-set/clear mask per lane.
SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
SI I32 if_then_else(I32 c, I32 t, I32 e) { return vbslq_s32((U32)c,t,e); }
#ifdefined(SK_CPU_ARM64)
SI bool any(I32 c) { return vmaxvq_u32((U32)c) != 0; }
SI bool all(I32 c) { return vminvq_u32((U32)c) != 0; }
SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
SI F nmad(F f, F m, F a) { return vfmsq_f32(a,f,m); }
SI F floor_(F v) { return vrndmq_f32(v); }
SI F ceil_(F v) { return vrndpq_f32(v); }
SI F sqrt_(F v) { return vsqrtq_f32(v); }
SI I32 iround(F v) { return vcvtnq_s32_f32(v); }
SI U32 round(F v) { return vcvtnq_u32_f32(v); }
SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } #else
SI bool any(I32 c) { return c[0] | c[1] | c[2] | c[3]; }
SI bool all(I32 c) { return c[0] & c[1] & c[2] & c[3]; }
SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
SI F nmad(F f, F m, F a) { return vmlsq_f32(a,f,m); }
SI F floor_(F v) {
F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); return roundtrip - if_then_else(roundtrip > v, F() + 1, F());
}
SI F ceil_(F v) {
F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); return roundtrip + if_then_else(roundtrip < v, F() + 1, F());
}
SI F sqrt_(F v) { auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
e *= vrsqrtsq_f32(v,e*e);
e *= vrsqrtsq_f32(v,e*e); return v*e; // sqrt(v) == v*rsqrt(v).
}
SI I32 iround(F v) { return vcvtq_s32_f32(v + 0.5f);
}
SI U32 round(F v) { return vcvtq_u32_f32(v + 0.5f);
}
SI U32 round(F v, F scale) { return vcvtq_u32_f32(mad(v, scale, F() + 0.5f));
} #endif
// Interleaved 4-channel stores/loads via the NEON structure load/store
// instructions.  (Fixes fused token: 'constfloat' -> 'const float'.)
SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
    vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
}

SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
    float32x4x4_t rgba = vld4q_f32(ptr);
    *r = rgba.val[0];
    *g = rgba.val[1];
    *b = rgba.val[2];
    *a = rgba.val[3];
}

SI void store4(float* ptr, F r, F g, F b, F a) {
    vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
}
#elifdefined(SKRP_CPU_SKX) template <typename T> using V = Vec<16, T>; using F = V<float >; using I32 = V< int32_t>; using U64 = V<uint64_t>; using U32 = V<uint32_t>; using U16 = V<uint16_t>; using U8 = V<uint8_t >;
// 16-wide AVX-512 versions of the basic building blocks.
SI F mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }
SI F nmad(F f, F m, F a) { return _mm512_fnmadd_ps(f, m, a); }
SI F min(F a, F b) { return _mm512_min_ps(a,b); }
SI I32 min(I32 a, I32 b) { return (I32)_mm512_min_epi32((__m512i)a,(__m512i)b); }
SI U32 min(U32 a, U32 b) { return (U32)_mm512_min_epu32((__m512i)a,(__m512i)b); }
SI F max(F a, F b) { return _mm512_max_ps(a,b); }
SI I32 max(I32 a, I32 b) { return (I32)_mm512_max_epi32((__m512i)a,(__m512i)b); }
SI U32 max(U32 a, U32 b) { return (U32)_mm512_max_epu32((__m512i)a,(__m512i)b); }
// abs via bitwise and with the negation (clears the sign bit).
SI F abs_ (F v) { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
SI I32 abs_ (I32 v) { return (I32)_mm512_abs_epi32((__m512i)v); }
SI F floor_(F v) { return _mm512_floor_ps(v); }
SI F ceil_(F v) { return _mm512_ceil_ps(v); }
// 14-bit estimates; refined by rcp_precise / rsqrt below.
SI F rcp_approx(F v) { return _mm512_rcp14_ps (v); }
SI F rsqrt_approx (F v) { return _mm512_rsqrt14_ps(v); }
SI F sqrt_ (F v) { return _mm512_sqrt_ps (v); }
// One Newton-Raphson step on top of the 14-bit estimate: e * (2 - v*e).
SI F rcp_precise (F v) {
    F e = rcp_approx(v);
    return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
}

SI I32 iround(F v) { return (I32)_mm512_cvtps_epi32(v); }
SI U32 round(F v) { return (U32)_mm512_cvtps_epi32(v); }
SI U32 round(F v, F scale) { return (U32)_mm512_cvtps_epi32(v*scale); }

// Narrow 16x u32 -> 16x u16, fixing up the lane order the pack leaves behind.
SI U16 pack(U32 v) {
    __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
                                      _mm512_extracti64x4_epi64((__m512i)v, 1));
    return (U16)_mm256_permutex_epi64(rst, 216);
}

// Narrow 16x u16 -> 16x u8, again repairing lane order.
SI U8 pack(U16 v) {
    __m256i rst = _mm256_packus_epi16((__m256i)v, (__m256i)v);
    return (U8)_mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
}

// Select on the sign bit of each lane of c.
SI F if_then_else(I32 c, F t, F e) {
    __m512i mask = _mm512_set1_epi32(0x80000000);
    __m512i aa   = _mm512_and_si512((__m512i)c, mask);
    return _mm512_mask_blend_ps(_mm512_test_epi32_mask(aa, aa),e,t);
}
SI I32 if_then_else(I32 c, I32 t, I32 e) {
    __m512i mask = _mm512_set1_epi32(0x80000000);
    __m512i aa   = _mm512_and_si512((__m512i)c, mask);
    return (I32)_mm512_mask_blend_epi32(_mm512_test_epi32_mask(aa, aa),(__m512i)e,(__m512i)t);
}

SI bool any(I32 c) {
    __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
    return mask32 != 0;
}
SI bool all(I32 c) {
    __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
    return mask32 == 0xffff;
}

// Generic 16-lane gather by indexed loads.
template <typename T>
SI V<T> gather(const T* p, U32 ix) {
    return V<T>{ p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
                 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
                 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
                 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
}
// Hardware gathers for the common element types.
// (Fixes fused token: 'constfloat' -> 'const float'.)
SI F gather(const float* p, U32 ix) { return _mm512_i32gather_ps((__m512i)ix, p, 4); }
SI U32 gather(const uint32_t* p, U32 ix) { return (U32)_mm512_i32gather_epi32((__m512i)ix, p, 4); }
SI U64 gather(const uint64_t* p, U32 ix) {
    // Two 8-wide 64-bit gathers, one per half of the index vector.
    __m512i parts[] = {
        _mm512_i32gather_epi64(_mm512_castsi512_si256((__m512i)ix), p, 8),
        _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)ix, 1), p, 8),
    };
    return sk_bit_cast<U64>(parts);
}

// Store src lanes whose mask lane is set; other destinations keep their value.
template <typename V, typename S>
SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
    V before = gather(dst, ix);
    V after  = if_then_else(mask, src, before);
    for (int i = 0; i < 16; i++) {
        dst[ix[i]] = after[i];
    }
}
// Begin interleaving r,g,b,a for a 16-wide 4-channel store.
// NOTE(review): this function appears truncated in this copy of the file --
// the shuffles/stores that should follow these unpacks, and the closing
// brace, are missing.  Restore the remainder from the upstream source.
SI void store4(float* ptr, F r, F g, F b, F a) {
F rg014589cd = _mm512_unpacklo_ps(r, g),
rg2367abef = _mm512_unpackhi_ps(r, g),
ba014589cd = _mm512_unpacklo_ps(b, a),
ba2367abef = _mm512_unpackhi_ps(b, a);
#elifdefined(SKRP_CPU_HSW) // These are __m256 and __m256i, but friendlier and strongly-typed. template <typename T> using V = Vec<8, T>; using F = V<float >; using I32 = V< int32_t>; using U64 = V<uint64_t>; using U32 = V<uint32_t>; using U16 = V<uint16_t>; using U8 = V<uint8_t >;
// 8-wide AVX2/FMA versions of the basic building blocks.
SI F mad(F f, F m, F a) { return _mm256_fmadd_ps(f, m, a); }
SI F nmad(F f, F m, F a) { return _mm256_fnmadd_ps(f, m, a); }
SI F min(F a, F b) { return _mm256_min_ps(a,b); }
SI I32 min(I32 a, I32 b) { return (I32)_mm256_min_epi32((__m256i)a,(__m256i)b); }
SI U32 min(U32 a, U32 b) { return (U32)_mm256_min_epu32((__m256i)a,(__m256i)b); }
SI F max(F a, F b) { return _mm256_max_ps(a,b); }
SI I32 max(I32 a, I32 b) { return (I32)_mm256_max_epi32((__m256i)a,(__m256i)b); }
SI U32 max(U32 a, U32 b) { return (U32)_mm256_max_epu32((__m256i)a,(__m256i)b); }
// abs via bitwise and with the negation (clears the sign bit).
SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
SI I32 abs_ (I32 v) { return (I32)_mm256_abs_epi32((__m256i)v); }
SI F floor_(F v) { return _mm256_floor_ps(v); }
SI F ceil_(F v) { return _mm256_ceil_ps(v); }
// ~12-bit estimates; refined by rcp_precise / rsqrt below.
SI F rcp_approx(F v) { return _mm256_rcp_ps (v); } // use rcp_fast instead
SI F rsqrt_approx(F v) { return _mm256_rsqrt_ps(v); }
SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); }
// One Newton-Raphson step on top of the estimate: e * (2 - v*e).
SI F rcp_precise (F v) {
    F e = rcp_approx(v);
    return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) * e;
}

SI I32 iround(F v) { return (I32)_mm256_cvtps_epi32(v); }
SI U32 round(F v) { return (U32)_mm256_cvtps_epi32(v); }
SI U32 round(F v, F scale) { return (U32)_mm256_cvtps_epi32(v*scale); }

// Narrow 8x u32 -> 8x u16 by packing the two 128-bit halves.
SI U16 pack(U32 v) {
    return (U16)_mm_packus_epi32(_mm256_extractf128_si256((__m256i)v, 0),
                                 _mm256_extractf128_si256((__m256i)v, 1));
}
// Narrow 8x u16 -> 8x u8; the pack duplicates, so load just the low half.
SI U8 pack(U16 v) {
    auto r = _mm_packus_epi16((__m128i)v,(__m128i)v);
    return sk_unaligned_load<U8>(&r);
}

// blendv selects on the sign bit of each lane of c.
SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e, t, (__m256)c); }
SI I32 if_then_else(I32 c, I32 t, I32 e) {
    return (I32)_mm256_blendv_ps((__m256)e, (__m256)t, (__m256)c);
}

// NOTE: This version of 'all' only works with mask values (true == all bits set)
SI bool any(I32 c) { return !_mm256_testz_si256((__m256i)c, _mm256_set1_epi32(-1)); }
SI bool all(I32 c) { return  _mm256_testc_si256((__m256i)c, _mm256_set1_epi32(-1)); }
#elifdefined(SKRP_CPU_SSE2) || defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX) template <typename T> using V = Vec<4, T>; using F = V<float >; using I32 = V< int32_t>; using U64 = V<uint64_t>; using U32 = V<uint32_t>; using U16 = V<uint16_t>; using U8 = V<uint8_t >;
// Bitwise lane select: (c & t) | (~c & e); c must be an all-bits mask per lane.
SI F if_then_else(I32 c, F t, F e) {
    return _mm_or_ps(_mm_and_ps((__m128)c, t), _mm_andnot_ps((__m128)c, e));
}
SI I32 if_then_else(I32 c, I32 t, I32 e) {
    return (I32)_mm_or_ps(_mm_and_ps((__m128)c, (__m128)t),
                          _mm_andnot_ps((__m128)c, (__m128)e));
}

SI F min(F a, F b) { return _mm_min_ps(a,b); }
SI F max(F a, F b) { return _mm_max_ps(a,b); }
#if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
    SI I32 min(I32 a, I32 b) { return (I32)_mm_min_epi32((__m128i)a,(__m128i)b); }
    SI U32 min(U32 a, U32 b) { return (U32)_mm_min_epu32((__m128i)a,(__m128i)b); }
    SI I32 max(I32 a, I32 b) { return (I32)_mm_max_epi32((__m128i)a,(__m128i)b); }
    SI U32 max(U32 a, U32 b) { return (U32)_mm_max_epu32((__m128i)a,(__m128i)b); }
#else
    // SSE2 has no integer min/max; emulate with compare + select.
    SI I32 min(I32 a, I32 b) { return if_then_else(a < b, a, b); }
    SI I32 max(I32 a, I32 b) { return if_then_else(a > b, a, b); }
    SI U32 min(U32 a, U32 b) {
        return sk_bit_cast<U32>(if_then_else(a < b, sk_bit_cast<I32>(a), sk_bit_cast<I32>(b)));
    }
    SI U32 max(U32 a, U32 b) {
        return sk_bit_cast<U32>(if_then_else(a > b, sk_bit_cast<I32>(a), sk_bit_cast<I32>(b)));
    }
#endif

// No FMA before AVX2, so mad/nmad are plain multiply-add.
SI F  mad(F f, F m, F a) { return a+f*m; }
SI F nmad(F f, F m, F a) { return a-f*m; }
SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
#if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
    SI I32 abs_(I32 v) { return (I32)_mm_abs_epi32((__m128i)v); }
#else
    SI I32 abs_(I32 v) { return max(v, -v); }
#endif
// ~12-bit reciprocal estimate; rcp_precise adds one Newton-Raphson step.
SI F rcp_approx(F v) { return _mm_rcp_ps (v); } // use rcp_fast instead
SI F rcp_precise (F v) { F e = rcp_approx(v); return e * (2.0f - v * e); }
SI F rsqrt_approx(F v) { return _mm_rsqrt_ps(v); }
SI F sqrt_(F v) { return _mm_sqrt_ps (v); }
// cvtps rounds to nearest (even) under the default MXCSR rounding mode.
SI I32 iround(F v) { return (I32)_mm_cvtps_epi32(v); }
SI U32 round(F v) { return (U32)_mm_cvtps_epi32(v); }
SI U32 round(F v, F scale) { return (U32)_mm_cvtps_epi32(v*scale); }
// Narrow 4x u32 -> 4x u16.
SI U16 pack(U32 v) {
#if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
    auto p = _mm_packus_epi32((__m128i)v,(__m128i)v);
#else
    // Sign extend so that _mm_packs_epi32() does the pack we want.
    auto p = _mm_srai_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
    p = _mm_packs_epi32(p,p);
#endif
    return sk_unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
}

// Narrow 4x u16 -> 4x u8.
SI U8 pack(U16 v) {
    auto r = widen_cast<__m128i>(v);
    r = _mm_packus_epi16(r,r);
    return sk_unaligned_load<U8>(&r);
}

// NOTE: This only checks the top bit of each lane, and is incorrect with non-mask values.
SI bool any(I32 c) { return _mm_movemask_ps(sk_bit_cast<F>(c)) != 0b0000; }
SI bool all(I32 c) { return _mm_movemask_ps(sk_bit_cast<F>(c)) == 0b1111; }

SI F floor_(F v) {
#if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
    return _mm_floor_ps(v);
#else
    // Round-trip through int truncation, then correct lanes that rounded up.
    F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
    return roundtrip - if_then_else(roundtrip > v, F() + 1, F() + 0);
#endif
}

SI F ceil_(F v) {
#if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
    return _mm_ceil_ps(v);
#else
    // As floor_, but correct lanes that rounded down.
    F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
    return roundtrip + if_then_else(roundtrip < v, F() + 1, F() + 0);
#endif
}
// Store 4 planes as interleaved rgba using an in-register 4x4 transpose.
SI void store4(float* ptr, F r, F g, F b, F a) {
_MM_TRANSPOSE4_PS(r,g,b,a);
_mm_storeu_ps(ptr + 0, r);
_mm_storeu_ps(ptr + 4, g);
_mm_storeu_ps(ptr + 8, b);
_mm_storeu_ps(ptr +12, a);
}
#elifdefined(SKRP_CPU_LASX) // These are __m256 and __m256i, but friendlier and strongly-typed. template <typename T> using V = Vec<8, T>; using F = V<float >; using I32 = V<int32_t>; using U64 = V<uint64_t>; using U32 = V<uint32_t>; using U16 = V<uint16_t>; using U8 = V<uint8_t >;
// Extract the low 128-bit half of a 256-bit LASX register.
SI __m128i emulate_lasx_d_xr2vr_l(__m256i a) {
    v4i64 tmp = a;
    v2i64 al = {tmp[0], tmp[1]};
    return (__m128i)al;
}

// Extract the high 128-bit half of a 256-bit LASX register.
SI __m128i emulate_lasx_d_xr2vr_h(__m256i a) {
    v4i64 tmp = a;
    v2i64 ah = {tmp[2], tmp[3]};
    return (__m128i)ah;
}
// Bitwise lane select; c must be an all-bits mask per lane.
SI F if_then_else(I32 c, F t, F e) { return sk_bit_cast<Vec<8,float>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
sk_bit_cast<__m256i>(t),
sk_bit_cast<__m256i>(c)));
}
// 8-wide LASX versions of the basic building blocks.
SI F min(F a, F b) { return __lasx_xvfmin_s(a,b); }
SI F max(F a, F b) { return __lasx_xvfmax_s(a,b); }
SI I32 min(I32 a, I32 b) { return __lasx_xvmin_w(a,b); }
SI U32 min(U32 a, U32 b) { return __lasx_xvmin_wu(a,b); }
SI I32 max(I32 a, I32 b) { return __lasx_xvmax_w(a,b); }
SI U32 max(U32 a, U32 b) { return __lasx_xvmax_wu(a,b); }
SI F mad(F f, F m, F a) { return __lasx_xvfmadd_s(f, m, a); }
// nmad(f,m,a) = a - f*m, expressed as fmadd(-f, m, a).
SI F nmad(F f, F m, F a) { return __lasx_xvfmadd_s(-f, m, a); }
// abs via bitwise and with the negation (clears the sign bit).
SI F abs_ (F v) { return (F)__lasx_xvand_v((I32)v, (I32)(0-v)); }
SI I32 abs_(I32 v) { return max(v, -v); }
SI F rcp_approx(F v) { return __lasx_xvfrecip_s(v); }
SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, F() + 2.0f); }
SI F rsqrt_approx (F v) { return __lasx_xvfrsqrt_s(v); }
SI F sqrt_(F v) { return __lasx_xvfsqrt_s(v); }
// Round by adding 0.5 and truncating.
// NOTE(review): iround returns U32 on this path (it returns I32 on the other
// backends) -- confirm this matches every caller.
SI U32 iround(F v) {
    F t = F() + 0.5f;
    return __lasx_xvftintrz_w_s(v + t);
}

SI U32 round(F v) {
    F t = F() + 0.5f;
    return __lasx_xvftintrz_w_s(v + t);
}

SI U32 round(F v, F scale) {
    F t = F() + 0.5f;
    return __lasx_xvftintrz_w_s(mad(v, scale, t));
}
#elifdefined(SKRP_CPU_LSX) template <typename T> using V = Vec<4, T>; using F = V<float >; using I32 = V<int32_t >; using U64 = V<uint64_t>; using U32 = V<uint32_t>; using U16 = V<uint16_t>; using U8 = V<uint8_t >;
// Bitwise lane select; c must be an all-bits mask per lane.
SI F if_then_else(I32 c, F t, F e) { return sk_bit_cast<Vec<4,float>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
sk_bit_cast<__m128i>(t),
sk_bit_cast<__m128i>(c)));
}
// 4-wide LSX versions of the basic building blocks.
SI F min(F a, F b) { return __lsx_vfmin_s(a,b); }
SI F max(F a, F b) { return __lsx_vfmax_s(a,b); }
SI I32 min(I32 a, I32 b) { return __lsx_vmin_w(a,b); }
SI U32 min(U32 a, U32 b) { return __lsx_vmin_wu(a,b); }
SI I32 max(I32 a, I32 b) { return __lsx_vmax_w(a,b); }
SI U32 max(U32 a, U32 b) { return __lsx_vmax_wu(a,b); }
SI F mad(F f, F m, F a) { return __lsx_vfmadd_s(f, m, a); }
// nmad(f,m,a) = a - f*m, expressed as fmadd(-f, m, a).
SI F nmad(F f, F m, F a) { return __lsx_vfmadd_s(-f, m, a); }
// abs via bitwise and with the negation (clears the sign bit).
SI F abs_(F v) { return (F)__lsx_vand_v((I32)v, (I32)(0-v)); }
SI I32 abs_(I32 v) { return max(v, -v); }
SI F rcp_approx (F v) { return __lsx_vfrecip_s(v); }
SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, F() + 2.0f); }
SI F rsqrt_approx (F v) { return __lsx_vfrsqrt_s(v); }
SI F sqrt_(F v) { return __lsx_vfsqrt_s (v); }
// Round by adding 0.5 and truncating.
// NOTE(review): iround returns U32 on this path (it returns I32 on the other
// backends) -- confirm this matches every caller.
SI U32 iround(F v) {
    F t = F() + 0.5f;
    return __lsx_vftintrz_w_s(v + t);
}

SI U32 round(F v) {
    F t = F() + 0.5f;
    return __lsx_vftintrz_w_s(v + t);
}

SI U32 round(F v, F scale) {
    F t = F() + 0.5f;
    return __lsx_vftintrz_w_s(mad(v, scale, t));
}

// Narrow 4x u32 -> 4x u16: saturate then pick even 16-bit elements.
SI U16 pack(U32 v) {
    __m128i tmp = __lsx_vsat_wu(v, 15);
    auto p = __lsx_vpickev_h(tmp, tmp);
    return sk_unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
}

// Narrow 4x u16 -> 4x u8: saturate then pick even 8-bit elements.
SI U8 pack(U16 v) {
    auto r = widen_cast<__m128i>(v);
    __m128i tmp = __lsx_vsat_hu(r, 7);
    r = __lsx_vpickev_b(tmp, tmp);
    return sk_unaligned_load<U8>(&r);
}
// Generic 4-lane gather by indexed loads.
template <typename T>
SI V<T> gather(const T* p, U32 ix) {
    return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
}

// Using 'int*' prevents data from passing through floating-point registers.
// (Fixes fused token: 'constint' -> 'const int'.)
SI F gather(const int* p, int ix0, int ix1, int ix2, int ix3) {
    F ret = {0.0};
    ret = (F)__lsx_vinsgr2vr_w(ret, p[ix0], 0);
    ret = (F)__lsx_vinsgr2vr_w(ret, p[ix1], 1);
    ret = (F)__lsx_vinsgr2vr_w(ret, p[ix2], 2);
    ret = (F)__lsx_vinsgr2vr_w(ret, p[ix3], 3);
    return ret;
}
// Store src lanes whose mask lane is set; other destinations keep their value.
template <typename V, typename S>
SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
V before = gather(dst, ix);
V after = if_then_else(mask, src, before);
dst[ix[0]] = after[0];
dst[ix[1]] = after[1];
dst[ix[2]] = after[2];
dst[ix[3]] = after[3];
}
// Store 4 planes as interleaved rgba using an in-register 4x4 transpose.
SI void store4(float* ptr, F r, F g, F b, F a) {
_LSX_TRANSPOSE4_S(r,g,b,a);
__lsx_vst(r, ptr, 0);
__lsx_vst(g, ptr, 16);
__lsx_vst(b, ptr, 32);
__lsx_vst(a, ptr, 48);
}
#endif
// Helpers to do scalar -> vector promotion on GCC (clang does this automatically) // We need to subtract (not add) zero to keep float conversion zero-cost. See: // https://stackoverflow.com/q/48255293 // // The GCC implementation should be usable everywhere, but Mac clang (only) complains that the // expressions make these functions not constexpr. // // Further: We can't use the subtract-zero version in scalar mode. There, the subtraction will // really happen (at least at low optimization levels), which can alter the bit pattern of NaNs. // Because F_() is used when copying uniforms (even integer uniforms), this can corrupt values. // The vector subtraction of zero doesn't appear to ever alter NaN bit patterns. #ifdefined(__clang__) || defined(SKRP_CPU_SCALAR)
SI constexpr F F_(float x) { return x; }
SI constexpr I32 I32_(int32_t x) { return x; }
SI constexpr U32 U32_(uint32_t x) { return x; } #else
SI constexpr F F_(float x) { return x - F(); }
SI constexpr I32 I32_(int32_t x) { return x + I32(); }
SI constexpr U32 U32_(uint32_t x) { return x + U32(); } #endif
// Extremely helpful literals: static constexpr F F0 = F_(0.0f),
F1 = F_(1.0f);
#if !defined(SKRP_CPU_SCALAR)
    // Mixed scalar/vector overloads: promote the scalar argument(s) with F_()
    // and defer to the all-vector version.
    SI F min(F a, float b) { return min(a, F_(b)); }
    SI F min(float a, F b) { return min(F_(a), b); }
    SI F max(F a, float b) { return max(a, F_(b)); }
    SI F max(float a, F b) { return max(F_(a), b); }

    SI F mad(F f, F m, float a) { return mad(f, m, F_(a)); }
    SI F mad(F f, float m, F a) { return mad(f, F_(m), a); }
    SI F mad(F f, float m, float a) { return mad(f, F_(m), F_(a)); }
    SI F mad(float f, F m, F a) { return mad(F_(f), m, a); }
    SI F mad(float f, F m, float a) { return mad(F_(f), m, F_(a)); }
    SI F mad(float f, float m, F a) { return mad(F_(f), F_(m), a); }

    SI F nmad(F f, F m, float a) { return nmad(f, m, F_(a)); }
    SI F nmad(F f, float m, F a) { return nmad(f, F_(m), a); }
    SI F nmad(F f, float m, float a) { return nmad(f, F_(m), F_(a)); }
    SI F nmad(float f, F m, F a) { return nmad(F_(f), m, a); }
    SI F nmad(float f, F m, float a) { return nmad(F_(f), m, F_(a)); }
    SI F nmad(float f, float m, F a) { return nmad(F_(f), F_(m), a); }
#endif
// We need to be a careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(SKRP_CPU_SCALAR)
    SI F   cast  (U32 v) { return   (F)v; }
    SI F   cast64(U64 v) { return   (F)v; }
    SI U32 trunc_(F   v) { return (U32)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#else
    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
    SI F   cast64(U64 v) { return      __builtin_convertvector(     v,   F); }
    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
#endif
#if !defined(SKRP_CPU_SCALAR)
    // Mixed scalar/vector selects: promote scalar arm(s) with F_().
    SI F if_then_else(I32 c, F     t, float e) { return if_then_else(c,    t , F_(e)); }
    SI F if_then_else(I32 c, float t, F     e) { return if_then_else(c, F_(t),    e ); }
    SI F if_then_else(I32 c, float t, float e) { return if_then_else(c, F_(t), F_(e)); }
#endif
#else // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
U32 sem = sk_bit_cast<U32>(f),
s = sem & 0x80000000,
em = sem ^ s;
// Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero. auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. returnpack((U32)if_then_else(denorm, I32_(0)
, (I32)((s>>16) + (em>>13) - ((127-15)<<10)))); #endif
}
#if defined(SKRP_CPU_SCALAR) || defined(SKRP_CPU_SSE2)
    // In scalar and SSE2 mode, we always use precise math so we can have more predictable results.
    // Chrome will use the SSE2 implementation when --disable-skia-runtime-opts is set. (b/40042946)
    SI F rcp_fast(F v) { return rcp_precise(v); }
    SI F rsqrt(F v) { return rcp_precise(sqrt_(v)); }
#else
    SI F rcp_fast(F v) { return rcp_approx(v); }
    SI F rsqrt(F v) { return rsqrt_approx(v); }
#endif
// Our fundamental vector depth is our pixel stride. static constexpr size_t N = sizeof(F) / sizeof(float);
// We're finally going to get to what a Stage function looks like!
// Any custom ABI to use for all (non-externally-facing) stage functions? // Also decide here whether to use narrow (compromise) or wide (ideal) stages. #ifdefined(SK_CPU_ARM32) && defined(SKRP_CPU_NEON) // This lets us pass vectors more efficiently on 32-bit ARM. // We can still only pass 16 floats, so best as 4x {r,g,b,a}. #define ABI __attribute__((pcs("aapcs-vfp"))) #define SKRP_NARROW_STAGES 1 #elifdefined(_MSC_VER) // Even if not vectorized, this lets us pass {r,g,b,a} as registers, // instead of {b,a} on the stack. Narrow stages work best for __vectorcall. #define ABI __vectorcall #define SKRP_NARROW_STAGES 1 #elifdefined(__x86_64__) || defined(SK_CPU_ARM64) || defined(SK_CPU_LOONGARCH) // These platforms are ideal for wider stages, and their default ABI is ideal. #define ABI #define SKRP_NARROW_STAGES 0 #else // 32-bit or unknown... shunt them down the narrow path. // Odds are these have few registers and are better off there. #define ABI #define SKRP_NARROW_STAGES 1 #endif
#if SKRP_NARROW_STAGES struct Params {
size_t dx, dy;
std::byte* base;
F dr,dg,db,da;
}; using Stage = void(ABI*)(Params*, SkRasterPipelineStage* program, F r, F g, F b, F a); #else using Stage = void(ABI*)(SkRasterPipelineStage* program, size_t dx, size_t dy,
std::byte* base, F,F,F,F, F,F,F,F); #endif
// NOTE(review): The following lines are extraneous German web-page boilerplate
// that was scraped into this file and is not valid C++; preserved here, translated,
// as a comment.  Original text (translated): "The information on this web page was
// carefully compiled to the best of our knowledge.  However, neither completeness,
// nor correctness, nor quality of the provided information is guaranteed.
// Remark: the colored syntax display and the measurement are still experimental."