/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Fast path for committing a solid color through a clip mask. Loading the clip
// mask inside the blend stage costs more than processing the color itself, so
// the whole span of mask texels is read here, expanded to the destination
// format and modulated with the color before being handed to the blend stage.
// The loop is bracketed by override_clip_mask()/restore_clip_mask().
//   buf   - first destination pixel of the span
//   color - packed color applied to the expanded mask via applyColor()
//   len   - span length in pixels; the loop advances 4 pixels per iteration
template <typename P, typename C>
static void commit_masked_solid_span(P* buf, C color, int len) {
  override_clip_mask();
  uint8_t* mask = get_clip_mask(buf);
  P* const end = &buf[len];
  while (buf < end) {
    // One chunk: 4 mask bytes widened to the layout of the output buffer.
    auto coverage = expand_mask(buf, unpack(unaligned_load<PackedR8>(mask)));
    commit_span(buf, blend_span(buf, applyColor(coverage, color)));
    buf += 4;
    mask += 4;
  }
  restore_clip_mask();
}
// Commits a solid span when anti-aliasing is active. Most of a solid span lies
// in the opaque region and gains nothing from the AA blend stage, so the span
// is split in three: a leading part committed with AA, an opaque interior
// committed between override_aa()/restore_aa(), and whatever remains committed
// with AA again. Both split points are rounded up to a multiple of 4 pixels and
// limited to the remaining length.
//   buf - first destination pixel of the span
//   r   - packed span value forwarded unchanged to commit_solid_span<true>
//   len - span length in pixels
template <typename P, typename R>
static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) {
  // Pixels before the opaque region starts still need AA.
  const int leading = min((get_aa_opaque_start(buf) + 3) & ~3, len);
  if (leading) {
    commit_solid_span<true>(buf, r, leading);
    buf += leading;
    len -= leading;
  }
  // The opaque interior is queried from the advanced buffer position.
  const int interior = min((get_aa_opaque_size(buf) + 3) & ~3, len);
  if (interior) {
    override_aa();
    commit_solid_span<true>(buf, r, interior);
    restore_aa();
    buf += interior;
    len -= interior;
  }
  // Trailing pixels after the opaque region.
  if (len > 0) {
    commit_solid_span<true>(buf, r, len);
  }
}
// Forces a value with vector run-class to have scalar run-class. This is a
// direct forwarder: both the result and the return type are exactly those of
// force_scalar(v).
template <typename T>
static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
  return force_scalar(v);
}
// Advance all varying interpolants by a single chunk. Expands to a call to
// step_interp_inputs() in the scope where the macro is used.
#define swgl_stepInterp() step_interp_inputs()
// Pseudo-intrinsic that accesses the interpolation step for a given varying.
// Expands to the member of `interp_step` whose name is `v`.
#define swgl_interpStep(v) (interp_step.v)
// Commit an entire span of a solid color. This dispatches to clip-masked and
// anti-aliased fast-paths as appropriate.
//   format - output format suffix (RGBA8 or R8); pasted onto swgl_Out to name
//            the output pointer
//   v      - color value to commit
//   n      - number of pixels to commit
// When blend_key is set, the clip-mask path is preferred over the AA path,
// which is preferred over the plain blended path. Without blend_key the span is
// committed unblended. Afterwards the output pointer is advanced by the
// committed length and swgl_SpanLength is reduced by it.
// NOTE: the expansion declares a local named `len`, so the `n` argument must
// not itself refer to a caller variable called `len`.
// Every line of the definition must end in a line continuation, and
// swgl_Out##format must stay on one line for the token paste to apply.
#define swgl_commitSolid(format, v, n)                                   \
  do {                                                                   \
    int len = (n);                                                       \
    if (blend_key) {                                                     \
      if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {                        \
        commit_masked_solid_span(swgl_Out##format,                       \
                                 packColor(swgl_Out##format, (v)), len); \
      } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) {                   \
        commit_aa_solid_span(swgl_Out##format,                           \
                             pack_span(swgl_Out##format, (v)), len);     \
      } else {                                                           \
        commit_solid_span<true>(swgl_Out##format,                        \
                                pack_span(swgl_Out##format, (v)), len);  \
      }                                                                  \
    } else {                                                             \
      commit_solid_span<false>(swgl_Out##format,                         \
                               pack_span(swgl_Out##format, (v)), len);   \
    }                                                                    \
    swgl_Out##format += len;                                             \
    swgl_SpanLength -= len;                                              \
  } while (0)
// Commit the whole remaining span (swgl_SpanLength pixels) of a solid color.
#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength)
#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength)
// Commit at most `len` pixels of a solid color, limited to the remaining span.
#define swgl_commitPartialSolidRGBA8(len, v) \
  swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength))
#define swgl_commitPartialSolidR8(len, v) \
  swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength))
// Commit a single chunk of already packed pixels.
//   format - output format suffix (RGBA8 or R8); pasted onto swgl_Out to name
//            the output pointer
//   chunk  - packed pixel value for one chunk
// The chunk is passed through blend_span() only when blend_key is set, then
// written with commit_span(). The output pointer advances by swgl_StepSize and
// swgl_SpanLength shrinks by the same amount.
// Every line of the definition must end in a line continuation, and
// swgl_Out##format must stay on one line for the token paste to apply.
#define swgl_commitChunk(format, chunk)                 \
  do {                                                  \
    auto r = chunk;                                     \
    if (blend_key) r = blend_span(swgl_Out##format, r); \
    commit_span(swgl_Out##format, r);                   \
    swgl_Out##format += swgl_StepSize;                  \
    swgl_SpanLength -= swgl_StepSize;                   \
  } while (0)
// Commit a single chunk of a color. The color is packed with the
// pack_pixels_<format> function matching the output format.
#define swgl_commitColor(format, color) \
  swgl_commitChunk(format, pack_pixels_##format(color))
#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color)
#define swgl_commitColorR8(color) swgl_commitColor(R8, color)
// Returns true when the sampler's filter is TextureFilter::LINEAR.
template <typename S>
static ALWAYS_INLINE bool swgl_isTextureLinear(S s) {
  return s->filter == TextureFilter::LINEAR;
}
// Returns true when the sampler's texture format is TextureFormat::RGBA8.
template <typename S>
static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) {
  return s->format == TextureFormat::RGBA8;
}
// Returns true when the sampler's texture format is TextureFormat::R8.
template <typename S>
static ALWAYS_INLINE bool swgl_isTextureR8(S s) {
  return s->format == TextureFormat::R8;
}
// Use the default linear quantization scale of 128 (1 << 7). This gives 7 bits
// of fractional precision, which when multiplied with a signed 9 bit value
// still fits in a 16 bit integer. The linear filters in this file shift
// quantized coordinates right by 7 to recover the integer texel index.
const int swgl_LinearQuantizeScale = 128;
// Quantizes UVs for access into a linear texture. Forwards to linearQuantize()
// with the default scale swgl_LinearQuantizeScale and the given sampler.
template <typename S, typename T>
static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) {
  return linearQuantize(p, swgl_LinearQuantizeScale, s);
}
// Quantizes an interpolation step for UVs for access into a linear texture.
// The step is scaled by the sampler via samplerScale() and then multiplied by
// swgl_LinearQuantizeScale.
template <typename S, typename T>
static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) {
  return samplerScale(s, p) * swgl_LinearQuantizeScale;
}
// Overload selected for a uint32_t destination buffer: samples with
// textureLinearUnpackedRGBA8(). The buffer pointer is used only to pick the
// overload.
template <typename S>
static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf,
                                                     S sampler, ivec2 i) {
  return textureLinearUnpackedRGBA8(sampler, i);
}
// Overload selected for a uint8_t destination buffer: samples with
// textureLinearUnpackedR8(). The buffer pointer is used only to pick the
// overload.
template <typename S>
static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf,
                                                  S sampler, ivec2 i) {
  return textureLinearUnpackedR8(sampler, i);
}
// Overload selected for a uint32_t destination buffer: the sampler matches
// when its format is RGBA8. The buffer pointer is used only to pick the
// overload.
template <typename S>
static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) {
  return swgl_isTextureRGBA8(s);
}
// Overload selected for a uint8_t destination buffer: the sampler matches when
// its format is R8. The buffer pointer is used only to pick the overload.
template <typename S>
static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
  return swgl_isTextureR8(s);
}
// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets
// for linear sampling.
// This is a statement macro, not an expression: it overwrites `uv` with its
// quantized value and DECLARES three vec2_scalar locals in the enclosing
// scope, named by the uv_step, min_uv and max_uv arguments:
//   uv_step - swgl_StepSize times the difference between the first two lanes
//             of the quantized uv on each axis
//   min_uv  - quantized (uv_rect.x, uv_rect.y), clamped to be at least 0
//   max_uv  - quantized (uv_rect.z, uv_rect.w), clamped to be at least min_uv
// It can therefore be used only once per scope with a given set of names.
#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \
uv = swgl_linearQuantize(sampler, uv); \
vec2_scalar uv_step = \
float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \
vec2_scalar min_uv = max( \
swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \
vec2_scalar max_uv = \
max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \
min_uv);
// Implements the fallback linear filter that can deal with clamping and
// arbitrary scales.
template <
bool BLEND,
typename S,
typename C,
typename P>
static P* blendTextureLinearFallback(S sampler, vec2 uv,
int span,
vec2_scalar uv_step, vec2_scalar min_uv,
vec2_scalar max_uv, C color, P* buf) {
for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
commit_blend_span<BLEND>(
buf, applyColor(textureLinearUnpacked(buf, sampler,
ivec2(clamp(uv, min_uv, max_uv))),
color));
}
return buf;
}
// Reinterprets (bit_cast, no value conversion) an unpacked chunk as a vector
// type that the linear filters then SHUFFLE/swizzle with 4-lane indices.
// NOTE(review): presumably U64/U16 hold one whole pixel per lane (4 x 16-bit
// channels for RGBA8, 1 x 16-bit channel for R8) - confirm against the vector
// type definitions.
static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) {
return bit_cast<U64>(r);
}
static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) {
return bit_cast<U16>(r);
}
// Multiplies an unpacked chunk by the per-pixel horizontal fractions.
// For the 16-lane overload each of the 4 fractions is repeated 4 times
// (xxxxyyyyzzzzwwww) so it lines up with 4 consecutive lanes of `r`.
// NOTE(review): presumably those 4 lanes are the channels of one RGBA pixel -
// confirm. For the 4-lane overload the fractions are applied lane for lane.
static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) {
return r * fracx.xxxxyyyyzzzzwwww;
}
static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) {
return r * fracx;
}
// Implements a faster linear filter that works with axis-aligned constant Y but
// scales less than 1, i.e. upscaling. In this case we can optimize for the
// constant Y fraction as well as load all chunks from memory in a single tap
// for each row.
//   sampler        - texture to sample
//   uv             - quantized UVs of the first chunk
//   span           - number of pixels to draw; consumed 4 pixels at a time
//   uv_step        - UV advance per chunk (only the X component is used)
//   min_uv, max_uv - clamp bounds applied to the initial UVs only
//   color          - color applied to each result via applyColor()
//   buf            - destination pixels
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
                                      vec2_scalar uv_step, vec2_scalar min_uv,
                                      vec2_scalar max_uv, C color, P* buf) {
  using packed_type = VectorType<uint8_t, 4 * sizeof(P)>;
  using unpacked_type = VectorType<uint16_t, 4 * sizeof(P)>;
  using signed_unpacked_type = VectorType<int16_t, 4 * sizeof(P)>;

  // Split the quantized coordinates into integer texel index and fraction.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x));
  P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
  I16 fracx = computeFracX(sampler, i, frac);
  int16_t fracy = computeFracY(frac).x;

  // Loads one chunk starting at texel x from both rows and blends the rows
  // with the constant Y fraction.
  auto loadChunk = [&](int x) {
    auto top =
        CONVERT(unaligned_load<packed_type>(&row0[x]), signed_unpacked_type);
    auto bottom =
        CONVERT(unaligned_load<packed_type>(&row1[x]), signed_unpacked_type);
    return castForShuffle(top + (((bottom - top) * fracy) >> 7));
  };

  auto src = loadChunk(i.x.x);
  // We attempt to sample ahead by one chunk and interpolate it with the current
  // one. However, due to the complication of upscaling, we may not necessarily
  // shift in all the next set of samples.
  for (P* end = buf + span; buf < end; buf += 4) {
    uv.x += uv_step.x;
    I32 ixn = cast(uv.x);
    I16 fracn = computeFracNoClamp(ixn);
    ixn >>= 7;
    auto srcn = loadChunk(ixn.x);

    // Since we're upscaling, we know that a source pixel has a larger footprint
    // than the destination pixel, and thus all the source pixels needed for
    // this chunk will fall within a single chunk of texture data. However,
    // since the source pixels don't map 1:1 with destination pixels, we need to
    // shift the source pixels over based on their offset from the start of the
    // chunk. This could conceivably be optimized better with usage of PSHUFB or
    // VTBL instructions. However, since PSHUFB requires SSSE3, instead we
    // resort to masking in the correct pixels to avoid having to index into
    // memory. For the last sample to interpolate with, we need to potentially
    // shift in a sample from the next chunk over in the case the samples fill
    // out an entire chunk.
    auto shuf = src;
    auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
    // Duplicate a source pixel wherever two neighbouring destination pixels
    // map to the same texel.
    if (i.x.y == i.x.x) {
      shuf = shuf.xxyz;
      shufn = shufn.xxyz;
    }
    if (i.x.z == i.x.y) {
      shuf = shuf.xyyz;
      shufn = shufn.xyyz;
    }
    if (i.x.w == i.x.z) {
      shuf = shuf.xyzz;
      shufn = shufn.xyzz;
    }

    // Convert back to a signed unpacked type so that we can interpolate the
    // final result.
    auto interp = bit_cast<signed_unpacked_type>(shuf);
    auto interpn = bit_cast<signed_unpacked_type>(shufn);
    interp += applyFracX(interpn - interp, fracx) >> 7;
    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));

    // The look-ahead chunk becomes the current chunk of the next iteration.
    i.x = ixn;
    fracx = fracn;
    src = srcn;
  }
}
// This is the fastest variant of the linear filter that still provides
// filtering. In cases where there is no scaling required, but we have a
// subpixel offset that forces us to blend in neighboring pixels, we can
// optimize away most of the memory loads and shuffling that is required by the
// fallback filter.
//   sampler        - texture to sample
//   uv             - quantized UVs of the first chunk
//   span           - number of pixels to draw; consumed 4 pixels at a time
//   min_uv, max_uv - clamp bounds applied to the initial UVs only
//   color          - color applied to each result via applyColor()
//   buf            - destination pixels
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearFast(S sampler, vec2 uv, int span,
                                   vec2_scalar min_uv, vec2_scalar max_uv,
                                   C color, P* buf) {
  using packed_type = VectorType<uint8_t, 4 * sizeof(P)>;
  using unpacked_type = VectorType<uint16_t, 4 * sizeof(P)>;
  using signed_unpacked_type = VectorType<int16_t, 4 * sizeof(P)>;

  // Split the quantized coordinates into integer texel index and fraction.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
  P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
  // Both fractions are constant for the whole span.
  int16_t fracx = computeFracX(sampler, i, frac).x;
  int16_t fracy = computeFracY(frac).x;

  // Loads the chunk at the current row pointers and blends the two rows with
  // the constant Y fraction.
  auto loadChunk = [&]() {
    auto top = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto bottom =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    return castForShuffle(top + (((bottom - top) * fracy) >> 7));
  };

  auto src = loadChunk();
  // Since there is no scaling, we sample ahead by one chunk and interpolate it
  // with the current one. We can then reuse this value on the next iteration.
  for (P* end = buf + span; buf < end; buf += 4) {
    row0 += 4;
    row1 += 4;
    auto srcn = loadChunk();
    // For the last sample to interpolate with, we need to potentially shift in
    // a sample from the next chunk over since the samples fill out an entire
    // chunk.
    auto interp = bit_cast<signed_unpacked_type>(src);
    auto interpn =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4));
    interp += ((interpn - interp) * fracx) >> 7;
    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));
    src = srcn;
  }
}
// Implements a faster linear filter that works with axis-aligned constant Y but
// downscaling the texture by half. In this case we can optimize for the
// constant X/Y fractions and reduction factor while minimizing shuffling.
// Each iteration reads two chunks (8 texels) per row and writes one chunk
// (4 pixels).
//   sampler        - texture to sample
//   uv             - quantized UVs of the first chunk
//   span           - number of pixels to draw; consumed 4 pixels at a time
//   min_uv, max_uv - clamp bounds applied to the initial UVs only
//   color          - color applied to each result via applyColor()
//   buf            - destination pixels
template <bool BLEND, typename S, typename C, typename P>
static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span,
                                                  vec2_scalar min_uv,
                                                  vec2_scalar max_uv, C color,
                                                  P* buf) {
  using packed_type = VectorType<uint8_t, 4 * sizeof(P)>;
  using unpacked_type = VectorType<uint16_t, 4 * sizeof(P)>;
  using signed_unpacked_type = VectorType<int16_t, 4 * sizeof(P)>;

  // Split the quantized coordinates into integer texel index and fraction.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
  P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
  // Both fractions are constant for the whole span.
  int16_t fracx = computeFracX(sampler, i, frac).x;
  int16_t fracy = computeFracY(frac).x;

  // Loads the chunk at the current row pointers and blends the two rows with
  // the constant Y fraction.
  auto loadChunk = [&]() {
    auto top = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto bottom =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    return castForShuffle(top + (((bottom - top) * fracy) >> 7));
  };

  for (P* end = buf + span; buf < end; buf += 4) {
    auto src = loadChunk();
    row0 += 4;
    row1 += 4;
    auto srcn = loadChunk();
    row0 += 4;
    row1 += 4;
    // Even texels of the 8 loaded are the left taps, odd texels the right
    // taps.
    auto interp =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6));
    auto interpn =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7));
    interp += ((interpn - interp) * fracx) >> 7;
    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));
  }
}
// Selects which linear filter implementation is used for a span. The value is
// chosen by needsTextureLinear() and consumed by blendTextureLinearDispatch().
enum LinearFilter {
// No linear filter is needed. This is explicitly 0 because
// swgl_commitTextureLinear tests the chosen filter as an `if` condition and
// takes the nearest-filter path when it converts to false.
LINEAR_FILTER_NEAREST = 0,
// The most general linear filter that handles clamping and varying scales.
LINEAR_FILTER_FALLBACK,
// A linear filter optimized for axis-aligned upscaling.
LINEAR_FILTER_UPSCALE,
// A linear filter with no scaling but with subpixel offset.
LINEAR_FILTER_FAST,
// A linear filter optimized for 2x axis-aligned downscaling.
LINEAR_FILTER_DOWNSCALE
};
// Dispatches to an appropriate linear filter depending on the selected filter.
// For the optimized filters the span is split into a leading part that may
// sample before the row start (fallback filter), an inside part that needs no
// clamping (optimized filter) and a trailing part (fallback filter again).
//   sampler        - texture to sample
//   uv             - quantized UVs of the first chunk
//   span           - number of pixels to draw
//   uv_step        - UV advance per chunk
//   min_uv, max_uv - clamp bounds for the UVs
//   color          - color applied to each sample
//   buf            - destination pixels
//   filter         - filter implementation to use for the inside part
// Returns the destination pointer after the last chunk written.
template <bool BLEND, typename S, typename C, typename P>
static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span,
                                     vec2_scalar uv_step, vec2_scalar min_uv,
                                     vec2_scalar max_uv, C color, P* buf,
                                     LinearFilter filter) {
  P* const end = buf + span;
  if (filter != LINEAR_FILTER_FALLBACK) {
    // If we're not using the fallback, then Y is constant across the entire
    // row. We just need to ensure that we handle any samples that might pull
    // data from before the start of the row and require clamping.
    const float leadDist = max(0.0f, min_uv.x) - uv.x.x;
    if (leadDist > 0) {
      const int lead = clamp(int(ceil(leadDist / uv_step.x)) * swgl_StepSize,
                             0, int(end - buf));
      buf = blendTextureLinearFallback<BLEND>(sampler, uv, lead, uv_step,
                                              min_uv, max_uv, color, buf);
      uv.x += (lead / swgl_StepSize) * uv_step.x;
    }
    // We need to check how many samples we can take from inside the row without
    // requiring clamping. In case the filter oversamples the row by a step, we
    // subtract off a step from the width to leave some room.
    const float rowLimit = float((int(sampler->width) - swgl_StepSize) *
                                 swgl_LinearQuantizeScale);
    const float insideDist = min(max_uv.x, rowLimit) - uv.x.x;
    if (uv_step.x > 0.0f && insideDist >= uv_step.x) {
      int32_t inside = int(end - buf);
      switch (filter) {
        case LINEAR_FILTER_DOWNSCALE:
          // Two texels are consumed per pixel, hence the factor of 0.5.
          inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) &
                           ~(swgl_StepSize - 1),
                       inside);
          if (inside > 0) {
            blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv,
                                               max_uv, color, buf);
          }
          break;
        case LINEAR_FILTER_UPSCALE:
          inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside);
          if (inside > 0) {
            blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step,
                                             min_uv, max_uv, color, buf);
          }
          break;
        default:
          // Every other filter is drawn with the unscaled fast filter.
          inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) &
                           ~(swgl_StepSize - 1),
                       inside);
          if (inside > 0) {
            blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv,
                                          color, buf);
          }
          break;
      }
      // Skip past whatever the optimized filter has drawn.
      if (inside > 0) {
        buf += inside;
        uv.x += (inside / swgl_StepSize) * uv_step.x;
      }
    }
  }
  // If the fallback filter was requested, or if there are any samples left that
  // may be outside the row and require clamping, then handle that here.
  if (buf < end) {
    buf = blendTextureLinearFallback<BLEND>(
        sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf);
  }
  return buf;
}
// Helper function to quantize UVs for linear filtering before dispatch.
//   sampler - texture to sample; its format must match the destination buffer
//   uv      - unquantized UVs of the first chunk
//   span    - number of pixels to draw
//   uv_rect - UV bounds (x, y = minimum; z, w = maximum)
//   color   - color applied to each sample
//   buf     - destination pixels
//   filter  - linear filter implementation to dispatch to
// Returns the number of pixels drawn: `span`, or 0 when the texture format
// does not match the destination buffer, in which case nothing is drawn.
template <bool BLEND, typename S, typename C, typename P>
static inline int blendTextureLinear(S sampler, vec2 uv, int span,
                                     const vec4_scalar& uv_rect, C color,
                                     P* buf, LinearFilter filter) {
  if (matchTextureFormat(sampler, buf)) {
    // Declares uv_step, min_uv and max_uv and quantizes uv in place.
    LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
    blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv,
                                      max_uv, color, buf, filter);
    return span;
  }
  return 0;
}
// Samples an axis-aligned span on a single row of a texture using 1:1 nearest
// filtering. Sampling is constrained to only fall within the given UV bounds.
// This requires a pointer to the destination buffer. An optional color modulus
// can be supplied.
//   sampler - texture to sample; its format must match the destination buffer
//   uv      - UVs of the first chunk
//   span    - number of pixels to draw
//   uv_rect - UV bounds (x, y = minimum; z, w = maximum)
//   color   - color applied to each sample
//   buf     - destination pixels
// Returns the number of pixels drawn: `span`, or 0 when the texture format
// does not match the destination buffer.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureNearestFast(S sampler, vec2 uv, int span,
                                   const vec4_scalar& uv_rect, C color,
                                   P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  using packed_type = VectorType<uint8_t, 4 * sizeof(P)>;

  ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv)));
  ivec2_scalar minUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}));
  ivec2_scalar maxUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}));

  // Calculate the row pointer within the buffer, clamping to within valid row
  // bounds.
  P* row = (P*)sampler->buf +
           clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) *
               sampler->stride;

  // Find clamped X bounds within the row.
  int minX = clamp(minUV.x, 0, sampler->width - 1);
  int maxX = clamp(maxUV.x, minX, sampler->width - 1);
  int curX = i.x;
  const int endX = i.x + span;

  // Replicates the single texel at column x across a chunk and applies the
  // color, for filling regions outside the sample bounds.
  auto clampedSample = [&](int x) {
    return applyColor(unpack(bit_cast<packed_type>(V4<P>(row[x]))), color);
  };

  // If we need to start sampling below the valid sample bounds, then we need to
  // fill this section with a constant clamped sample.
  if (curX < minX) {
    const int fill = min(minX, endX) - curX;
    commit_solid_span<BLEND>(buf, clampedSample(minX), fill);
    buf += fill;
    curX += fill;
  }

  // Here we only deal with valid samples within the sample bounds. No clamping
  // should occur here within these inner loops.
  int n = max(min(maxX + 1, endX) - curX, 0);
  // Try to process as many chunks as possible with full loads and stores.
  const int chunkEnd = curX + (n & ~3);
  while (curX < chunkEnd) {
    commit_blend_span<BLEND>(
        buf, applyColor(unaligned_load<packed_type>(&row[curX]), color));
    curX += 4;
    buf += 4;
  }
  n &= 3;
  // If we have any leftover samples after processing chunks, use partial loads
  // and stores.
  if (n > 0) {
    commit_blend_span<BLEND>(
        buf, applyColor(partial_load_span<packed_type>(&row[curX], n), color),
        n);
    buf += n;
    curX += n;
  }

  // If we still have samples left above the valid sample bounds, then we again
  // need to fill this section with a constant clamped sample.
  if (curX < endX) {
    commit_solid_span<BLEND>(buf, clampedSample(maxX), endX - curX);
  }
  return span;
}
// We need to verify that the pixel step reasonably approximates stepping by a
// single texel for every pixel we need to reproduce. Try to ensure that the
// margin of error is no more than approximately 2^-7. Also, we check here if
// the scaling can be quantized for acceleration.
//   span - span length in pixels
//   P    - coordinates whose X lanes .x.x and .x.y are the first two samples
// Returns 0 when the step is close enough to 1:1, 2 when it is close enough to
// exactly 2:1, and 1 for any other scale.
template <typename T>
static ALWAYS_INLINE int spanNeedsScale(int span, T P) {
  // Round the span down to a multiple of 128 and add 128, so that the step is
  // always compared over at least 128 pixels.
  span = (span & ~(128 - 1)) + 128;
  const int scaled = round((P.x.y - P.x.x) * span);
  if (scaled == span) {
    return 0;
  }
  return scaled == span * 2 ? 2 : 1;
}
// Helper function to decide whether we can safely apply 1:1 nearest filtering
// without diverging too much from the linear filter.
//   sampler - texture to sample
//   P       - unscaled UVs; lanes .x and .y of each axis are the first two
//             samples
//   span    - span length in pixels
// Returns the filter to use for the span; LINEAR_FILTER_NEAREST means that
// nearest filtering is good enough.
template <typename S, typename T>
static inline LinearFilter needsTextureLinear(S sampler, T P, int span) {
  // If each row is not wide enough for linear filtering, then just use nearest
  // filtering.
  if (sampler->width < 2) {
    return LINEAR_FILTER_NEAREST;
  }
  // First verify if the row Y doesn't change across samples
  if (P.y.x != P.y.y) {
    return LINEAR_FILTER_FALLBACK;
  }
  P = samplerScale(sampler, P);
  const int scale = spanNeedsScale(span, P);
  if (scale) {
    // If the source region is not flipped and smaller than the destination,
    // then we can use the upscaling filter since row Y is constant.
    if (P.x.x < P.x.y && P.x.y - P.x.x <= 1) {
      return LINEAR_FILTER_UPSCALE;
    }
    // An exact 2:1 reduction has its own filter; anything else falls back.
    if (scale == 2) {
      return LINEAR_FILTER_DOWNSCALE;
    }
    return LINEAR_FILTER_FALLBACK;
  }
  // Also verify that we're reasonably close to the center of a texel
  // so that it doesn't look that much different than if a linear filter
  // was used.
  const bool xOffCenter = (int(P.x.x * 4.0f + 0.5f) & 3) != 2;
  if (xOffCenter || (int(P.y.x * 4.0f + 0.5f) & 3) != 2) {
    // The source and destination regions are the same, but there is a
    // significant subpixel offset. We can use a faster linear filter to deal
    // with the offset in this case.
    return LINEAR_FILTER_FAST;
  }
  // Otherwise, we have a constant 1:1 step and we're stepping reasonably close
  // to the center of each pixel, so it's safe to disable the linear filter and
  // use nearest.
  return LINEAR_FILTER_NEAREST;
}
// Commit an entire span with linear filtering.
//   format  - output format suffix (RGBA8 or R8); pasted onto swgl_Out to name
//             the output pointer
//   s       - sampler of the texture
//   p       - UVs of the first chunk
//   uv_rect - UV bounds to clamp sampling to
//   color   - color modulus, packed with packColor() before use
//   n       - number of pixels to commit
// needsTextureLinear() picks the filter; a result of LINEAR_FILTER_NEAREST (0)
// makes the `if` condition false and selects blendTextureNearestFast instead.
// The output pointer and swgl_SpanLength are adjusted by the number of pixels
// actually drawn, which is 0 when the texture format does not match.
// NOTE: the expansion declares locals named `packed_color`, `len`, `drawn` and
// `filter`; the arguments must not refer to caller variables of those names.
// Every line of the definition must end in a line continuation, and
// swgl_Out##format must stay on one line for the token paste to apply.
#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n)              \
  do {                                                                         \
    auto packed_color = packColor(swgl_Out##format, color);                    \
    int len = (n);                                                             \
    int drawn = 0;                                                             \
    if (LinearFilter filter = needsTextureLinear(s, p, len)) {                 \
      if (blend_key) {                                                         \
        drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color,     \
                                         swgl_Out##format, filter);            \
      } else {                                                                 \
        drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color,    \
                                          swgl_Out##format, filter);           \
      }                                                                        \
    } else if (blend_key) {                                                    \
      drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color,  \
                                            swgl_Out##format);                 \
    } else {                                                                   \
      drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \
                                             swgl_Out##format);                \
    }                                                                          \
    swgl_Out##format += drawn;                                                 \
    swgl_SpanLength -= drawn;                                                  \
  } while (0)
#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \
  swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength)
#define swgl_commitTextureLinearR8(s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength)
// Commit a partial span with linear filtering, optionally inverting the color
#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(),      \
                           min(int(len), swgl_SpanLength))
#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(),        \
                           min(int(len), swgl_SpanLength))
// Commit an entire span with linear filtering that is scaled by a color
#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \
  swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength)
#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength)
// Helper function that samples from an R8 texture while expanding it to support
// a differing framebuffer format.
template <
bool BLEND,
typename S,
typename C,
typename P>
static inline int blendTextureLinearR8(S sampler, vec2 uv,
int span,
const vec4_scalar& uv_rect, C color,
P* buf) {
if (!swgl_isTextureR8(sampler) || sampler->width < 2) {
return 0;
}
LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
commit_blend_span<BLEND>(
buf, applyColor(expand_mask(buf, textureLinearUnpackedR8(
sampler,
ivec2(clamp(uv, min_uv, max_uv)))),
color));
}
return span;
}
// Commit an entire span with linear filtering while expanding from R8 to RGBA8.
//   s       - sampler of the R8 texture
//   p       - UVs of the first chunk
//   uv_rect - UV bounds to clamp sampling to
//   color   - color modulus, packed with packColor() before use
// swgl_OutRGBA8 and swgl_SpanLength are adjusted by the number of pixels
// actually drawn, which is 0 when blendTextureLinearR8 rejects the sampler.
// NOTE: the expansion declares locals named `packed_color` and `drawn`.
// Every line of the definition must end in a line continuation.
#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color)      \
  do {                                                                    \
    auto packed_color = packColor(swgl_OutRGBA8, color);                  \
    int drawn = 0;                                                        \
    if (blend_key) {                                                      \
      drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect,  \
                                         packed_color, swgl_OutRGBA8);    \
    } else {                                                              \
      drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \
                                          packed_color, swgl_OutRGBA8);   \
    }                                                                     \
    swgl_OutRGBA8 += drawn;                                               \
    swgl_SpanLength -= drawn;                                             \
  } while (0)
// Same as above without a color modulus.
#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \
  swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor())
// Compute repeating UVs, possibly constrained by tile repeat limits.
//   uv          - unwrapped UVs
//   tile_repeat - repeat limit; the limit applies only when its X component
//                 is greater than 0
// Returns the fractional part of the (possibly clamped) UVs.
static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) {
  if (!(tile_repeat.x > 0.0f)) {
    // No repeat limit: plain wrap-around.
    return fract(uv);
  }
  // Clamp to a number slightly less than the tile repeat limit so that
  // it results in a number close to but not equal to 1 after fract().
  // This avoids fract() yielding 0 if the limit was left as whole integer.
  return fract(clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f));
}
// Compute the number of non-repeating steps before we need to potentially
// repeat the UVs.
//   uv          - the UV taps of the current chunk along one axis
//   uv_step     - UV advance per chunk along that axis
//   tile_repeat - repeat limit along that axis; ignored unless greater than 0
//   steps       - upper bound for the result
// Returns 0 when the current chunk already leaves the range [0, limit),
// otherwise the number of chunks that fit before the limit, at most `steps`.
static inline int computeNoRepeatSteps(Float uv, float uv_step,
                                       float tile_repeat, int steps) {
  if (uv.w < uv.x) {
    // Ensure the UV taps are ordered low to high.
    uv = uv.wzyx;
  }
  // Check if the samples cross the boundary of the next whole integer or the
  // tile repeat limit, whichever is lower.
  float limit = floor(uv.x) + 1.0f;
  if (tile_repeat > 0.0f) {
    limit = min(limit, tile_repeat);
  }
  if (!(uv.x >= 0.0f && uv.w < limit)) {
    return 0;
  }
  if (uv_step == 0.0f) {
    // The UVs never advance, so they can never reach the limit.
    return steps;
  }
  return int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps)));
}
// Blends an entire span of texture with linear filtering and repeating UVs.
// Runs of chunks that do not cross a repeat boundary are handed to
// blendTextureLinearDispatch(); the chunk that may cross a boundary is sampled
// individually with explicitly repeated UVs.
//   sampler     - texture to sample; its format must match the destination
//   uv          - unscaled, unquantized UVs of the first chunk
//   span        - number of pixels to draw
//   tile_repeat - repeat limits passed to tileRepeatUV()
//   uv_repeat   - rectangle the repeated UVs are mapped into
//                 (x, y = offset; z - x, w - y = scale)
//   uv_rect     - UV bounds to clamp sampling to
//   color       - color applied to each sample
//   buf         - destination pixels
// Returns the number of pixels drawn: `span`, or 0 when the texture format
// does not match the destination buffer.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureLinearRepeat(S sampler, vec2 uv, int span,
                                    const vec2_scalar& tile_repeat,
                                    const vec4_scalar& uv_repeat,
                                    const vec4_scalar& uv_rect, C color,
                                    P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y};
  vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y};
  // Choose a linear filter to use for no-repeat sub-spans
  LinearFilter filter =
      needsTextureLinear(sampler, uv * uv_scale + uv_offset, span);
  // We need to step UVs unscaled and unquantized so that we can modulo them
  // with fract. We use uv_scale and uv_offset to map them into the correct
  // range.
  vec2_scalar uv_step =
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
  uv_scale = swgl_linearQuantizeStep(sampler, uv_scale);
  uv_offset = swgl_linearQuantize(sampler, uv_offset);
  vec2_scalar min_uv = max(
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f);
  vec2_scalar max_uv = max(
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv);

  P* const end = buf + span;
  while (buf < end) {
    // Find the sub-span before UVs repeat to avoid expensive repeat math
    int steps = int(end - buf) / swgl_StepSize;
    steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
    if (steps > 0) {
      steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
    }
    if (steps > 0) {
      buf = blendTextureLinearDispatch<BLEND>(
          sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize,
          uv_step * uv_scale, min_uv, max_uv, color, buf, filter);
      if (buf >= end) {
        break;
      }
      uv += steps * uv_step;
    }
    // UVs might repeat within this step, so explicitly compute repeated UVs
    vec2 repeated_uv = clamp(
        tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv);
    commit_blend_span<BLEND>(
        buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)),
                        color));
    buf += swgl_StepSize;
    uv += uv_step;
  }
  return span;
}
// Commit an entire span with linear filtering and repeating UVs.
//   format      - output format suffix; pasted onto swgl_Out to name the
//                 output pointer
//   s           - sampler of the texture
//   p           - UVs of the first chunk
//   tile_repeat - repeat limits
//   uv_repeat   - rectangle the repeated UVs are mapped into
//   uv_rect     - UV bounds to clamp sampling to
//   color       - color modulus, packed with packColor() before use
// The output pointer and swgl_SpanLength are adjusted by the number of pixels
// actually drawn, which is 0 when the texture format does not match.
// NOTE: the expansion declares locals named `packed_color` and `drawn`.
// Every line of the definition must end in a line continuation, and
// swgl_Out##format must stay on one line for the token paste to apply.
#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \
                                       uv_rect, color)                       \
  do {                                                                       \
    auto packed_color = packColor(swgl_Out##format, color);                  \
    int drawn = 0;                                                           \
    if (blend_key) {                                                         \
      drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength,          \
                                             tile_repeat, uv_repeat, uv_rect, \
                                             packed_color, swgl_Out##format); \
    } else {                                                                 \
      drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength,         \
                                              tile_repeat, uv_repeat,        \
                                              uv_rect, packed_color,         \
                                              swgl_Out##format);             \
    }                                                                        \
    swgl_Out##format += drawn;                                               \
    swgl_SpanLength -= drawn;                                                \
  } while (0)
#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat,      \
                                            uv_rect)                           \
  swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
                                 NoColor())
#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \
                                                 uv_rect, color)               \
  swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
                                 color)
// Overload selected for a uint32_t destination buffer: samples with
// textureNearestPackedRGBA8(). The buffer pointer is used only to pick the
// overload.
template <typename S>
static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf,
                                                      S sampler, ivec2 i) {
  return textureNearestPackedRGBA8(sampler, i);
}
// Blends an entire span of texture with nearest filtering and either
// repeated or clamped UVs.
template <
bool BLEND,
bool REPEAT,
typename S,
typename C,
typename P>
static int blendTextureNearestRepeat(S sampler, vec2 uv,
int span,
const vec2_scalar& tile_repeat,
const vec4_scalar& uv_rect, C color,
P* buf) {
if (!matchTextureFormat(sampler, buf)) {
return 0;
}
if (!REPEAT) {
// If clamping, then we step pre-scaled to the sampler. For repeat modes,
// this will be accomplished via uv_scale instead.
uv = samplerScale(sampler, uv);
}
vec2_scalar uv_step =
float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y});
vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w});
vec2_scalar uv_scale = max_uv - min_uv;
// If the effective sampling area of this texture is only a single pixel, then
// treat it as a solid span. For repeat modes, the bounds are specified on
// pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so
// the test varies depending on which. If the sample range on an axis is
// greater than one pixel, we can still check if we don't move far enough from
// the pixel center on that axis to hit the next pixel.
if ((
int(min_uv.x) + (REPEAT ? 1 : 0) >=
int(max_uv.x) ||
(abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) &&
(
int(min_uv.y) + (REPEAT ? 1 : 0) >=
int(max_uv.y) ||
(abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) {
vec2 repeated_uv = REPEAT
? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
: clamp(uv, min_uv, max_uv);
commit_solid_span<BLEND>(buf,
applyColor(unpack(textureNearestPacked(
buf, sampler, ivec2(repeated_uv))),
color),
span);
}
else {
for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
if (REPEAT) {
int steps =
int(end - buf) / swgl_StepSize;
// Find the sub-span before UVs repeat to avoid expensive repeat math
steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
if (steps > 0) {
steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
if (steps > 0) {
vec2 inside_uv = fract(uv) * uv_scale + min_uv;
vec2 inside_step = uv_step * uv_scale;
for (P* outside = &buf[steps * swgl_StepSize]; buf < outside;
buf += swgl_StepSize, inside_uv += inside_step) {
commit_blend_span<BLEND>(
buf, applyColor(
textureNearestPacked(buf, sampler, ivec2(inside_uv)),
color));
}
if (buf >= end) {
break;
}
uv += steps * uv_step;
}
}
}
// UVs might repeat within this step, so explicitly compute repeated UVs
vec2 repeated_uv = REPEAT
? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
: clamp(uv, min_uv, max_uv);
commit_blend_span<BLEND>(
buf,
applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)),
color));
}
}
return span;
}
// Determine if we can use the fast nearest filter for the given nearest mode.
// If the Y coordinate varies more than half a pixel over
// the span (which might cause the texel to alias to the next one), or the span
// needs X scaling, then we have to use the fallback.
//   sampler - texture to sample
//   P       - unscaled UVs; lanes .x and .y of each axis are the first two
//             samples
//   span    - span length in pixels
// Returns true when the fallback nearest filter is required.
template <typename S, typename T>
static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) {
  P = samplerScale(sampler, P);
  // Per-pixel Y delta accumulated over the whole span.
  if ((P.y.y - P.y.x) * span >= 0.5f) {
    return true;
  }
  return spanNeedsScale(span, P) != 0;
}
// Commit an entire span with nearest filtering and either clamped or repeating
// UVs.
//
// format:  suffix pasted onto swgl_Out to select the output pointer.
// s, p:    sampler and UV forwarded to the blend helpers.
// uv_rect: UV rect forwarded to the blend helpers.
// color:   color packed with packColor before being applied.
//
// needsNearestFallback picks between blendTextureNearestRepeat (with REPEAT
// disabled and a zero tile repeat) and blendTextureNearestFast; blend_key
// selects the BLEND template argument. The output pointer is advanced and
// swgl_SpanLength reduced by however many pixels the helper reports as drawn.
#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \
  do { \
    auto packed_color = packColor(swgl_Out##format, color); \
    int drawn = 0; \
    if (needsNearestFallback(s, p, swgl_SpanLength)) { \
      if (blend_key) { \
        drawn = blendTextureNearestRepeat<true, false>( \
            s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \
            swgl_Out##format); \
      } else { \
        drawn = blendTextureNearestRepeat<false, false>( \
            s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \
            swgl_Out##format); \
      } \
    } else if (blend_key) { \
      drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect, \
                                            packed_color, swgl_Out##format); \
    } else { \
      drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect, \
                                             packed_color, swgl_Out##format); \
    } \
    swgl_Out##format += drawn; \
    swgl_SpanLength -= drawn; \
  } while (0)
// RGBA8 entry points for swgl_commitTextureNearest. The plain variant passes
// NoColor() as the color, while the Color variant forwards the caller-supplied
// color.
#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \
swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor())
#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \
swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color)
// Commit an entire span with nearest filtering and repeating UVs.
//
// format:      suffix pasted onto swgl_Out to select the output pointer.
// s, p:        sampler and UV forwarded to blendTextureNearestRepeat.
// tile_repeat: tile repeat forwarded to blendTextureNearestRepeat.
// uv_rect:     UV rect forwarded to blendTextureNearestRepeat.
// color:       color packed with packColor before being applied.
//
// blendTextureNearestRepeat is always instantiated with REPEAT enabled here;
// blend_key selects the BLEND template argument. The output pointer is
// advanced and swgl_SpanLength reduced by the number of pixels drawn.
#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \
                                        color) \
  do { \
    auto packed_color = packColor(swgl_Out##format, color); \
    int drawn = 0; \
    if (blend_key) { \
      drawn = blendTextureNearestRepeat<true, true>( \
          s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \
          swgl_Out##format); \
    } else { \
      drawn = blendTextureNearestRepeat<false, true>( \
          s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \
          swgl_Out##format); \
    } \
    swgl_Out##format += drawn; \
    swgl_SpanLength -= drawn; \
  } while (0)
// RGBA8 entry points for swgl_commitTextureNearestRepeat. The plain variant
// passes NoColor() as the color, while the Color variant forwards the
// caller-supplied color.
//
// Note that uv_repeat is what gets passed in the uv_rect position of
// swgl_commitTextureNearestRepeat; the uv_rect parameter of these wrappers is
// accepted but never referenced in the expansion.
// NOTE(review): presumably uv_rect only exists so that the parameter list lines
// up with the linear-filter repeat variants, since swgl_commitTexture hands the
// same argument list to both the Linear and Nearest macros - confirm.
#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \
uv_rect) \
swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \
NoColor())
#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \
uv_repeat, uv_rect, color) \
swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color)
// Commit an entire span of texture with filtering determined by sampler state.
//
// format: suffix pasted onto swgl_commitTextureLinear / swgl_commitTextureNearest
//         to pick the concrete macro (e.g. RGBA8, RepeatColorRGBA8).
// s:      sampler; s->filter == TextureFilter::LINEAR selects the linear
//         macro, anything else selects the nearest macro.
// ...:    remaining arguments, forwarded unchanged to whichever macro is
//         chosen, so both variants must accept the same argument list.
#define swgl_commitTexture(format, s, ...) \
  do { \
    if (s->filter == TextureFilter::LINEAR) { \
      swgl_commitTextureLinear##format(s, __VA_ARGS__); \
    } else { \
      swgl_commitTextureNearest##format(s, __VA_ARGS__); \
    } \
  } while (0)
// Entry points for swgl_commitTexture. Each one fixes the suffix that is
// pasted onto swgl_commitTextureLinear / swgl_commitTextureNearest (RGBA8,
// ColorRGBA8, RepeatRGBA8, RepeatColorRGBA8) and forwards all of its arguments;
// the first forwarded argument is the sampler whose filter selects the path.
#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__)
#define swgl_commitTextureColorRGBA8(...) \
swgl_commitTexture(ColorRGBA8, __VA_ARGS__)
#define swgl_commitTextureRepeatRGBA8(...) \
swgl_commitTexture(RepeatRGBA8, __VA_ARGS__)
#define swgl_commitTextureRepeatColorRGBA8(...) \
swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__)
// Commit an entire span of a separable pass of a Gaussian blur that falls
// within the given radius scaled by supplied coefficients, clamped to uv_rect
// bounds.
template <
bool BLEND,
typename S,
typename P>
static int blendGaussianBlur(S sampler, vec2 uv,
const vec4_scalar& uv_rect,
P* buf,
int span,
bool hori,
int radius,
vec2_scalar coeffs) {
if (!matchTextureFormat(sampler, buf)) {
return 0;
}
vec2_scalar size = {
float(sampler->width),
float(sampler->height)};
ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size);
ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size));
int startX = curUV.x;
int endX = min(min(bounds.z, curUV.x + span),
int(size.x));
if (hori) {
for (; curUV.x + swgl_StepSize <= endX;
buf += swgl_StepSize, curUV.x += swgl_StepSize) {
commit_blend_span<BLEND>(
buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z,
radius, coeffs.x, coeffs.y));
}
}
else {
for (; curUV.x + swgl_StepSize <= endX;
buf += swgl_StepSize, curUV.x += swgl_StepSize) {
commit_blend_span<BLEND>(
buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w,
radius, coeffs.x, coeffs.y));
}
}
return curUV.x - startX;
}
// Commit a span of one separable Gaussian blur pass via blendGaussianBlur.
//
// format:  suffix pasted onto swgl_Out to select the output pointer.
// s, p:    sampler and UV forwarded to blendGaussianBlur.
// uv_rect: bounds forwarded to blendGaussianBlur.
// hori:    true for the horizontal pass, false for the vertical pass.
// radius:  blur radius.
// coeffs:  blur coefficients.
//
// blend_key selects the BLEND template argument. The output pointer is
// advanced and swgl_SpanLength reduced by the number of pixels drawn, which
// may be fewer than the full span.
#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \
  do { \
    int drawn = 0; \
    if (blend_key) { \
      drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format, \
                                      swgl_SpanLength, hori, radius, coeffs); \
    } else { \
      drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format, \
                                       swgl_SpanLength, hori, radius, coeffs); \
    } \
    swgl_Out##format += drawn; \
    swgl_SpanLength -= drawn; \
  } while (0)
// Entry points for swgl_commitGaussianBlur that fix the output format suffix
// to RGBA8 or R8 and forward every other argument unchanged.
#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \
swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs)
#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \
swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs)
// Converts planar Y, U and V samples to packed RGBA8 output using the supplied
// color space matrix. The Y samples are zipped with themselves and the U
// samples are zipped with the V samples before both are handed to the matrix's
// convert routine as 8-lane int16_t vectors.
static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr,
                                            U16 y, U16 u, U16 v) {
  return rgb_from_ycbcr.convert(V8<int16_t>(zip(y, y)),
                                V8<int16_t>(zip(u, v)));
}
// Helper functions to sample from planar YUV textures before converting to RGB.
//
// Single-texture variant: all of Y, U and V are taken from sampler0, which
// must be either RGBA8 or YUY2. Any other format trips an assert and yields a
// zeroed pixel. rescaleFactor is not used by this variant.
template <typename S0>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           UNUSED int rescaleFactor) {
  const auto format = sampler0->format;
  if (format == TextureFormat::RGBA8) {
    auto texels = textureLinearPlanarRGBA8(sampler0, uv0);
    return convertYUV(rgb_from_ycbcr, highHalf(texels.rg), lowHalf(texels.rg),
                      lowHalf(texels.ba));
  }
  if (format == TextureFormat::YUY2) {
    auto texels = textureLinearPlanarYUY2(sampler0, uv0);
    return convertYUV(rgb_from_ycbcr, texels.y, texels.u, texels.v);
  }
  // Unsupported format.
  assert(false);
  return PackedRGBA8(0);
}
template <
bool BLEND,
typename S0,
typename P,
typename C = NoColor>
static int blendYUV(P* buf,
int span, S0 sampler0, vec2 uv0,
const vec4_scalar& uv_rect0,
const vec3_scalar& ycbcr_bias,
const mat3_scalar& rgb_from_debiased_ycbcr,
int rescaleFactor, C color = C()) {
if (!swgl_isTextureLinear(sampler0)) {
return 0;
}
LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
const auto rgb_from_ycbcr =
YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
auto c = packColor(buf, color);
auto* end = buf + span;
for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) {
commit_blend_span<BLEND>(
buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
rgb_from_ycbcr, rescaleFactor),
c));
}
return span;
}
// Two-texture variant: Y is taken from sampler0 and the U/V pair from
// sampler1. The format of sampler1 selects the layout:
//   RG8    - sampler0 must be R8.
//   RGBA8  - sampler0 must be R8.
//   RG16   - sampler0 must be R16; samples are shifted down to 8 bits.
// Any other format trips an assert and yields a zeroed pixel.
template <typename S0, typename S1>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
                                           ivec2 uv1,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           int rescaleFactor) {
  const auto uv_format = sampler1->format;
  if (uv_format == TextureFormat::RG8) {
    assert(sampler0->format == TextureFormat::R8);
    auto y_samples = textureLinearUnpackedR8(sampler0, uv0);
    auto uv_samples = textureLinearPlanarRG8(sampler1, uv1);
    return convertYUV(rgb_from_ycbcr, y_samples, lowHalf(uv_samples.rg),
                      highHalf(uv_samples.rg));
  }
  if (uv_format == TextureFormat::RGBA8) {
    assert(sampler0->format == TextureFormat::R8);
    auto y_samples = textureLinearUnpackedR8(sampler0, uv0);
    auto uv_samples = textureLinearPlanarRGBA8(sampler1, uv1);
    return convertYUV(rgb_from_ycbcr, y_samples, lowHalf(uv_samples.ba),
                      highHalf(uv_samples.rg));
  }
  if (uv_format == TextureFormat::RG16) {
    assert(sampler0->format == TextureFormat::R16);
    // The rescaling factor is the number of bits that were added to
    // renormalize the texture to 16 bits, so the real color depth is 16 minus
    // that factor. Samples must be shifted right by however many bits they
    // occupy beyond 8. textureLinearUnpackedR16 has already dropped 1 bit of
    // precision at the low end, so 1 is subtracted from the color depth first.
    int depth = 16 - rescaleFactor;
    int shift = (depth - 1) - 8;
    auto y_samples = textureLinearUnpackedR16(sampler0, uv0) >> shift;
    auto uv_samples = textureLinearUnpackedRG16(sampler1, uv1) >> shift;
    return rgb_from_ycbcr.convert(zip(y_samples, y_samples), uv_samples);
  }
  // Unsupported format.
  assert(false);
  return PackedRGBA8(0);
}
// Commits an entire span of a YUV surface split across two textures.
//
// Returns 0 without drawing anything if either sampler is rejected by
// swgl_isTextureLinear; otherwise every chunk of the span is sampled with each
// plane's UVs clamped to its own [min_uv, max_uv], converted to RGB, modulated
// by the packed color, and committed, and span is returned.
template <bool BLEND, typename S0, typename S1, typename P,
          typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
                    const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  const bool all_linear =
      swgl_isTextureLinear(sampler0) && swgl_isTextureLinear(sampler1);
  if (!all_linear) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  auto packed_color = packColor(buf, color);
  P* const span_end = buf + span;
  while (buf < span_end) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                                  rgb_from_ycbcr, rescaleFactor),
                        packed_color));
    buf += swgl_StepSize;
    uv0 += uv_step0;
    uv1 += uv_step1;
  }
  return span;
}
// Three-texture variant: Y, U and V each come from their own sampler. All
// three samplers are asserted to share one format, which must be R8 or R16;
// any other format trips an assert and yields a zeroed pixel.
template <typename S0, typename S1, typename S2>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
                                           ivec2 uv1, S2 sampler2, ivec2 uv2,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           int rescaleFactor) {
  assert(sampler0->format == sampler1->format &&
         sampler0->format == sampler2->format);
  const auto format = sampler0->format;
  if (format == TextureFormat::R8) {
    auto y_samples = textureLinearUnpackedR8(sampler0, uv0);
    auto u_samples = textureLinearUnpackedR8(sampler1, uv1);
    auto v_samples = textureLinearUnpackedR8(sampler2, uv2);
    return convertYUV(rgb_from_ycbcr, y_samples, u_samples, v_samples);
  }
  if (format == TextureFormat::R16) {
    // The rescaling factor is the number of bits that were added to
    // renormalize the texture to 16 bits, so the real color depth is 16 minus
    // that factor. Samples must be shifted right by however many bits they
    // occupy beyond 8. textureLinearUnpackedR16 has already dropped 1 bit of
    // precision at the low end, so 1 is subtracted from the color depth first.
    int depth = 16 - rescaleFactor;
    int shift = (depth - 1) - 8;
    auto y_samples = textureLinearUnpackedR16(sampler0, uv0) >> shift;
    auto u_samples = textureLinearUnpackedR16(sampler1, uv1) >> shift;
    auto v_samples = textureLinearUnpackedR16(sampler2, uv2) >> shift;
    return convertYUV(rgb_from_ycbcr, U16(y_samples), U16(u_samples),
                      U16(v_samples));
  }
  // Unsupported format.
  assert(false);
  return PackedRGBA8(0);
}
// Fallback helper for when we can't specifically accelerate YUV with
// composition.
//
// Walks the span one chunk (swgl_StepSize pixels) at a time. For each chunk the
// three planes are sampled with their UVs clamped to the per-plane
// [min_uv, max_uv] range, the result is converted to RGB, modulated by color,
// and committed. The UVs and steps are expected to be already quantized by
// the caller; color is applied as given.
template <bool BLEND, typename S0, typename S1, typename S2, typename P,
          typename C>
static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0,
                             vec2_scalar uv_step0, vec2_scalar min_uv0,
                             vec2_scalar max_uv0, S1 sampler1, vec2 uv1,
                             vec2_scalar uv_step1, vec2_scalar min_uv1,
                             vec2_scalar max_uv1, S2 sampler2, vec2 uv2,
                             vec2_scalar uv_step2, vec2_scalar min_uv2,
                             vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias,
                             const mat3_scalar& rgb_from_debiased_ycbcr,
                             int rescaleFactor, C color) {
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  P* const span_end = buf + span;
  while (buf < span_end) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                                  sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)),
                                  rgb_from_ycbcr, rescaleFactor),
                        color));
    buf += swgl_StepSize;
    uv0 += uv_step0;
    uv1 += uv_step1;
    uv2 += uv_step2;
  }
}
// Commits an entire span of a YUV surface split across three textures.
//
// Returns 0 without drawing anything if any sampler is rejected by
// swgl_isTextureLinear. Otherwise the UVs of each plane are quantized, the
// color is packed for the output buffer, the whole span is handed to
// blendYUVFallback, and span is returned.
template <bool BLEND, typename S0, typename S1, typename S2, typename P,
          typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
                    const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2,
                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  const bool all_linear = swgl_isTextureLinear(sampler0) &&
                          swgl_isTextureLinear(sampler1) &&
                          swgl_isTextureLinear(sampler2);
  if (!all_linear) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
  auto packed_color = packColor(buf, color);
  blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0,
                          sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2,
                          uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
                          rgb_from_debiased_ycbcr, rescaleFactor,
                          packed_color);
  return span;
}
// A variant of the blendYUV that attempts to reuse the inner loops from the
// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on
// the source data, which in turn allows it to be much faster than blendYUV.
// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer
// and that no color scaling is applied, which we can accomplish via template
// specialization. We need to further validate inside that texture formats
// and dimensions are sane for video and that the video is axis-aligned before
// acceleration can proceed.
//
// Returns 0 without drawing anything if any sampler is rejected by
// swgl_isTextureLinear; otherwise the whole span is committed and span is
// returned. The span is processed in up to three pieces: a leading piece
// through blendYUVFallback until the UVs are inside the clamp rect, a middle
// piece through linear_row_yuv, and whatever remains through blendYUVFallback.
template <
bool BLEND>
static int blendYUV(uint32_t* buf,
int span, sampler2DRect sampler0, vec2 uv0,
const vec4_scalar& uv_rect0, sampler2DRect sampler1,
vec2 uv1,
const vec4_scalar& uv_rect1,
sampler2DRect sampler2, vec2 uv2,
const vec4_scalar& uv_rect2,
const vec3_scalar& ycbcr_bias,
const mat3_scalar& rgb_from_debiased_ycbcr,
int rescaleFactor, NoColor noColor = NoColor()) {
if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
!swgl_isTextureLinear(sampler2)) {
return 0;
}
LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
// Remember the end of the span up front, since buf is advanced as pieces of
// the span are consumed below.
auto* end = buf + span;
// CompositeYUV imposes further restrictions on the source textures, such that
// the Y/U/V samplers must all have a matching format, the U/V samplers
// must have matching sizes and sample coordinates, and there must be no
// change in row across the entire span.
if (sampler0->format == sampler1->format &&
sampler1->format == sampler2->format &&
sampler1->width == sampler2->width &&
sampler1->height == sampler2->height && uv_step0.y == 0 &&
uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 &&
uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) {
// CompositeYUV does not support a clamp rect, so we must take care to
// advance till we're inside the bounds of the clamp rect.
// The count is in chunks and is capped at the number of chunks remaining.
int outside = min(
int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x,
(min_uv1.x - uv1.x.x) / uv_step1.x))),
(end - buf) / swgl_StepSize);
if (outside > 0) {
blendYUVFallback<BLEND>(buf, outside * swgl_StepSize, sampler0, uv0,
uv_step0, min_uv0, max_uv0, sampler1, uv1,
uv_step1, min_uv1, max_uv1, sampler2, uv2,
uv_step2, min_uv2, max_uv2, ycbcr_bias,
rgb_from_debiased_ycbcr, rescaleFactor, noColor);
// Only X needs advancing; the Y steps were verified to be zero above.
buf += outside * swgl_StepSize;
uv0.x += outside * uv_step0.x;
uv1.x += outside * uv_step1.x;
uv2.x += outside * uv_step2.x;
}
// Find the amount of chunks inside the clamp rect before we hit the
// maximum. If there are any chunks inside, we can finally dispatch to
// CompositeYUV.
int inside = min(
int(min((max_uv0.x - uv0.x.x) / uv_step0.x,
(max_uv1.x - uv1.x.x) / uv_step1.x)),
(end - buf) / swgl_StepSize);
if (inside > 0) {
// We need the color depth, which is relative to the texture format and
// rescale factor.
int colorDepth =
(sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor;
// Finally, call the inner loop of CompositeYUV.
const auto rgb_from_ycbcr =
YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
// The X steps are divided by swgl_StepSize before being handed over;
// NOTE(review): presumably linear_row_yuv expects a per-pixel rather than
// a per-chunk step - confirm against its definition.
linear_row_yuv<BLEND>(
buf, inside * swgl_StepSize, sampler0, force_scalar(uv0),
uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1),
uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr);
// Now that we're done, advance past the processed inside portion.
buf += inside * swgl_StepSize;
uv0.x += inside * uv_step0.x;
uv1.x += inside * uv_step1.x;
uv2.x += inside * uv_step2.x;
}
}
// We either got here because we have some samples outside the clamp rect, or
// because some of the preconditions were not satisfied. Process whatever is
// left of the span.
blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0,
max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1,
sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
rgb_from_debiased_ycbcr, rescaleFactor, noColor);
return span;
}
// Commit an entire span of a YUV surface represented by multiple planar
// textures. This requires a color space specifier selecting how to convert
// from YUV to RGB output. In the case of HDR formats, a rescaling factor
// selects how many bits of precision must be utilized on conversion. See the
// sampleYUV dispatcher functions for the various supported plane
// configurations this intrinsic accepts.
//
// All arguments are forwarded to blendYUV after the output pointer
// (swgl_OutRGBA8) and span length (swgl_SpanLength); blend_key selects the
// BLEND template argument. The output pointer is advanced and swgl_SpanLength
// reduced by the number of pixels blendYUV reports as drawn.
#define swgl_commitTextureLinearYUV(...) \
  do { \
    int drawn = 0; \
    if (blend_key) { \
      drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
    } else { \
      drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
    } \
    swgl_OutRGBA8 += drawn; \
    swgl_SpanLength -= drawn; \
  } while (0)
// Commit an entire span of a YUV surface scaled by a color. This simply
// forwards every argument to swgl_commitTextureLinearYUV; the color is expected
// to be supplied as the trailing argument, where it binds to the color
// parameter of blendYUV.
#define swgl_commitTextureLinearColorYUV(...) \
swgl_commitTextureLinearYUV(__VA_ARGS__)
// Each gradient stops entry is a pair of RGBA32F start color and end step.
struct GradientStops {
Float startColor;
union {
Float stepColor;
vec4_scalar stepData;
};
// Whether this gradient entry can be merged with an adjacent entry. The
// step will be equal with the adjacent step if and only if they can be
// merged, or rather, that the stops are actually part of a single larger
// gradient.
bool can_merge(
const GradientStops& next)
const {
return stepData == next.stepData;
}
// Get the interpolated color within the entry based on the offset from its
// start.
Float interpolate(
float offset)
const {
return startColor + stepColor * offset;
}
// Get the end color of the entry where interpolation stops.
Float end_color()
const {
return startColor + stepColor; }
};
// Checks if a gradient table of the specified size exists at the UV coords of
// the address within an RGBA32F texture. If so, a linear address within the
// texture is returned that may be used to sample the gradient table later. If
// the address doesn't describe a valid gradient, then a negative value is
// returned.
//
// sampler: texture that should hold the table; must have format RGBA32F.
// address: coordinates of the first entry; must lie inside the texture.
// entries: number of GradientStops entries; must be positive, and the whole
//          table must fit in the row starting at address.x.
// Returns address.y * stride + address.x * 4 on success, or -1 on failure.
static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address,
                                        int entries) {
  // Number of Float-sized slots that each table entry occupies in a row.
  const int slotsPerEntry = int(sizeof(GradientStops) / sizeof(Float));
  if (sampler->format != TextureFormat::RGBA32F) {
    return -1;
  }
  if (address.y < 0 || address.y >= int(sampler->height)) {
    return -1;
  }
  if (address.x < 0 || address.x >= int(sampler->width)) {
    return -1;
  }
  // Check that the table fits in the remainder of the row. address.x is known
  // to be within [0, width) at this point, so the subtraction is positive and
  // the comparison is done by division rather than by computing
  // address.x + slotsPerEntry * entries, which could overflow a signed int
  // for a very large entry count.
  const int slotsAvailable = int(sampler->width) - address.x;
  if (entries <= 0 || entries > slotsAvailable / slotsPerEntry) {
    return -1;
  }
  return address.y * sampler->stride + address.x * 4;
}
static inline WideRGBA8 sampleGradient(sampler2D sampler,
int address,
Float entry) {
assert(sampler->format == TextureFormat::RGBA32F);
assert(address >= 0 && address <
int(sampler->height * sampler->stride));
// Get the integer portion of the entry index to find the entry colors.
I32 index = cast(entry);
// Use the fractional portion of the entry index to control blending between
// entry colors.
Float offset = entry - cast(index);
// Every entry is a pair of colors blended by the fractional offset.
assert(test_all(index >= 0 &&
--> --------------------
--> maximum size reached
--> --------------------