// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file.
// These templates are not found via ADL. using hwy::HWY_NAMESPACE::AbsDiff; using hwy::HWY_NAMESPACE::Add; using hwy::HWY_NAMESPACE::And; using hwy::HWY_NAMESPACE::Div; using hwy::HWY_NAMESPACE::Floor; using hwy::HWY_NAMESPACE::GetLane; using hwy::HWY_NAMESPACE::Max; using hwy::HWY_NAMESPACE::Min; using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; using hwy::HWY_NAMESPACE::NegMulAdd; using hwy::HWY_NAMESPACE::Rebind; using hwy::HWY_NAMESPACE::ShiftLeft; using hwy::HWY_NAMESPACE::ShiftRight; using hwy::HWY_NAMESPACE::Sqrt; using hwy::HWY_NAMESPACE::Sub; using hwy::HWY_NAMESPACE::ZeroIfNegative;
// Scales input samples (nominally 8-bit, [0, 255] — the 1/255 value suggests
// this; TODO confirm against callers) into the [0, 1] range used internally.
// Several constants below divide or multiply by this factor to match ranges.
constexpr float kInputScaling = 1.0f / 255.0f;
// Primary template: default to actual division. Used for lane types with no
// faster specialization.
template <typename T, class V>
struct FastDivision {
  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
};
// Partial specialization for float vectors.
template <class V>
struct FastDivision<float, V> {
  // One Newton-Raphson iteration refining the hardware reciprocal estimate:
  // rcp' = 2 * rcp - x * rcp * rcp, computed here as NegMulAdd(x*rcp, rcp, 2*rcp).
  static HWY_INLINE V ReciprocalNR(const V x) {
    const auto rcp = ApproximateReciprocal(x);
    const auto sum = Add(rcp, rcp);
    const auto x_rcp = Mul(x, rcp);
    return NegMulAdd(x_rcp, rcp, sum);
  }

  V operator()(const V n, const V d) const {
#if JXL_TRUE  // Faster on SKX
    return Div(n, d);
#else
    return n * ReciprocalNR(d);
#endif
  }
};
// Approximates smooth functions via rational polynomials (i.e. dividing two // polynomials). Evaluates polynomials via Horner's scheme, which is faster than // Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to // specify constants (replicated 4x) independently of the lane count. template <size_t NP, size_t NQ, class D, class V, typename T>
HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x, const T (&p)[NP], const T (&q)[NQ]) {
constexpr size_t kDegP = NP / 4 - 1;
constexpr size_t kDegQ = NQ / 4 - 1; auto yp = LoadDup128(d, &p[kDegP * 4]); auto yq = LoadDup128(d, &q[kDegQ * 4]); // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a // compiler warning that the index is out of bounds since we are already // checking that it is not out of bounds with (kDegP >= n) and the access // will be optimized away. Similarly with q and kDegQ.
HWY_FENCE; if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4))); if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
HWY_FENCE; if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4))); if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
HWY_FENCE; if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4))); if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
HWY_FENCE; if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4))); if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
HWY_FENCE; if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4))); if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
HWY_FENCE; if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4))); if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
HWY_FENCE; if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4))); if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
return FastDivision<T, V>()(yp, yq);
}
// Computes base-2 logarithm like std::log2. Undefined if negative / NaN. // L1 error ~3.9E-6 template <class DF, class V>
V FastLog2f(const DF df, V x) { // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
HWY_ALIGN constfloat p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
HWY_REP4(1.4287160470083755E+00f),
HWY_REP4(7.4245873327820566E-01f)};
HWY_ALIGN constfloat q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
HWY_REP4(1.0096718572241148E+00f),
HWY_REP4(1.7409343003366853E-01f)};
// Avoid division by zero. constauto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f)); constauto v2 = Div(k1, Add(v1, kOffset2)); constauto v3 = Div(k1, MulAdd(v1, v1, kOffset3)); constauto v4 = Div(k1, MulAdd(v1, v1, kOffset4)); // TODO(jyrki): // A log or two here could make sense. In butteraugli we have effectively // log(log(x + C)) for this kind of use, as a single log is used in // saturating visual masking and here the modulation values are exponential, // another log would counter that. return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
}
// mul and mul2 represent a scaling difference between jxl and butteraugli.
const float kSGmul = 226.0480446705883f;
const float kSGmul2 = 1.0f / 73.377132366608819f;
const float kLog2 = 0.693147181f;
// Includes correction factor for std::log -> log2.
const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
const float kSGVOffset = 7.14672470003f;
template <bool invert, typename D, typename V>
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { // The opsin space in jxl is the cubic root of photons, i.e., v * v * v // is related to the number of photons. // // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. // This ratio allows quantization to move from jxl's opsin space to // butteraugli's log-gamma space. staticconstfloat kEpsilon = 1e-2; staticconstfloat kNumOffset = kEpsilon / kInputScaling / kInputScaling; staticconstfloat kNumMul = kSGRetMul * 3 * kSGmul; staticconstfloat kVOffset = (kSGVOffset * kLog2 + kEpsilon) / kInputScaling; staticconstfloat kDenMul = kLog2 * kSGmul * kInputScaling * kInputScaling;
// TODO(veluca): this function computes an approximation of the derivative of // SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or // exact derivatives. For reference, SimpleGamma was: /* template <typename D, typename V> V SimpleGamma(const D d, V v) { // A simple HDR compatible gamma function. const auto mul = Set(d, kSGmul); const auto kRetMul = Set(d, kSGRetMul); const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f); const auto kVOffset = Set(d, kSGVOffset);
v *= mul;
// This should happen rarely, but may lead to a NaN, which is rather // undesirable. Since negative photons don't exist we solve the NaNs by // clamping here. // TODO(veluca): with FastLog2f, this no longer leads to NaNs. v = ZeroIfNegative(v); return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd; }
*/
template <class D, class V>
V GammaModulation(const D d, const size_t x, const size_t y, const RowBuffer<float>& input, const V out_val) { staticconstfloat kBias = 0.16f / kInputScaling; staticconstfloat kScale = kInputScaling / 64.0f; auto overall_ratio = Zero(d); constauto bias = Set(d, kBias); constauto scale = Set(d, kScale); constfloat* const JXL_RESTRICT block_start = input.Row(y) + x; for (size_t dy = 0; dy < 8; ++dy) { constfloat* const JXL_RESTRICT row_in = block_start + dy * input.stride(); for (size_t dx = 0; dx < 8; dx += Lanes(d)) { constauto iny = Add(Load(d, row_in + dx), bias); constauto ratio_g =
RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, iny);
overall_ratio = Add(overall_ratio, ratio_g);
}
}
overall_ratio = Mul(SumOfLanes(d, overall_ratio), scale); // ideally -1.0, but likely optimal correction adds some entropy, so slightly // less than that. // ln(2) constant folded in because we want std::log but have FastLog2f. constauto kGamma = Set(d, -0.15526878023684174f * 0.693147180559945f); return MulAdd(kGamma, FastLog2f(d, overall_ratio), out_val);
}
// Change precision in 8x8 blocks that have high frequency content. template <class D, class V>
V HfModulation(const D d, const size_t x, const size_t y, const RowBuffer<float>& input, const V out_val) { // Zero out the invalid differences for the rightmost value per row. const Rebind<uint32_t, D> du;
HWY_ALIGN constexpr uint32_t kMaskRight[8] = {~0u, ~0u, ~0u, ~0u,
~0u, ~0u, ~0u, 0};
auto sum = Zero(d); // sum of absolute differences with right and below staticconstfloat kSumCoeff = -2.0052193233688884f * kInputScaling / 112.0; auto sumcoeff = Set(d, kSumCoeff);
constfloat* const JXL_RESTRICT block_start = input.Row(y) + x; for (size_t dy = 0; dy < 8; ++dy) { constfloat* JXL_RESTRICT row_in = block_start + dy * input.stride(); constfloat* JXL_RESTRICT row_in_next =
dy == 7 ? row_in : row_in + input.stride();
// The XYB gamma is 3.0 to be able to decode faster with two muls. // Butteraugli's gamma is matching the gamma of human eye, around 2.6. // We approximate the gamma difference by adding one cubic root into // the adaptive quantization. This gives us a total gamma of 2.6666 // for quantization uses. staticconstfloat match_gamma_offset = 0.019 / kInputScaling;
// NOTE(review): the file is truncated here — the remainder of HfModulation's
// body is missing. The text that followed was an unrelated German website
// disclaimer (roughly: "the information on this website was compiled to the
// best of our knowledge; neither completeness, correctness nor quality is
// guaranteed; the syntax highlighting and measurement are still
// experimental") — extraction residue, not part of this source file.