// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// These templates are not found via ADL. using hwy::HWY_NAMESPACE::Abs; using hwy::HWY_NAMESPACE::Add; using hwy::HWY_NAMESPACE::DemoteTo; using hwy::HWY_NAMESPACE::Ge; using hwy::HWY_NAMESPACE::IfThenElseZero; using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; using hwy::HWY_NAMESPACE::Rebind; using hwy::HWY_NAMESPACE::Round; using hwy::HWY_NAMESPACE::Sub; using hwy::HWY_NAMESPACE::Vec;
using D = HWY_FULL(float); using DI = HWY_FULL(int32_t);
template <size_t N> void AddReverse(constfloat* JXL_RESTRICT a_in1, constfloat* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
HWY_CAPPED(float, 8) d8; for (size_t i = 0; i < N; i++) { auto in1 = Load(d8, a_in1 + i * 8); auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
Store(Add(in1, in2), d8, a_out + i * 8);
}
}
template <size_t N> void SubReverse(constfloat* JXL_RESTRICT a_in1, constfloat* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
HWY_CAPPED(float, 8) d8; for (size_t i = 0; i < N; i++) { auto in1 = Load(d8, a_in1 + i * 8); auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
Store(Sub(in1, in2), d8, a_out + i * 8);
}
}
// In-place update of N rows (stride 8 floats) starting at `coeff`:
//   row 0     <- row 0 * sqrt(2) + row 1
//   row i     <- row i + row (i + 1)      for 1 <= i < N - 1
//   row N - 1 is left untouched.
// Rows are processed in increasing order, so each sum reads the not yet
// modified next row.
template <size_t N>
void B(float* JXL_RESTRICT coeff) {
  // Rows 0 and 1 are read unconditionally below; N == 1 would read past the
  // end of a one-row buffer.
  static_assert(N >= 2, "B<N> requires at least two rows");

  HWY_CAPPED(float, 8) d8;
  constexpr float kSqrt2 = 1.41421356237f;
  const auto sqrt2 = Set(d8, kSqrt2);

  const auto row0 = Load(d8, coeff);
  const auto row1 = Load(d8, coeff + 8);
  Store(MulAdd(row0, sqrt2, row1), d8, coeff);

  for (size_t i = 1; i + 1 < N; i++) {
    const auto cur = Load(d8, coeff + i * 8);
    const auto next = Load(d8, coeff + (i + 1) * 8);
    Store(Add(cur, next), d8, coeff + i * 8);
  }
}
// Ideally optimized away by compiler (except the multiply). template <size_t N> void InverseEvenOdd(constfloat* JXL_RESTRICT a_in, float* JXL_RESTRICT a_out) {
HWY_CAPPED(float, 8) d8; for (size_t i = 0; i < N / 2; i++) { auto in1 = Load(d8, a_in + i * 8);
Store(in1, d8, a_out + 2 * i * 8);
} for (size_t i = N / 2; i < N; i++) { auto in1 = Load(d8, a_in + i * 8);
Store(in1, d8, a_out + (2 * (i - N / 2) + 1) * 8);
}
}
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
//
// Only declared here; the per-N specializations that hold the constants are
// expected to be provided separately.
template <size_t N>
struct WcMultipliers;
// 2-point base case of the recursive DCT, applied in place to two rows
// (stride 8 floats) at `mem`:
//   row 0 <- row 0 + row 1
//   row 1 <- row 0 - row 1   (using the original row values)
// NOTE(review): as an explicit specialization this needs the primary
// DCT1DImpl template to be declared before this point.
template <>
struct DCT1DImpl<2> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    HWY_CAPPED(float, 8) d8;
    // Load both rows before storing, so the difference uses unmodified data.
    const auto in1 = Load(d8, mem);
    const auto in2 = Load(d8, mem + 8);
    Store(Add(in1, in2), d8, mem);
    Store(Sub(in1, in2), d8, mem + 8);
  }
};
template <size_t N> struct DCT1DImpl { voidoperator()(float* JXL_RESTRICT mem) {
HWY_ALIGN float tmp[N * 8];
AddReverse<N / 2>(mem, mem + N * 4, tmp);
DCT1DImpl<N / 2>()(tmp);
SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
Multiply<N>(tmp);
DCT1DImpl<N / 2>()(tmp + N * 4);
B<N / 2>(tmp + N * 4);
InverseEvenOdd<N>(tmp, mem);
}
};
void DCT1D(constfloat* JXL_RESTRICT pixels, size_t pixels_stride, float* JXL_RESTRICT output) {
HWY_CAPPED(float, 8) d8;
HWY_ALIGN float tmp[64]; for (size_t i = 0; i < 8; i += Lanes(d8)) { // TODO(veluca): consider removing the temporary memory here (as is done in // IDCT), if it turns out that some compilers don't optimize away the loads // and this is performance-critical.
LoadFromBlock(pixels, pixels_stride, i, tmp);
DCT1DImpl<8>()(tmp);
StoreToBlockAndScale(tmp, output, i);
}
}
// Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
// zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
// noch Qualität der bereitgestellten Informationen zugesichert.
// Bemerkung:
// Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.