// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file.
// Fast SIMD floating-point (I)DCT, any power of two.
// These templates are not found via ADL. using hwy::HWY_NAMESPACE::Add; using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; using hwy::HWY_NAMESPACE::NegMulAdd; using hwy::HWY_NAMESPACE::Sub;
// Maps a compile-time lane count to a Highway float vector descriptor.
// The generic case caps the vector at SZ lanes.
template <size_t SZ>
struct FVImpl {
  using type = HWY_CAPPED(float, SZ);
};

// SZ == 0 is a sentinel meaning "use the full native vector width".
template <>
struct FVImpl<0> {
  using type = HWY_FULL(float);
};

// Convenience alias: FV<SZ> is the float vector descriptor with at most SZ
// lanes, or the full-width descriptor when SZ == 0.
template <size_t SZ>
using FV = typename FVImpl<SZ>::type;
// Implementation of Lowest Complexity Self Recursive Radix-2 DCT II/III // Algorithms, by Siriani M. Perera and Jianhua Liu.
template <size_t N, size_t SZ> struct CoeffBundle { staticvoid AddReverse(constfloat* JXL_RESTRICT a_in1, constfloat* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) { for (size_t i = 0; i < N; i++) { auto in1 = Load(FV<SZ>(), a_in1 + i * SZ); auto in2 = Load(FV<SZ>(), a_in2 + (N - i - 1) * SZ);
Store(Add(in1, in2), FV<SZ>(), a_out + i * SZ);
}
} staticvoid SubReverse(constfloat* JXL_RESTRICT a_in1, constfloat* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) { for (size_t i = 0; i < N; i++) { auto in1 = Load(FV<SZ>(), a_in1 + i * SZ); auto in2 = Load(FV<SZ>(), a_in2 + (N - i - 1) * SZ);
Store(Sub(in1, in2), FV<SZ>(), a_out + i * SZ);
}
} staticvoid B(float* JXL_RESTRICT coeff) { auto sqrt2 = Set(FV<SZ>(), kSqrt2); auto in1 = Load(FV<SZ>(), coeff); auto in2 = Load(FV<SZ>(), coeff + SZ);
Store(MulAdd(in1, sqrt2, in2), FV<SZ>(), coeff); for (size_t i = 1; i + 1 < N; i++) { auto in1 = Load(FV<SZ>(), coeff + i * SZ); auto in2 = Load(FV<SZ>(), coeff + (i + 1) * SZ);
Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
}
} staticvoid BTranspose(float* JXL_RESTRICT coeff) { for (size_t i = N - 1; i > 0; i--) { auto in1 = Load(FV<SZ>(), coeff + i * SZ); auto in2 = Load(FV<SZ>(), coeff + (i - 1) * SZ);
Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
} auto sqrt2 = Set(FV<SZ>(), kSqrt2); auto in1 = Load(FV<SZ>(), coeff);
Store(Mul(in1, sqrt2), FV<SZ>(), coeff);
} // Ideally optimized away by compiler (except the multiply). staticvoid InverseEvenOdd(constfloat* JXL_RESTRICT a_in, float* JXL_RESTRICT a_out) { for (size_t i = 0; i < N / 2; i++) { auto in1 = Load(FV<SZ>(), a_in + i * SZ);
Store(in1, FV<SZ>(), a_out + 2 * i * SZ);
} for (size_t i = N / 2; i < N; i++) { auto in1 = Load(FV<SZ>(), a_in + i * SZ);
Store(in1, FV<SZ>(), a_out + (2 * (i - N / 2) + 1) * SZ);
}
} // Ideally optimized away by compiler. staticvoid ForwardEvenOdd(constfloat* JXL_RESTRICT a_in, size_t a_in_stride, float* JXL_RESTRICT a_out) { for (size_t i = 0; i < N / 2; i++) { auto in1 = LoadU(FV<SZ>(), a_in + 2 * i * a_in_stride);
Store(in1, FV<SZ>(), a_out + i * SZ);
} for (size_t i = N / 2; i < N; i++) { auto in1 = LoadU(FV<SZ>(), a_in + (2 * (i - N / 2) + 1) * a_in_stride);
Store(in1, FV<SZ>(), a_out + i * SZ);
}
} // Invoked on full vector. staticvoid Multiply(float* JXL_RESTRICT coeff) { for (size_t i = 0; i < N / 2; i++) { auto in1 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ); auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
Store(Mul(in1, mul), FV<SZ>(), coeff + (N / 2 + i) * SZ);
}
} staticvoid MultiplyAndAdd(constfloat* JXL_RESTRICT coeff, float* JXL_RESTRICT out, size_t out_stride) { for (size_t i = 0; i < N / 2; i++) { auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]); auto in1 = Load(FV<SZ>(), coeff + i * SZ); auto in2 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ); auto out1 = MulAdd(mul, in2, in1); auto out2 = NegMulAdd(mul, in2, in1);
StoreU(out1, FV<SZ>(), out + i * out_stride);
StoreU(out2, FV<SZ>(), out + (N - i - 1) * out_stride);
}
} template <typename Block> staticvoid LoadFromBlock(const Block& in, size_t off, float* JXL_RESTRICT coeff) { for (size_t i = 0; i < N; i++) {
Store(in.LoadPart(FV<SZ>(), i, off), FV<SZ>(), coeff + i * SZ);
}
} template <typename Block> staticvoid StoreToBlockAndScale(constfloat* JXL_RESTRICT coeff, const Block& out, size_t off) { auto mul = Set(FV<SZ>(), 1.0f / N); for (size_t i = 0; i < N; i++) {
out.StorePart(FV<SZ>(), Mul(mul, Load(FV<SZ>(), coeff + i * SZ)), i, off);
}
}
};
// Runs an N-point 1D DCT over M columns (M given at compile time via M_or_0,
// or at run time via Mp when M_or_0 == 0), processing one vector of columns
// per iteration.
template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
                  float* JXL_RESTRICT tmp) {
  const size_t m = M_or_0 != 0 ? M_or_0 : Mp;
  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
  for (size_t i = 0; i < m; i += Lanes(FV<M_or_0>())) {
    // TODO(veluca): consider removing the temporary memory here (as is done
    // in IDCT), if it turns out that some compilers don't optimize away the
    // loads and this is performance-critical.
    CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
    DCT1DImpl<N, SZ>()(tmp, tmp + N * SZ);
    CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
  }
}
// Runs an N-point 1D IDCT over M columns (compile-time M_or_0, or run-time
// Mp when M_or_0 == 0). Unlike the forward wrapper, this operates directly
// on the block's memory via Address()/Stride(), with tmp as scratch.
template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
                   float* JXL_RESTRICT tmp) {
  const size_t m = M_or_0 != 0 ? M_or_0 : Mp;
  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
  for (size_t i = 0; i < m; i += Lanes(FV<M_or_0>())) {
    IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
                        to.Stride(), tmp);
  }
}
template <size_t N, size_t M, typename = void> struct DCT1D { template <typename FromBlock, typename ToBlock> voidoperator()(const FromBlock& from, const ToBlock& to, float* JXL_RESTRICT tmp) { return DCT1DWrapper<N, M>(from, to, M, tmp);
}
};
template <size_t N, size_t M> struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> { template <typename FromBlock, typename ToBlock> voidoperator()(const FromBlock& from, const ToBlock& to, float* JXL_RESTRICT tmp) { return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
tmp);
}
};
template <size_t N, size_t M, typename = void> struct IDCT1D { template <typename FromBlock, typename ToBlock> voidoperator()(const FromBlock& from, const ToBlock& to, float* JXL_RESTRICT tmp) { return IDCT1DWrapper<N, M>(from, to, M, tmp);
}
};
template <size_t N, size_t M> struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> { template <typename FromBlock, typename ToBlock> voidoperator()(const FromBlock& from, const ToBlock& to, float* JXL_RESTRICT tmp) { return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
tmp);
}
};
// Computes the maybe-transposed, scaled DCT of a block, that needs to be
// HWY_ALIGN'ed. Fixes: fused token `voidoperator` in the original was a
// compile error; restored to `void operator`.
template <size_t ROWS, size_t COLS>
struct ComputeScaledDCT {
  // scratch_space must be aligned, and should have space for ROWS*COLS
  // floats.
  template <class From>
  HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
                                   float* JXL_RESTRICT scratch_space) {
    float* JXL_RESTRICT block = scratch_space;
    float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
    // The two branches do column-DCT / transpose / row-DCT; when ROWS < COLS
    // an extra final transpose leaves the result in the transposed layout.
    if (ROWS < COLS) {
      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS), tmp);
      Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS), tmp);
      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
    } else {
      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS), tmp);
      Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS), tmp);
    }
  }
};

// Computes the maybe-transposed, scaled IDCT of a block, that needs to be
// HWY_ALIGN'ed. Fixes: fused token `voidoperator` in the original was a
// compile error; restored to `void operator`.
template <size_t ROWS, size_t COLS>
struct ComputeScaledIDCT {
  // scratch_space must be aligned, and should have space for ROWS*COLS
  // floats.
  template <class To>
  HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
                                   float* JXL_RESTRICT scratch_space) {
    float* JXL_RESTRICT block = scratch_space;
    float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
    // Reverse the steps done in ComputeScaledDCT.
    if (ROWS < COLS) {
      Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS), tmp);
      Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to, tmp);
    } else {
      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS), tmp);
      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to, tmp);
    }
  }
};
// NOTE(review): The following German disclaimer text appears to be web-page
// boilerplate accidentally appended to this source file; it is not part of
// the code. Commented out (so the file stays compilable) and translated:
// "The information on this web page has been compiled carefully and to the
// best of our knowledge. However, neither completeness, nor correctness, nor
// quality of the provided information is guaranteed.
// Note: the colored syntax highlighting and the measurement are still
// experimental."