// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file.
// These templates are not found via ADL. using hwy::HWY_NAMESPACE::Add; using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; using hwy::HWY_NAMESPACE::Vec;
// 5x5 convolution by separable kernel with a single scan through the input. // This is more cache-efficient than separate horizontal/vertical passes, and // possibly faster (given enough registers) than tiling and/or transposing. // // Overview: imagine a 5x5 window around a central pixel. First convolve the // rows by multiplying the pixels with the corresponding weights from // WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these // intermediate results by the corresponding vertical weight, i.e. // vert[abs(y_offset) * 4]. Finally, store the sum of these values as the // convolution result at the position of the central pixel in the output. // // Each of these operations uses SIMD vectors. The central pixel and most // importantly the output are aligned, so neighboring pixels (e.g. x_offset=1) // require unaligned loads. Because weights are supplied in identical groups of // 4, we can use LoadDup128 to load them (slightly faster). // // Uses mirrored boundary handling. Until x >= kRadius, the horizontal // convolution uses Neighbors class to shuffle vectors as if each of its lanes // had been loaded from the mirrored offset. Similarly, the last full vector to // write uses mirroring. In the case of scalar vectors, Neighbors is not usable // and the value is loaded directly. Otherwise, the number of valid pixels // modulo the vector size enables a small optimization: for smaller offsets, // a non-mirrored load is sufficient. class Separable5Strategy { using D = HWY_CAPPED(float, 16); using V = Vec<D>;
const V wh0 = LoadDup128(d, weights.horz + 0 * 4); const V wh1 = LoadDup128(d, weights.horz + 1 * 4); const V wh2 = LoadDup128(d, weights.horz + 2 * 4); const V wv0 = LoadDup128(d, weights.vert + 0 * 4); const V wv1 = LoadDup128(d, weights.vert + 1 * 4); const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
size_t x = 0;
// More than one iteration for scalars. for (; x < kRadius; x += Lanes(d)) { const V conv0 =
Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0);
const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2); const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2); const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
// Last full vector to write (the above loop handled mod >= kRadius) #if HWY_TARGET == HWY_SCALAR while (x < xsize) { #else if (kSizeModN < kRadius) { #endif const V conv0 =
Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0);
const V conv1t =
HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2); const V conv1b =
HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2); const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
const V conv2t =
HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2); const V conv2b =
HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2); const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
Store(conv2, d, row_out + x);
x += Lanes(d);
}
// If mod = 0, the above vector was the last. if (kSizeModN != 0) { for (; x < xsize; ++x) { float mul = 0.0f; for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { constfloat wy = weights.vert[std::abs(dy) * 4]; constfloat* clamped_row = wrap_row(row_m + dy * stride, stride); for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { constfloat wx = weights.horz[std::abs(dx) * 4]; const int64_t clamped_x = Mirror(x + dx, xsize);
mul += clamped_row[clamped_x] * wx * wy;
}
}
row_out[x] = mul;
}
}
}
private:
// Horizontal 5-tap pass for the first vector in a row. Same as HorzConvolve,
// except that the left neighbors are not loaded from `row + x - 1` and
// `row + x - 2`.
//
// row:      input row; `row + x`, `row + x + 1` and `row + x + 2` are read
//           with unaligned loads.
// x:        offset of the vector's first lane within the row.
// xsize:    row width; only used for mirroring on the scalar target.
// wh0..wh2: horizontal weights applied to the center, the +-1 neighbors and
//           the +-2 neighbors respectively.
//
// Returns c * wh0 + (l1 + r1) * wh1 + (l2 + r2) * wh2.
static JXL_MAYBE_INLINE V HorzConvolveFirst(
    const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
    const V wh0, const V wh1, const V wh2) {
  const D d;
  const V c = LoadU(d, row + x);
  const V mul0 = Mul(c, wh0);

#if HWY_TARGET == HWY_SCALAR
  // Scalar target: load the left neighbors directly from mirrored indices.
  const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
  const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
#else
  // Vector targets: obtain the left neighbors from `c` via Neighbors, so
  // xsize is not needed here.
  (void)xsize;
  const V l1 = Neighbors::FirstL1(c);
  const V l2 = Neighbors::FirstL2(c);
#endif

  // Right neighbors are loaded directly, without mirroring.
  const V r1 = LoadU(d, row + x + 1);
  const V r2 = LoadU(d, row + x + 2);

  // Accumulate the symmetric pairs on top of the center term.
  const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
  const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
  return mul2;
}
// Horizontal 5-tap pass for the last full vector in a row. The center and the
// left neighbors are read with unaligned loads from `row + x`, `row + x - 1`
// and `row + x - 2`.
//
// kSizeModN: compile-time tag supplied by the caller (presumably the row
//            width modulo the vector size - confirm against the caller).
// wh0..wh2:  horizontal weights applied to the center, the +-1 neighbors and
//            the +-2 neighbors respectively.
//
// Returns c * wh0 + (l1 + r1) * wh1 + (l2 + r2) * wh2.
template <size_t kSizeModN>
static JXL_MAYBE_INLINE V
HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x,
                 const int64_t xsize, const V wh0, const V wh1,
                 const V wh2) {
  const D d;
  const V c = LoadU(d, row + x);
  const V mul0 = Mul(c, wh0);

  const V l1 = LoadU(d, row + x - 1);
  const V l2 = LoadU(d, row + x - 2);

  // NOTE(review): r1 and r2 are used below, but their definitions are not
  // present in this copy of the file (nor is any use of xsize or kSizeModN).
  // Restore the right-neighbor computation from the upstream source before
  // building; it is deliberately not reconstructed here.

  // Sum of pixels with Manhattan distance i, multiplied by weights[i].
  const V sum1 = Add(l1, r1);
  const V mul1 = MulAdd(sum1, wh1, mul0);
  const V sum2 = Add(l2, r2);
  const V mul2 = MulAdd(sum2, wh2, mul1);
  return mul2;
}
// Requires kRadius valid pixels before/after pos. static JXL_MAYBE_INLINE V HorzConvolve(constfloat* const JXL_RESTRICT pos, const V wh0, const V wh1, const V wh2) { const D d; const V c = LoadU(d, pos); const V mul0 = Mul(c, wh0);
// Loading anew is faster than combining vectors. const V l1 = LoadU(d, pos - 1); const V r1 = LoadU(d, pos + 1); const V l2 = LoadU(d, pos - 2); const V r2 = LoadU(d, pos + 2); // Sum of pixels with Manhattan distance i, multiplied by weights[i]. const V sum1 = Add(l1, r1); const V mul1 = MulAdd(sum1, wh1, mul0); const V sum2 = Add(l2, r2); const V mul2 = MulAdd(sum2, wh2, mul1); return mul2;
}
};
// Applies the separable 5x5 kernel `weights` to the region `rect` of `in` and
// writes the result to `out`.
//
// When `rect` is at least Conv::MinWidth() pixels wide, the work is done by
// ConvolveT<Separable5Strategy>::Run, after checking that `rect` and `*out`
// have the same size, and the function returns true. Narrower inputs are
// forwarded to SlowSeparable5, whose result is returned.
//
// in:      source image.
// rect:    region of `in` to convolve.
// weights: separable kernel weights.
// pool:    thread pool forwarded to the implementation.
// out:     destination image; dereferenced, so it must not be null.
Status Separable5(const ImageF& in, const Rect& rect,
                  const WeightsSeparable5& weights, ThreadPool* pool,
                  ImageF* out) {
  using Conv = ConvolveT<Separable5Strategy>;
  if (rect.xsize() >= Conv::MinWidth()) {
    JXL_ENSURE(SameSize(rect, *out));
    // Repeats the enclosing guard; kept as in the original.
    JXL_ENSURE(rect.xsize() >= Conv::MinWidth());
    Conv::Run(in, rect, weights, pool, out);
    return true;
  }
  // Too narrow for the vectorized strategy.
  return SlowSeparable5(in, rect, weights, pool, out, Rect(*out));
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.