/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// We can accumulate up to 32768 8-bit multiplication results in a signed // 32-bit integer. We are processing 2 pixels at a time, so the accumulator max // can be as high as 16384 for the compute stats. #define STAT_ACCUMULATOR_MAX 16384
// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the // computation. This function computes the final M from the accumulated // (src_s64) and the residual parts (src_s32). It also transposes the result as // the output needs to be column-major. staticinlinevoid acc_transpose_M(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, constint wiener_win, int scale) { for (int i = 0; i < wiener_win; ++i) { for (int j = 0; j < wiener_win; ++j) { int tr_idx = j * wiener_win + i;
*dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
}
}
}
// The resulting H is a column-major matrix accumulated from the transposed // (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single // vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This // function transforms back to the originally expected format (double // transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to // speed-up the computation. This function computes the final H from the // accumulated (src_s64) and the residual parts (src_s32). The computed H is // only an upper triangle matrix, this function also fills the lower triangle of // the resulting matrix. staticvoid update_H(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, constint wiener_win, int stride, int scale) { // For a simplified theoretical 3x3 case where `wiener_win` is 3 and // `wiener_win2` is 9, the M matrix is 3x3: // 0, 3, 6 // 1, 4, 7 // 2, 5, 8 // // This is viewed as a vector to compute H (9x9) by vector outer product: // 0, 3, 6, 1, 4, 7, 2, 5, 8 // // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: // 0, 3, 6, 1, 4, 7, 2, 5, 8, // 3, 30, 33, 12, 31, 34, 21, 32, 35, // 6, 33, 60, 15, 42, 61, 24, 51, 62, // 1, 12, 15, 10, 13, 16, 11, 14, 17, // 4, 31, 42, 13, 40, 43, 22, 41, 44, // 7, 34, 61, 16, 43, 70, 25, 52, 71, // 2, 21, 24, 11, 22, 25, 20, 23, 26, // 5, 32, 51, 14, 41, 52, 23, 50, 53, // 8, 35, 62, 17, 44, 71, 26, 53, 80, constint wiener_win2 = wiener_win * wiener_win;
// Loop through the indices according to the remapping above, along the // columns: // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., // wiener_win - 1, wiener_win - 1 + wiener_win, ... // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. for (int i = 0; i < wiener_win; ++i) { for (int j = i; j < wiener_win2; j += wiener_win) { // These two inner loops are the same as the two outer loops, but running // along rows instead of columns. For the 3x3 case `l` will be: // 0, 3, 6, 1, 4, 7, 2, 5, 8. for (int k = 0; k < wiener_win; ++k) { for (int l = k; l < wiener_win2; l += wiener_win) { // The nominal double transpose indexing would be: // int idx = stride * j + l; // However we need the upper-triangle indices, it is easy with some // min/max operations. int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);
// Resulting matrix is filled by combining the 64-bit and the residual // 32-bit matrices together with scaling.
*dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
}
}
}
}
}
// Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the // last load address is offset to prevent out-of-bounds access. staticinlinevoid load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src,
ptrdiff_t stride) {
dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
src += 2 * stride;
dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
src += 2 * stride;
dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
src += 2 * stride;
dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0));
}
staticinlinevoid compute_stats_win7_downsampled_neon( const uint8_t *dgd, const uint8_t *src, int width, int height, int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H, int downsample_factor) { // Matrix names are capitalized to help readability.
DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
do { int j = width; while (j >= 2) { // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the // middle 6x7 elements being shared.
uint8x16_t dgd_rows[4];
load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
// Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7 // matrices (1 for each of the 2 pixels) separated into distinct // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7). // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49 // consecutive elements.
int16x8_t dgd_avg0[6];
int16x8_t dgd_avg1[6];
uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
// The remaining last (49th) elements of `dgd - avg`.
DGD_AVG0[48] = dgd_ptr[6] - avg;
DGD_AVG1[48] = dgd_ptr[7] - avg;
// Accumulate into row-major variant of matrix M (cross-correlation) for 2 // output pixels at a time. M is of size 7 * 7. It needs to be filled such // that multiplying one element from src with each element of a row of the // wiener window will fill one column of M. However this is not very // convenient in terms of memory access, as it means we do contiguous // loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg; int src_avg1 = *src++ - avg;
int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
dgd_avg1[0]);
update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
dgd_avg1[1]);
update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
dgd_avg1[2]);
update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
dgd_avg1[3]);
update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
dgd_avg1[4]);
update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
dgd_avg1[5]);
// Last (49th) element of M_s32 can be computed as scalar more efficiently // for 2 output pixels.
M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
// Start accumulating into row-major version of matrix H // (auto-covariance), it expects the DGD_AVG[01] matrices to also be // row-major. H is of size 49 * 49. It is filled by multiplying every pair // of elements of the wiener window together (vector outer product). Since // it is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work with column-major matrices, // so we accumulate into a row-major matrix H_s32. At the end of the // algorithm a double transpose transformation will convert H_s32 back to // the expected output layout.
update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
// The last element of the triangle of H_s32 matrix can be computed as a // scalar more efficiently.
H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
// Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent // overflow. if (--acc_cnt == 0) {
acc_cnt = STAT_ACCUMULATOR_MAX;
// The widening accumulation is only needed for the upper triangle part // of the matrix.
int64_t *lh = H_s64;
int32_t *lh32 = H_s32; for (int k = 0; k < WIENER_WIN2; ++k) { // The widening accumulation is only run for the relevant parts // (upper-right triangle) in a row 4-element aligned. int k4 = k / 4 * 4;
accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
// Last element of the row is computed separately.
lh[48] += lh32[48];
lh32[48] = 0;
// Computations for odd pixel in the row. if (width & 1) { // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the // middle 6x7 elements being shared.
uint8x16_t dgd_rows[4];
load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
// The remaining last (49th) element of `dgd - avg`.
DGD_AVG0[48] = dgd_ptr[6] - avg;
// Accumulate into row-major order variant of matrix M (cross-correlation) // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled // such that multiplying one element from src with each element of a row // of the wiener window will fill one column of M. However this is not // very convenient in terms of memory access, as it means we do // contiguous loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg;
int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
// Last (49th) element of M_s32 can be computed as scalar more efficiently // for 1 output pixel.
M_s32[48] += DGD_AVG0[48] * src_avg0;
// Start accumulating into row-major order version of matrix H // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. // H is of size 49 * 49. It is filled by multiplying every pair of // elements of the wiener window together (vector outer product). Since it // is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work column-major matrices, so we // accumulate into a row-major matrix H_s32. At the end of the algorithm a // double transpose transformation will convert H_s32 back to the expected // output layout.
update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
// The last element of the triangle of H_s32 matrix can be computed as // scalar more efficiently.
H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
}
do { int j = width; while (j >= 2) { // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the // middle 4x5 elements being shared.
uint8x16_t dgd_rows[3];
load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
// Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5 // matrices (1 for each of the 2 pixels) separated into distinct // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5). // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25 // consecutive elements.
int16x8_t dgd_avg0[3];
int16x8_t dgd_avg1[3];
uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1);
uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2);
// The remaining last (25th) elements of `dgd - avg`.
DGD_AVG0[24] = dgd_ptr[4] - avg;
DGD_AVG1[24] = dgd_ptr[5] - avg;
// Accumulate into row-major variant of matrix M (cross-correlation) for 2 // output pixels at a time. M is of size 5 * 5. It needs to be filled such // that multiplying one element from src with each element of a row of the // wiener window will fill one column of M. However this is not very // convenient in terms of memory access, as it means we do contiguous // loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg; int src_avg1 = *src++ - avg;
int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
dgd_avg1[0]);
update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
dgd_avg1[1]);
update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
dgd_avg1[2]);
// Last (25th) element of M_s32 can be computed as scalar more efficiently // for 2 output pixels.
M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
// Start accumulating into row-major version of matrix H // (auto-covariance), it expects the DGD_AVG[01] matrices to also be // row-major. H is of size 25 * 25. It is filled by multiplying every pair // of elements of the wiener window together (vector outer product). Since // it is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work with column-major matrices, // so we accumulate into a row-major matrix H_s32. At the end of the // algorithm a double transpose transformation will convert H_s32 back to // the expected output layout.
update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
// The last element of the triangle of H_s32 matrix can be computed as a // scalar more efficiently.
H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
// Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent // overflow. if (--acc_cnt == 0) {
acc_cnt = STAT_ACCUMULATOR_MAX;
// The widening accumulation is only needed for the upper triangle part // of the matrix.
int64_t *lh = H_s64;
int32_t *lh32 = H_s32; for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { // The widening accumulation is only run for the relevant parts // (upper-right triangle) in a row 4-element aligned. int k4 = k / 4 * 4;
accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
// Last element of the row is computed separately.
lh[24] += lh32[24];
lh32[24] = 0;
// Computations for odd pixel in the row. if (width & 1) { // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the // middle 4x5 elements being shared.
uint8x16_t dgd_rows[3];
load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
// Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 // matrix tightly packed into a int16x8_t[3] array. This array contains // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. // The DGD_AVG buffer contains 25 consecutive elements.
int16x8_t dgd_avg0[3];
uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));
// The remaining last (25th) element of `dgd - avg`.
DGD_AVG0[24] = dgd_ptr[4] - avg;
// Accumulate into row-major order variant of matrix M (cross-correlation) // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled // such that multiplying one element from src with each element of a row // of the wiener window will fill one column of M. However this is not // very convenient in terms of memory access, as it means we do // contiguous loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg;
int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
// Last (25th) element of M_s32 can be computed as scalar more efficiently // for 1 output pixel.
M_s32[24] += DGD_AVG0[24] * src_avg0;
// Start accumulating into row-major order version of matrix H // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. // H is of size 25 * 25. It is filled by multiplying every pair of // elements of the wiener window together (vector outer product). Since it // is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work column-major matrices, so we // accumulate into a row-major matrix H_s32. At the end of the algorithm a // double transpose transformation will convert H_s32 back to the expected // output layout.
update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
// The last element of the triangle of H_s32 matrix can be computed as a // scalar more efficiently.
H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
DGD_AVG0[24] * DGD_AVG0[24];
}
// Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row.
j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d;
int32_t height_t = 0;
int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
int16x8_t src[2], dgd[2];
// Step 2: Calculate the left edge of each square on the top row.
j = 1; do { const int16_t *d_t = d;
int32_t height_t = 0;
int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
int16x8_t dgd[2];
hadd_update_4_stats_neon(
H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
hadd_update_4_stats_neon(
H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
hadd_update_4_stats_neon(
H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
hadd_update_4_stats_neon(
H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
} while (++j < wiener_win);
} while (++i < wiener_win - 1);
// Step 6: Derive other points of each upper triangle along the diagonal.
i = 0; do { const int16_t *const di = d + i;
int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) };
int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
x = 0; while (x < w16) {
load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
derive_triangle_win5_neon(d_is, d_ie, deltas);
x += 16;
}
// Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row.
j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d;
int32_t height_t = 0;
int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
int16x8_t src[2], dgd[2];
// Step 2: Calculate the left edge of each square on the top row.
j = 1; do { const int16_t *d_t = d;
int32_t height_t = 0;
int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
int16x8_t dgd[2];
// Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row.
{ const int16_t *d_t = d; // Pad to call transpose function.
int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
int16x8_t ds[WIENER_WIN * 2];
// Writing one more element on the top edge of a square falls to // the next square in the same row or the first element in the next // row, which will just be overwritten later.
update_8_stats_neon(
H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
deltas[0], deltas[1],
H + i * wiener_win * wiener_win2 + j * wiener_win);
hadd_update_6_stats_neon(
H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
hadd_update_6_stats_neon(
H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
hadd_update_6_stats_neon(
H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
hadd_update_6_stats_neon(
H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
hadd_update_6_stats_neon(
H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
hadd_update_6_stats_neon(
H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
} while (++j < wiener_win);
} while (++i < wiener_win - 1);
// Step 6: Derive other points of each upper triangle along the diagonal.
i = 0; do { const int16_t *const di = d + i;
int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) };
int16x8_t d_is[WIN_7], d_ie[WIN_7];
x = 0; while (x < w16) {
load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
derive_triangle_win7_neon(d_is, d_ie, deltas);
x += 16;
}
staticinline uint8_t find_average_neon(const uint8_t *src, int src_stride, int width, int height) {
uint64_t sum = 0;
if (width >= 16) { int h = 0; // We can accumulate up to 257 8-bit values in a 16-bit value, given // that each 16-bit vector has 8 elements, that means we can process up to // int(257*8/width) rows before we need to widen to 32-bit vector // elements. int h_overflow = 257 * 8 / width; int h_limit = height > h_overflow ? h_overflow : height;
uint32x4_t avg_u32 = vdupq_n_u32(0); do {
uint16x8_t avg_u16 = vdupq_n_u16(0); do { int j = width; const uint8_t *src_ptr = src; do {
uint8x16_t s = vld1q_u8(src_ptr);
avg_u16 = vpadalq_u8(avg_u16, s);
j -= 16;
src_ptr += 16;
} while (j >= 16); if (j >= 8) {
uint8x8_t s = vld1_u8(src_ptr);
avg_u16 = vaddw_u8(avg_u16, s);
j -= 8;
src_ptr += 8;
} // Scalar tail case. while (j > 0) {
sum += src[width - j];
j--;
}
src += src_stride;
} while (++h < h_limit);
avg_u32 = vpadalq_u16(avg_u32, avg_u16);
h_limit += h_overflow;
h_limit = height > h_overflow ? h_overflow : height;
} while (h < height); return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
(width * height));
} if (width >= 8) { int h = 0; // We can accumulate up to 257 8-bit values in a 16-bit value, given // that each 16-bit vector has 4 elements, that means we can process up to // int(257*4/width) rows before we need to widen to 32-bit vector // elements. int h_overflow = 257 * 4 / width; int h_limit = height > h_overflow ? h_overflow : height;
uint32x2_t avg_u32 = vdup_n_u32(0); do {
uint16x4_t avg_u16 = vdup_n_u16(0); do { int j = width; const uint8_t *src_ptr = src;
uint8x8_t s = vld1_u8(src_ptr);
avg_u16 = vpadal_u8(avg_u16, s);
j -= 8;
src_ptr += 8; // Scalar tail case. while (j > 0) {
sum += src[width - j];
j--;
}
src += src_stride;
} while (++h < h_limit);
avg_u32 = vpadal_u16(avg_u32, avg_u16);
h_limit += h_overflow;
h_limit = height > h_overflow ? h_overflow : height;
} while (h < height); return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
(width * height));
} int i = height; do { int j = 0; do {
sum += src[j];
} while (++j < width);
src += src_stride;
} while (--i != 0); return (uint8_t)(sum / (width * height));
}
staticinlinevoid compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
int16_t *buf_avg, int buf_avg_stride, int width, int height, int downsample_factor) {
uint8x8_t avg_u8 = vdup_n_u8(avg);
if (width > 8) { int i = 0; do { int j = width; const uint8_t *buf_ptr = buf;
int16_t *buf_avg_ptr = buf_avg; do {
uint8x8_t d = vld1_u8(buf_ptr);
vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));
j -= 8;
buf_ptr += 8;
buf_avg_ptr += 8;
} while (j >= 8); while (j > 0) {
*buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg;
buf_avg_ptr++;
j--;
}
buf += buf_stride;
buf_avg += buf_avg_stride;
i += downsample_factor;
} while (i < height);
} else { // For width < 8, don't use Neon. for (int i = 0; i < height; i = i + downsample_factor) { for (int j = 0; j < width; j++) {
buf_avg[j] = (int16_t)buf[j] - (int16_t)avg;
}
buf += buf_stride;
buf_avg += buf_avg_stride;
}
}
}
staticinlinevoid av1_compute_stats_downsampled_neon( int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg,
int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) {
assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
(void)dgd_avg;
(void)src_avg;
// The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual // buffer that we need to subtract the avg from will be 2 * wiener_halfwin // wider and 2 * wiener_halfwin higher than the original dgd buffer. constint vert_offset = v_start - wiener_halfwin; constint horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
// Since the height is not necessarily a multiple of the downsample factor, // the last line of src will be scaled according to how many rows remain. int downsample_factor =
use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
int downsampled_height = height / downsample_factor; int downsample_remainder = height % downsample_factor;
// Calculate the M and H matrices for the normal and downsampled cases. if (downsampled_height > 0) { if (wiener_win == WIENER_WIN) {
compute_stats_win7_downsampled_neon(
dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride,
avg, M, H, downsample_factor);
} else {
compute_stats_win5_downsampled_neon(
dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride,
avg, M, H, downsample_factor);
}
}
// Accumulate the remaining last rows in the downsampled case. if (downsample_remainder > 0) { int remainder_offset = height - downsample_remainder; if (wiener_win == WIENER_WIN) {
compute_stats_win7_downsampled_neon(
dgd_win + remainder_offset * dgd_stride,
src_start + remainder_offset * src_stride, width, 1, dgd_stride,
src_stride, avg, M, H, downsample_remainder);
} else {
compute_stats_win5_downsampled_neon(
dgd_win + remainder_offset * dgd_stride,
src_start + remainder_offset * src_stride, width, 1, dgd_stride,
src_stride, avg, M, H, downsample_remainder);
}
}
}
if (wiener_win == WIENER_WIN) {
compute_stats_win7_neon(dgd_avg, d_stride, src_avg, s_stride, width, height,
M, H);
} elseif (wiener_win == WIENER_WIN_CHROMA) {
compute_stats_win5_neon(dgd_avg, d_stride, src_avg, s_stride, width, height,
M, H);
}
// H is a symmetric matrix, so we only need to fill out the upper triangle. // We can copy it down to the lower triangle outside the (i, j) loops.
diagonal_copy_stats_neon(wiener_win2, H);
}
staticinlinevoid calc_proj_params_r0_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
assert(width % 8 == 0); constint size = width * height;
// The function calls 3 subfunctions for the following cases : // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements // of C and H need to be computed. // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
int32_t *flt1, int flt1_stride, int64_t H[2][2],
int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) {
calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
dat_stride, flt0, flt0_stride, flt1,
flt1_stride, H, C);
} elseif (params->r[0] > 0) {
calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
flt0, flt0_stride, H, C);
} elseif (params->r[1] > 0) {
calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
flt1, flt1_stride, H, C);
}
}
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.40 Sekunden
(vorverarbeitet am 2026-04-28)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.