/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
staticinline uint8_t find_average_sve(const uint8_t *src, int src_stride, int width, int height) {
uint32x4_t avg_u32 = vdupq_n_u32(0);
uint8x16_t ones = vdupq_n_u8(1);
// Use a predicate to compute the last columns.
svbool_t pattern = svwhilelt_b8_u32(0, width % 16);
int h = height; do { int j = width; const uint8_t *src_ptr = src; while (j >= 16) {
uint8x16_t s = vld1q_u8(src_ptr);
avg_u32 = vdotq_u32(avg_u32, s, ones);
staticinlinevoid compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
int16_t *buf_avg, int buf_avg_stride, int width, int height, int downsample_factor) {
uint8x8_t avg_u8 = vdup_n_u8(avg);
// Use a predicate to compute the last columns.
svbool_t pattern = svwhilelt_b8_u32(0, width % 8);
staticinlinevoid copy_upper_triangle(int64_t *H, int64_t *H_tmp, constint wiener_win2, constint scale) { for (int i = 0; i < wiener_win2 - 2; i = i + 2) { // Transpose the first 2x2 square. It needs a special case as the element // of the bottom left is on the diagonal.
int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
// Transpose and store all the remaining 2x2 squares of the line. for (int j = i + 3; j < wiener_win2; j = j + 2) {
row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0);
vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
}
} for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
H[i] += H_tmp[i] * scale;
}
}
// Transpose the matrix that has just been computed and accumulate it in M.
//
// M and M_trn are both indexed as row-major wiener_win x wiener_win matrices.
// Element (i, j) of M is incremented by element (j, i) of M_trn multiplied by
// scale. M_trn is only read.
static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn,
                                   const int wiener_win, int scale) {
  for (int i = 0; i < wiener_win; ++i) {
    for (int j = 0; j < wiener_win; ++j) {
      // M is walked linearly while M_trn is read with a stride of wiener_win,
      // which performs the transposition.
      int tr_idx = j * wiener_win + i;
      *M++ += (int64_t)(M_trn[tr_idx] * scale);
    }
  }
}
// This function computes two matrices: the cross-correlation between the src // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). // // M is of size 7 * 7. It needs to be filled such that multiplying one element // from src with each element of a row of the wiener window will fill one // column of M. However this is not very convenient in terms of memory // accesses, as it means we do contiguous loads of dgd but strided stores to M. // As a result, we use an intermediate matrix M_trn which is instead filled // such that one row of the wiener window gives one row of M_trn. Once fully // computed, M_trn is then transposed to return M. // // H is of size 49 * 49. It is filled by multiplying every pair of elements of // the wiener window together. Since it is a symmetric matrix, we only compute // the upper triangle, and then copy it down to the lower one. Here we fill it // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. staticinlinevoid compute_stats_win7_downsampled_sve(
int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int downsample_factor) { constint wiener_win = 7; constint wiener_win2 = wiener_win * wiener_win;
// Use a predicate to compute the last columns of the block for H.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
// Use intermediate matrices for H and M to perform the computation, they // will be accumulated into the original H and M at the end.
int64_t M_trn[49];
memset(M_trn, 0, sizeof(M_trn));
// Perform computation of the first column with itself (28 elements). // For the first column this will fill the upper triangle of the 7x7 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 7x7 matrices around H's // diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor.
int16x8_t dgd1[7];
load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
// Compute all elements from the combination of both columns (49 // elements).
compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
}
}
j += 8;
}
// Perform computation of the first column with itself (28 elements). // For the first column this will fill the upper triangle of the 7x7 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 7x7 matrices around H's // diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor.
int16x8_t dgd1[7];
load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
// Compute all elements from the combination of both columns (49 // elements).
compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
}
}
}
dgd_avg += downsample_factor * dgd_avg_stride;
src_avg += src_avg_stride;
} while (--height != 0);
// Copy upper triangle of H in the lower one.
copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
}
// This function computes two matrices: the cross-correlation between the src // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). // // M is of size 5 * 5. It needs to be filled such that multiplying one element // from src with each element of a row of the wiener window will fill one // column of M. However this is not very convenient in terms of memory // accesses, as it means we do contiguous loads of dgd but strided stores to M. // As a result, we use an intermediate matrix M_trn which is instead filled // such that one row of the wiener window gives one row of M_trn. Once fully // computed, M_trn is then transposed to return M. // // H is of size 25 * 25. It is filled by multiplying every pair of elements of // the wiener window together. Since it is a symmetric matrix, we only compute // the upper triangle, and then copy it down to the lower one. Here we fill it // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. staticinlinevoid compute_stats_win5_downsampled_sve(
int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int downsample_factor) { constint wiener_win = 5; constint wiener_win2 = wiener_win * wiener_win;
// Use a predicate to compute the last columns of the block for H.
svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
// Use intermediate matrices for H and M to perform the computation, they // will be accumulated into the original H and M at the end.
int64_t M_trn[25];
memset(M_trn, 0, sizeof(M_trn));
// Perform computation of the first column with itself (15 elements). // For the first column this will fill the upper triangle of the 5x5 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 5x5 matrices around H's // diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor.
int16x8_t dgd1[5];
load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4]);
// Compute all elements from the combination of both columns (25 // elements).
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
}
}
j += 8;
}
// Perform computation of the first column with itself (15 elements). // For the first column this will fill the upper triangle of the 5x5 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 5x5 matrices around H's // diagonal.
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
// All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor.
int16x8_t dgd1[5];
load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
&dgd1[2], &dgd1[3], &dgd1[4]);
// Compute all elements from the combination of both columns (25 // elements).
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
}
}
}
dgd_avg += downsample_factor * dgd_avg_stride;
src_avg += src_avg_stride;
} while (--height != 0);
// dgd_avg and src_avg have been memset to zero before calling this // function, so round up the stride to the next multiple of 8 so that we // don't have to worry about a tail loop when computing M. constint dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; constint src_avg_stride = (width & ~7) + 8;
// Compute (dgd - avg) and store it in dgd_avg. // The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual // buffer that we need to subtract the avg from will be 2 * wiener_halfwin // wider and 2 * wiener_halfwin higher than the original dgd buffer. constint vert_offset = v_start - wiener_halfwin; constint horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
// Since the height is not necessarily a multiple of the downsample factor, // the last line of src will be scaled according to how many rows remain. constint downsample_remainder = height % downsample_factor;
if (downsample_height > 0) { if (wiener_win == WIENER_WIN) {
compute_stats_win7_downsampled_sve(
dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
downsample_height, M, H, downsample_factor);
} else {
compute_stats_win5_downsampled_sve(
dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
downsample_height, M, H, downsample_factor);
}
}
// dgd_avg and src_avg have been memset to zero before calling this // function, so round up the stride to the next multiple of 8 so that we // don't have to worry about a tail loop when computing M. constint dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; constint src_avg_stride = (width & ~7) + 8;
// Compute (dgd - avg) and store it in dgd_avg. // The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual // buffer that we need to subtract the avg from will be 2 * wiener_halfwin // wider and 2 * wiener_halfwin higher than the original dgd buffer. constint vert_offset = v_start - wiener_halfwin; constint horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
// Compute (src - avg), and store in src-avg. const uint8_t *src_start = src + h_start + v_start * src_stride;
compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
height, 1);
if (wiener_win == WIENER_WIN) {
compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
width, height, M, H);
} else {
compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
width, height, M, H);
}
// H is a symmetric matrix, so we only need to fill out the upper triangle. // We can copy it down to the lower triangle outside the (i, j) loops.
diagonal_copy_stats_neon(wiener_win2, H);
}
/*
 * Measurement V0.5
 * Processing time: 0.13 seconds (preprocessed)
 *
 * The information on this web page was compiled carefully and to the best of
 * our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 *
 * Note: the coloured syntax highlighting and the measurement are still
 * experimental.
 */