/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include <immintrin.h> #include <string.h>
// f0 f1 f0 f1 ..
coeffs[2] = _mm256_broadcastw_epi16(filter_8bit); // f2 f3 f2 f3 ..
coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2)); // f3 f2 f3 f2 ..
coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6)); // f1 f0 f1 f0 ..
coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
}
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int stride, int start_col) {
assert(start_col <= stride); // For the GM tool, the input layer height or width is assured to be an even // number. Hence the function 'down2_symodd()' is not invoked and SIMD // optimization of the same is not implemented. // When the input height is less than 8 and even, the potential input // heights are limited to 2, 4, or 6. These scenarios require seperate // handling due to padding requirements. Invoking the C function here will // eliminate the need for conditional statements within the subsequent SIMD // code to manage these cases. if (height & 1 || height < 8) { return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
constint num_col16 = stride / 16; int remain_col = stride % 16; // The core vertical SIMD processes 4 input rows simultaneously to generate // output corresponding to 2 rows. To streamline the core loop and eliminate // the need for conditional checks, the remaining rows (4 or 6) are processed // separately. constint remain_row = (height % 4 == 0) ? 4 : 6;
for (int j = start_col; j < stride - remain_col; j += 16) { const uint8_t *data = &intbuf[j]; const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); // Padding top 3 rows with the last available row at the top. const __m128i l0 = l3; const __m128i l1 = l3; const __m128i l2 = l3; const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));
constint is_store_valid = (i < height - 1); if (is_store_valid)
_mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
_mm256_extracti128_si256(res_8bit0, 1));
i += 2;
// Check if there is any remaining height to process. If so, perform the // necessary data loading for processing the next row. if (i < height - 1) {
l10 = l11 = l9; const __m256i s810 =
_mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); const __m256i s911 =
_mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); // i0j0... i7j7 | k0l0... k7l7
s[4] = _mm256_unpacklo_epi8(s810, s911); // i8j8... i15j15 | k8l8... k15l15
s[9] = _mm256_unpackhi_epi8(s810, s911);
constint is_store_valid = (i < height - 1); if (is_store_valid)
_mm_storel_epi64(
(__m128i *)&output[(i / 2) * out_stride + processed_wd],
_mm256_extracti128_si256(res_a_round_1, 1));
i += 2;
// Check rows are still remaining for processing. If yes do the required // load of data for the next iteration. if (i < height - 1) {
l10 = l11 = l9; // k0l0... k7l7 const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); // i0j0... i7j7 | k0l0... k7l7
s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
// The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously // to generate output corresponding to 2 rows. To streamline the core loop and // eliminate the need for conditional checks, the remaining columns (16 or 8) // are processed separately. if (filtered_length % 32 == 0) { for (int i = 0; i < height; i += 2) { int filter_offset = 0; int row_offset = 0; for (int j = 0; j < filtered_length; j += 32) {
PROCESS_RESIZE_X_WD32
}
}
} else { for (int i = 0; i < height; i += 2) { int filter_offset = 0; int remain_col = filtered_length; int row_offset = 0; // To avoid pixel over-read at frame boundary, processing of 32 pixels // is done using the core loop only if sufficient number of pixels // required for the load are present. The remaining pixels are processed // separately. for (int j = 0; j <= filtered_length - 32; j += 32) { if (remain_col == 34 || remain_col == 36) { break;
}
PROCESS_RESIZE_X_WD32
remain_col -= 32;
}
int wd_processed = filtered_length - remain_col; // To avoid pixel over-read at frame boundary, processing of 16 pixels // is done only if sufficient number of pixels required for the // load are present. The remaining pixels are processed separately. if (remain_col > 15 && remain_col != 18 && remain_col != 20) {
remain_col = filtered_length - wd_processed - 16; constint in_idx = i * in_stride + wd_processed; constint out_idx = (i * dst_stride) + wd_processed / 2; // a0 a1 --- a15
__m128i row0 =
_mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]); // b0 b1 --- b15
__m128i row1 = _mm_loadu_si128(
(__m128i *)&input[in_idx + in_stride - filter_offset]); // a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); if (filter_offset == 0) {
r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);
}
filter_offset = 3; constint is_last_cols16 = wd_processed + 16 == filtered_length; if (is_last_cols16) row_offset = ROW_OFFSET;
// a16-a23 x x x x| b16-b23 x x x x
__m256i r1 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the // row. if (is_last_cols16) {
r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET),
wd32_end_pad_mask);
}
// result for 16 pixels (a0 to a15) of row0 and row1
__m256i res_out_0[2];
res_out_0[0] = res_out_0[1] = zero;
resize_convolve(s0, coeffs_x, res_out_0);
// To avoid pixel over-read at frame boundary, processing of 8 pixels // is done only if sufficient number of pixels required for the // load are present. The remaining pixels are processed by C function.
wd_processed = filtered_length - remain_col; if (remain_col > 7 && remain_col != 10 && remain_col != 12) {
remain_col = filtered_length - wd_processed - 8; constint in_idx = i * in_stride + wd_processed - filter_offset; constint out_idx = (i * dst_stride) + wd_processed / 2; constint is_last_cols_8 = wd_processed + 8 == filtered_length; if (is_last_cols_8) row_offset = ROW_OFFSET; // a0 a1 --- a15
__m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]); // b0 b1 --- b15
__m128i row1 =
_mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]); // a0 a1 --- a15 || b0 b1 --- b15
__m256i r0 =
_mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
// Pad end pixels to the right, while processing the last pixels in the // row. if (is_last_cols_8)
r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET),
wd8_end_pad_mask);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.