/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include <immintrin.h>
bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int stride, int start_col) { // For the GM tool, the input layer height or width is assured to be an even // number. Hence the function 'down2_symodd()' is not invoked and SIMD // optimization of the same is not implemented. // When the input height is less than 8 and even, the potential input // heights are limited to 2, 4, or 6. These scenarios require seperate // handling due to padding requirements. Invoking the C function here will // eliminate the need for conditional statements within the subsequent SIMD // code to manage these cases. if (height & 1 || height < 8) { return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
// Increment the pointer such that the loading starts from row G.
data = data + 3 * stride; // The core vertical SIMD processes 2 input rows simultaneously to generate // output corresponding to 1 row. To streamline the core loop and eliminate // the need for conditional checks, the remaining rows 4 are processed // separately. for (int i = 0; i < height - 4; i += 2) { // g0 ... g7
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); // h0 ... h7
__m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b)
__m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b)
PROCESS_RESIZE_Y_WD8
}
__m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
__m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // Process the last 4 input rows here. for (int i = height - 4; i < height; i += 2) {
__m128i l7 = l6;
PROCESS_RESIZE_Y_WD8
}
}
void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
uint8_t *intbuf, int height, int filtered_length, int width2) {
assert(height % 2 == 0); // Invoke C for width less than 16. if (filtered_length < 16) {
av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
width2); return;
}
for (int i = 0; i < height; ++i) { int filter_offset = 0; int row01_offset = ROW_OFFSET; int remain_col = filtered_length; // To avoid pixel over-read at frame boundary, processing of 16 pixels // is done using the core loop only if sufficient number of pixels required // for the load are present.The remaining pixels are processed separately. for (int j = 0; j <= filtered_length - 16; j += 16) { if (remain_col == 18 || remain_col == 20) { break;
} constint is_last_cols16 = (j == filtered_length - 16); // While processing the last 16 pixels of the row, ensure that only valid // pixels are loaded. if (is_last_cols16) row01_offset = 0; constint in_idx = i * in_stride + j - filter_offset; constint out_idx = i * dst_stride + j / 2;
remain_col -= 16; // a0 a1 a2 a3 .... a15
__m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]); // a8 a9 a10 a11 .... a23
__m128i row01 = _mm_loadu_si128(
(__m128i *)&input[in_idx + row01_offset + filter_offset]);
filter_offset = 3;
// Pad start pixels to the left, while processing the first pixels in the // row. if (j == 0) { const __m128i start_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride]);
row00 =
blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
}
// Pad end pixels to the right, while processing the last pixels in the // row. if (is_last_cols16) { const __m128i end_pixel_row0 =
_mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0,
end_pad_mask);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.