/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
/** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. * * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
*/ staticvoid cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, int input_stride,
uint16_t *pred_buf_q3, int width, int height) {
(void)width; // Forever 32 const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos constint luma_stride = input_stride << 1;
__m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; do {
__m256i top = _mm256_loadu_si256((__m256i *)input);
__m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;                                // Forever 32
  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    // maddubs sums each horizontal byte pair scaled by 4: (4*a + 4*b) == Q3.
    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
    _mm256_storeu_si256(row, top_16x16);
    input += input_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
/** * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only * performed on block of width 32. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer.
*/ staticvoid cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, int input_stride,
uint16_t *pred_buf_q3, int width, int height) {
(void)width; // Forever 32
__m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; const __m256i zeros = _mm256_setzero_si256(); do {
__m256i top = _mm256_loadu_si256((__m256i *)input);
top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
#if CONFIG_AV1_HIGHBITDEPTH
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 *
 * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
 */
static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;  // Forever 32
  // Two luma rows are consumed per output row.
  const int luma_stride = input_stride << 1;
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  do {
    // Vertical sums of the first 16 and the second 16 pixels of the pair of
    // rows.
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
    __m256i sum = _mm256_add_epi16(top, bot);

    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);

    // Horizontal add pairs the columns; the permute restores source order
    // across the 128-bit lanes mixed by hadd.
    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
    // Double the 2x2 box sum to reach the Q3 scale.
    hsum = _mm256_add_epi16(hsum, hsum);

    _mm256_storeu_si256(row, hsum);

    input += luma_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
/** * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more * precise version of a box filter 4:2:2 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. *
*/ staticvoid cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, int input_stride,
uint16_t *pred_buf_q3, int width, int height) {
(void)width; // Forever 32
__m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; do {
__m256i top = _mm256_loadu_si256((__m256i *)input);
__m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
__m256i hsum = _mm256_hadd_epi16(top, top_1);
hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
hsum = _mm256_slli_epi16(hsum, 2);
// Returns a vector where all the (32-bits) elements are the sum of all the
// lanes in a.
static inline __m256i fill_sum_epi32(__m256i a) {
  // Given that a == [A, B, C, D, E, F, G, H]
  a = _mm256_hadd_epi32(a, a);
  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
  // a == [A', C', A', C', E', G', E', G']
  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
  // a == [A', C', E', G', A', C', E', G']
  a = _mm256_hadd_epi32(a, a);
  // Given that A'' == A' + C' and E'' == E' + G'
  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
  return _mm256_hadd_epi32(a, a);
  // Given that A''' == A'' + E''
  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
}
staticinlinevoid subtract_average_avx2(const uint16_t *src_ptr,
int16_t *dst_ptr, int width, int height, int round_offset, int num_pel_log2) { // Use SSE2 version for smaller widths
assert(width == 16 || width == 32);
const __m256i *src = (__m256i *)src_ptr; const __m256i *const end = src + height * CFL_BUF_LINE_I256; // To maximize usage of the AVX2 registers, we sum two rows per loop // iteration constint step = 2 * CFL_BUF_LINE_I256;
__m256i sum = _mm256_setzero_si256(); // For width 32, we use a second sum accumulator to reduce accumulator // dependencies in the loop.
__m256i sum2; if (width == 32) sum2 = _mm256_setzero_si256();
do { // Add top row to the bottom row
__m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
_mm256_loadu_si256(src + CFL_BUF_LINE_I256));
sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); if (width == 32) { /* Don't worry, this if it gets optimized out. */ // Add the second part of the top row to the second part of the bottom row
__m256i l1 =
_mm256_add_epi16(_mm256_loadu_si256(src + 1),
_mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
}
src += step;
} while (src < end); // Combine both sum accumulators if (width == 32) sum = _mm256_add_epi32(sum, sum2);
/* NOTE(review): the file previously ended here with stray non-code text (a
 * German website disclaimer, "Die Informationen auf dieser Webseite ...")
 * that was accidentally appended and does not compile; it has been removed. */