/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ], with the MSB
// on the left.
// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
// Exploiting the range of the Wiener filter coefficients, horizontal
// filtering can be done in 16-bit intermediate precision.
// The details are as follows:
// Consider horizontal Wiener filter coefficients of the following form:
//      [C0, C1, C2, 2^(FILTER_BITS) - 2 * (C0 + C1 + C2), C2, C1, C0]
// Subtracting 2^(FILTER_BITS) from the centre tap we get the following:
//      [C0, C1, C2,                  -2 * (C0 + C1 + C2), C2, C1, C0]
// The sum of products "C0 * p0 + C1 * p1 + C2 * p2 - 2 * (C0 + C1 + C2) * p3
// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16-bit
// precision. Finally, after rounding the above result by round_0, we multiply
// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
// horizontal filter output.
// Add an offset to account for the "add_src" part of the convolve function. const __m128i zero_128 = _mm_setzero_si128(); const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
for (j = 0; j < w; j += 8) { for (i = 0; i < im_h; i += 2) {
__m256i data = _mm256_castsi128_si256(
_mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
// Load the next line if (i + 1 < im_h)
data = _mm256_inserti128_si256(
data,
_mm_loadu_si128(
(__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
1);
__m256i res = convolve_lowbd_x(data, coeffs_h, filt);
res =
_mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
// multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to // the result
data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
res = _mm256_add_epi16(res, data_0);
res = _mm256_add_epi16(res, round_const_horz); const __m256i res_clamped =
_mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.