/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
staticinlinevoid obmc_variance_8x1_s16_neon(int16x8_t pre_s16, const int32_t *wsrc, const int32_t *mask,
int32x4_t *ssev,
int32x4_t *sumv) { // For 4xh and 8xh we observe it is faster to avoid the double-widening of // pre. Instead we do a single widening step and narrow the mask to 16-bits // to allow us to perform a widening multiply. Widening multiply // instructions have better throughput on some micro-architectures but for // the larger block sizes this benefit is outweighed by the additional // instruction needed to first narrow the mask vectors.
// ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
// from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up.
// This difference only affects the bit patterns at the rounding breakpoints
// exactly, so we can add -1 to all negative numbers to move the breakpoint
// one value across and into the correct rounding region.
diff_s32_lo = vsraq_n_s32(diff_s32_lo, diff_s32_lo, 31);
diff_s32_hi = vsraq_n_s32(diff_s32_hi, diff_s32_hi, 31);
int32x4_t round_s32_lo = vrshrq_n_s32(diff_s32_lo, 12);
int32x4_t round_s32_hi = vrshrq_n_s32(diff_s32_hi, 12);
// Use tbl for doing a double-width zero extension from 8->32 bits since we
// can do this in one instruction rather than two (indices out of range
// (255 here) are set to zero by tbl).
DECLARE_ALIGNED(16, staticconst uint8_t, obmc_variance_permute_idx[]) = {
0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
};
// ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from
// zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. This
// difference only affects the bit patterns at the rounding breakpoints
// exactly, so we can add -1 to all negative numbers to move the breakpoint
// one value across and into the correct rounding region.
diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
staticinlinevoid obmc_variance_large_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height, unsigned *sse, int *sum) {
assert(width % 16 == 0);
// Use tbl for doing a double-width zero extension from 8->32 bits since we
// can do this in one instruction rather than two.
uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
staticinlinevoid obmc_variance_large_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height, unsigned *sse, int *sum) { // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the // widening version of the core kernel instead.
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.