/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
staticinlineunsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // Four accumulator vectors are required to avoid overflow in the 64x128 case.
assert(height <= 128);
uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
/*
 * Masked SAD over a block that is 32 pixels wide and `height` rows tall.
 *
 * Every row is consumed as two 16-byte halves (byte offsets 0 and 16). Both
 * halves are folded into the same running accumulator by
 * masked_sad_16x1_neon(), and the accumulator lanes are reduced to a scalar
 * once, after the last row, by horizontal_add_u16x8().
 *
 * src, a, b and m each advance by their own stride after every row.
 * NOTE(review): presumably a and b are the two predictions and m the blend
 * mask; the per-pixel arithmetic lives in masked_sad_16x1_neon(), which is
 * defined outside this block - confirm there.
 *
 * height must be in [1, 64]. The loop is a do-while, so at least one row is
 * always read; a non-positive height would therefore run past the buffers.
 *
 * Returns the horizontal sum of the 16-bit accumulator lanes.
 */
static inline unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride,
                                            const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            const uint8_t *m, int m_stride,
                                            int height) {
  // We could use a single accumulator up to height=64 without overflow.
  assert(height > 0);
  assert(height <= 64);
  uint16x8_t sad = vdupq_n_u16(0);

  do {
    // Left and right 16-pixel halves of the current row.
    sad = masked_sad_16x1_neon(sad, &src[0], &a[0], &b[0], &m[0]);
    sad = masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);

    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
    height--;
  } while (height != 0);

  return horizontal_add_u16x8(sad);
}
/*
 * Masked SAD over a block that is 16 pixels wide and `height` rows tall.
 *
 * Each row is passed whole to masked_sad_16x1_neon(), which folds it into a
 * single running accumulator; the accumulator lanes are reduced to a scalar
 * once, after the last row, by horizontal_add_u16x8().
 *
 * src, a, b and m each advance by their own stride after every row.
 * NOTE(review): presumably a and b are the two predictions and m the blend
 * mask; the per-pixel arithmetic lives in masked_sad_16x1_neon(), which is
 * defined outside this block - confirm there.
 *
 * height must be in [1, 128]. The loop is a do-while, so at least one row is
 * always read; a non-positive height would therefore run past the buffers.
 *
 * Returns the horizontal sum of the 16-bit accumulator lanes.
 */
static inline unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride,
                                            const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            const uint8_t *m, int m_stride,
                                            int height) {
  // We could use a single accumulator up to height=128 without overflow.
  assert(height > 0);
  assert(height <= 128);
  uint16x8_t sad = vdupq_n_u16(0);

  do {
    sad = masked_sad_16x1_neon(sad, src, a, b, m);

    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
    height--;
  } while (height != 0);

  return horizontal_add_u16x8(sad);
}
/*
 * Masked SAD over a block that is (per its name) 8 pixels wide and `height`
 * rows tall. The visible code zeroes a 4-lane 16-bit accumulator, steps
 * src/a/b/m by their strides once per row until `height` reaches zero, and
 * returns horizontal_add_u16x4() of the accumulator.
 *
 * NOTE(review): this definition is damaged in this copy of the file.
 *  - The leading specifiers are fused into one token; presumably this should
 *    read `static inline unsigned`.
 *  - The `do {` matching the `} while` below, together with whatever
 *    statement(s) accumulate the current row into `sad`, are missing.
 * Restore both from the upstream source before building.
 */
staticinlineunsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // We could use a single accumulator up to height=128 without overflow.
assert(height <= 128);
uint16x4_t sad = vdup_n_u16(0);
// NOTE(review): loop opener and per-row accumulation into `sad` are absent
// here; as written, `sad` is never updated and the `} while` is unmatched.
// Advance every pointer to the next row.
src += src_stride;
a += a_stride;
b += b_stride;
m += m_stride;
height--;
} while (height != 0);
// Collapse the accumulator lanes into the scalar result.
return horizontal_add_u16x4(sad);
}
staticinlineunsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // Process two rows per loop iteration.
assert(height % 2 == 0);
// We could use a single accumulator up to height=256 without overflow.
assert(height <= 256);
uint16x4_t sad = vdup_n_u16(0);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.