/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
// These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register.
int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]);
sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]);
// Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS);
}
// These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register.
int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]);
int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]);
sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]);
sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]);
// Narrow and re-pack. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
vshrn_n_s32(sum4567, ROUND0_BITS));
}
// First (horizontal) pass of a 12-tap 2D convolution, using Armv8.6 i8mm
// USMMLA matrix-multiply instructions. Reads 8-bit samples from src_ptr and
// writes intermediate 16-bit results to dst_ptr.
static inline void convolve_2d_sr_horiz_12tap_neon_i8mm(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int w, int h, const int16_t *x_filter_ptr) {
  // The no-op filter should never be used here.
  assert(x_filter_ptr[5] != 128);

  const int bd = 8;

  // Split 12-tap filter into two 6-tap filters, masking the top two elements.
  // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
  const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
  const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask);
  const int8x8_t filter_1 =
      vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2);

  // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
  // in convolution kernels - which are generally faster than rounding shifts on
  // modern CPUs.
  const int32x4_t horiz_const =
      vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));

  if (w <= 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit
noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.