/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
// Dot product constants:
// Accumulate into 128 << FILTER_BITS to account for range transform.
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
// right shift by FILTER_BITS - instead of a first rounding right shift by
// ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
// ROUND0_BITS.
// NOTE(review): truncated fragment — the enclosing function's signature was
// lost in extraction. The three laneq dot products over 12 taps suggest a
// convolve12_4_x-style x-filter helper; confirm against the original file.
// Code left byte-identical pending recovery.
int32x4_t acc =
vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1))));
// Three 4-tap dot products accumulate all 12 filter taps into 'sum'.
int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0);
sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1);
sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2);
// Single rounding shift performs both rounding stages at once (see above).
return vqrshrn_n_s32(sum, FILTER_BITS);
}
// NOTE(review): fragment — 'staticinline' is a mangled 'static inline', and
// the body is truncated after computing 'acc' (the dot products, narrowing,
// and return were lost in extraction). Code left byte-identical.
staticinline uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128[2] = {
vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
};
// Dot product constants:
// Accumulate into 128 << FILTER_BITS to account for range transform.
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
// right shift by FILTER_BITS - instead of a first rounding right shift by
// ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
// ROUND0_BITS.
int32x4_t acc =
vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1))));
// NOTE(review): fragment — 'staticinlinevoid' is a mangled 'static inline
// void'; the body jumps from the assert straight into the tail of an
// unrelated 4-tap helper ('perm_samples'/'filters' are undeclared in this
// view, and the 'return vmovn_s32(sum);' below was swallowed into a comment
// by a bad line merge). Code left byte-identical pending recovery.
staticinlinevoid convolve_x_sr_12tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // The no-op filter should never be used here.
assert(x_filter_ptr[5] != 128);
// Dot product constants:
// Accumulate into 128 << FILTER_BITS to account for range transform.
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
// right shift by FILTER_BITS - instead of a first rounding right shift by
// ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
// ROUND0_BITS. Halve the total because we halved the filter values.
int32x4_t acc =
vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
// Further narrowing and packing is performed by the caller. return vmovn_s32(sum);
}
// NOTE(review): fragment — 'staticinline' mangled; the interior lines that
// compute sum0123/sum4567 are missing, and the final
// 'return vqrshrun_n_s16(...)' was swallowed into a comment by a bad line
// merge. Code left byte-identical pending recovery.
staticinline uint8x8_t convolve4_8_x(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128 =
vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
// Dot product constants:
// Accumulate into 128 << FILTER_BITS to account for range transform.
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
// right shift by FILTER_BITS - instead of a first rounding right shift by
// ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
// ROUND0_BITS. Halve the total because we halved the filter values.
int32x4_t acc =
vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
// Narrow and re-pack.
int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
// NOTE(review): fragment — 'staticinlinevoid' mangled; note that on the line
// below ending "...) {", everything after the first '//' (including the
// 'const int8x8_t filter = ...' declaration) has been swallowed into the
// comment by a bad line merge, as has the final 'return vqrshrun_n_s16(...)'.
// sum0123/sum4567 are undeclared in this view (interior lines missing).
// Code left byte-identical pending recovery.
staticinlinevoid convolve_x_sr_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
if (width == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
// Dot product constants:
// Accumulate into 128 << FILTER_BITS to account for range transform.
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
// right shift by FILTER_BITS - instead of a first rounding right shift by
// ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
// ROUND0_BITS. Halve the total because we halved the filter values.
int32x4_t acc =
vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
// Narrow and re-pack.
int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
}
// NOTE(review): fragment of the public x-filter entry point — 'constint' is a
// mangled 'const int'; the function is truncated after the w/h == 2 fallback
// to the C implementation (filter selection and dispatch are missing from
// this view). Code left byte-identical pending recovery.
void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, constint subpel_x_qn,
ConvolveParams *conv_params) { if (w == 2 || h == 2) {
// Dimensions of 2 are too small to vectorise profitably; defer to C.
av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
subpel_x_qn, conv_params); return;
}
// Apply a 12-tap y-filter to four pixels using three 4-element signed dot
// products. Inputs s0..s2 hold the (range-transformed, permuted) samples;
// filters_0_7/filters_4_11 hold the halves of the 12-tap filter.
// Returns the saturating-narrowed 16-bit partial sums; further narrowing and
// packing is performed by the caller.
static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
                                       const int8x16_t s2,
                                       const int8x8_t filters_0_7,
                                       const int8x8_t filters_4_11) {
  // The sample range transform and permutation are performed by the caller.
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
  int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0);
  sum = vdotq_lane_s32(sum, s1, filters_0_7, 1);
  // Taps 8-11 live in the upper half of filters_4_11, hence lane 1.
  sum = vdotq_lane_s32(sum, s2, filters_4_11, 1);

  // Further narrowing and packing is performed by the caller.
  return vqmovn_s32(sum);
}
// NOTE(review): fragment — 'staticinline' mangled; the 'const int32x4_t acc'
// declaration at the end of the signature line is inside a comment (bad line
// merge), and the interior lines computing sum0123/sum4567 are missing.
// Code left byte-identical pending recovery.
staticinline uint8x8_t convolve12_8_y( const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi, const int8x8_t filters_0_7, const int8x8_t filters_4_11) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 << FILTER_BITS to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
// Narrow and re-pack.
int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); return vqrshrun_n_s16(sum, FILTER_BITS);
}
// NOTE(review): fragment — 'staticinlinevoid' mangled; the body jumps from
// the assert directly into the loop-tail "shuffle up four rows" bookkeeping
// of the column loop. The do { ... } loop headers, sample loads, and the
// per-row convolution calls are missing, so s0123_lo etc., s, d, and height
// are all undeclared in this view. Code left byte-identical pending recovery.
staticinlinevoid convolve_y_sr_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { // The no-op filter should never be used here.
assert(y_filter_ptr[5] != 128);
// Prepare block for next iteration - re-using as much as possible.
// Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
s1234_hi = s5678_hi;
s2345_lo = s6789_lo;
s2345_hi = s6789_hi;
s3456_lo = s789A_lo;
s3456_hi = s789A_hi;
s4567_lo = s89AB_lo;
s4567_hi = s89AB_hi;
s5678_lo = s9ABC_lo;
s5678_hi = s9ABC_hi;
s6789_lo = sABCD_lo;
s6789_hi = sABCD_hi;
s789A_lo = sBCDE_lo;
s789A_hi = sBCDE_hi;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src_ptr += 8;
dst_ptr += 8;
w -= 8;
} while (w != 0);
}
}
// Apply an 8-tap y-filter to four pixels using two 4-element signed dot
// products. Inputs s0/s1 hold the (range-transformed, permuted) samples;
// 'filters' holds the 8 halved-precision filter taps.
// Returns the saturating-narrowed 16-bit partial sums; further narrowing and
// packing is performed by the caller.
static inline int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
  int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0);
  sum = vdotq_lane_s32(sum, s1, filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vqmovn_s32(sum);
}
// NOTE(review): fragment — 'staticinline' mangled; the 'acc' declaration on
// the signature line sits inside a comment (bad line merge). The body then
// jumps into loop-tail bookkeeping of a different (vertical driver) function
// and finally into the tail of an x/2d horizontal helper whose
// 'return vshrn_n_s32(...)' is also swallowed into a comment. s0123..s3456,
// horiz_const, perm_samples and filters are all undeclared in this view.
// Code left byte-identical pending recovery.
staticinline uint8x8_t convolve8_8_y(const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, const int8x16_t s1_hi, const int8x8_t filters) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 << FILTER_BITS to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
// Prepare block for next iteration - re-using as much as possible.
// Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
s3456 = s78910;
src_ptr += 4 * src_stride;
dst_ptr += 4 * dst_stride;
h -= 4;
} while (h != 0);
} else { do { int height = h; const uint8_t *s = src_ptr;
uint8_t *d = dst_ptr;
// Accumulate dot product into 'correction' to account for range transform.
int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1);
sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2);
// Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS);
}
// NOTE(review): fragment — 'staticinline' mangled; the interior (dot
// products computing sum0123/sum4567) is missing, and the leading
// 'return vcombine_s16(' of the final statement was swallowed into a comment
// by a bad line merge, orphaning the continuation line below it.
// Code left byte-identical pending recovery.
staticinline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], const int8x16_t filters, const int32x4_t correction, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product.
int8x16_t samples_128[2] = {
vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
};
// Narrow and re-pack. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
vshrn_n_s32(sum4567, ROUND0_BITS));
}
// NOTE(review): fragment — 'staticinlinevoid' / 'constint' mangled; the body
// jumps from the assert into the tail of a separate 4-pixel 2d_h helper
// (correction/perm_samples/filters undeclared here), and the leading
// 'return vcombine_s16(' was swallowed into a comment, orphaning the
// continuation line. Code left byte-identical pending recovery.
staticinlinevoid convolve_2d_sr_horiz_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, constint dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { // The no-op filter should never be used here.
assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
// Accumulate into 'correction' to account for range transform.
int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);
// Narrow and re-pack. // We halved the filter values so -1 from right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
vshrn_n_s32(sum4567, ROUND0_BITS - 1));
}
// NOTE(review): fragment — 'staticinlinevoid' / 'constint' mangled; several
// declarations on the merged lines below ('const int8x8_t filter = ...',
// 'const int32x4_t correction = ...') fall after a '//' and are therefore
// comment text, not code. The body is truncated inside 'if (w == 4)', and the
// final lines are the orphaned tail of a different helper whose return was
// swallowed into a comment. Code left byte-identical pending recovery.
staticinlinevoid convolve_2d_sr_horiz_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { constint bd = 8; const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const =
((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. // Halve the total because we halved the filter values. const int32x4_t correction =
vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
// Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
vshrn_n_s32(sum4567, ROUND0_BITS - 1));
}
// NOTE(review): fragment — 'staticinlinevoid' / 'constint' mangled; as with
// the other merged lines, declarations following a '//' on the same line
// ('const int8x8_t x_filter', 'horiz_const', 'correction') are comment text,
// not code. The inner 8-pixel convolution loop between the do-loop setup and
// the 's += 8' tail is missing. Code left byte-identical pending recovery.
staticinlinevoid convolve_2d_sr_horiz_8tap_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
constint bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const =
((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Halve the total because we halved the filter values. const int32x4_t correction =
vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
const uint8_t *src_ptr = src;
int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h;
const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { const uint8_t *s = src_ptr;
int16_t *d = dst_ptr; int width = w;
s += 8;
d += 8;
width -= 8;
} while (width != 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
}
// NOTE(review): fragment — 'staticinlinevoid' / 'constint' mangled;
// declarations trapped after '//' on merged lines are comment text, and the
// inner per-block horizontal+vertical convolution work between the do-loop
// setup and the 's += 4 * src_stride' tail is missing. The outer do-loop
// header matching '} while (w != 0);' is also absent.
// Code left byte-identical pending recovery.
staticinlinevoid convolve_2d_sr_6tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
constint bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const =
((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. // Halve the total because we halved the filter values. const int32x4_t correction =
vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
do { const uint8_t *s = src;
uint8_t *d = dst; int height = h;
s += 4 * src_stride;
d += 4 * dst_stride;
height -= 4;
} while (height != 0);
src += 8;
dst += 8;
w -= 8;
} while (w != 0);
}
// NOTE(review): fragment — 'staticinlinevoid' / 'constint' mangled;
// declarations after '//' on the merged lines are comment text, and the
// function is truncated mid-body at 'if (w == 4) {' (the remainder of the
// file appears to have been replaced by unrelated prose).
// Code left byte-identical pending recovery.
staticinlinevoid convolve_2d_sr_4tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { constint bd = 8; const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t x_filter =
vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1);
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const =
((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. // Halve the total because we halved the filter values. const int32x4_t correction =
vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.