/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
int i = 0; do {
int32x4_t sum[4];
int32x4_t sse[2];
int16x4_t sum_s16[4];
const int16x8_t r1_l = vld1q_s16(r1 + i); const int16x8_t r1_h = vld1q_s16(r1 + i + 8); const int16x8_t d_l = vld1q_s16(d + i); const int16x8_t d_h = vld1q_s16(d + i + 8); // The following three lines are a bit inelegant compared to using a pair // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair - // which can be executed in parallel with the subsequent SSHL instructions. // (SSHL can only be executed on half of the Neon pipes in modern Arm // cores, whereas ZIP1/2 can be executed on all of them.) const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0)); const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]); const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.