Quelle obmc_variance_neon.c

Sprache: C

/*
* Copyright (c) 2023, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "mem_neon.h"
#include "sum_neon.h"

// Accumulates OBMC error statistics for eight pixels using a 16-bit widening
// multiply.
//
// For each of the eight lanes this forms
//   diff = wsrc[i] - pre_s16[i] * mask[i]
// rounds it right by 12 bits (ties away from zero), adds the rounded value to
// *sumv and adds its square to *ssev.
//
//   pre_s16  eight prediction samples already widened to 16 bits.
//   wsrc     pointer to eight 32-bit values the products are subtracted from.
//   mask     pointer to eight 32-bit mask values. Only the even-numbered
//            16-bit lanes of the loaded data are kept, so this presumably
//            relies on every mask value fitting in 16 bits -- TODO confirm.
//   ssev     running per-lane sum of squared rounded differences (in/out).
//   sumv     running per-lane sum of rounded differences (in/out).
static inline void obmc_variance_8x1_s16_neon(int16x8_t pre_s16,
                                              const int32_t *wsrc,
                                              const int32_t *mask,
                                              int32x4_t *ssev,
                                              int32x4_t *sumv) {
  // pre is widened only once (8 -> 16 bits, by the caller) and the mask is
  // narrowed to 16 bits so that a widening multiply-subtract can be used.
  // This was observed to be faster for the 4xh and 8xh paths: widening
  // multiplies have better throughput on some micro-architectures, but for
  // larger blocks the extra instruction needed to narrow the mask outweighs
  // that benefit.
  const int16x8_t mask_bits_lo = vreinterpretq_s16_s32(vld1q_s32(mask));
  const int16x8_t mask_bits_hi = vreinterpretq_s16_s32(vld1q_s32(mask + 4));
  // De-interleave and keep the even-numbered 16-bit lanes of both vectors.
  const int16x8_t mask_narrow = vuzpq_s16(mask_bits_lo, mask_bits_hi).val[0];

  // err = wsrc - pre * mask, computed at 32-bit precision.
  int32x4_t err_lo = vmlsl_s16(vld1q_s32(wsrc), vget_low_s16(pre_s16),
                               vget_low_s16(mask_narrow));
  int32x4_t err_hi = vmlsl_s16(vld1q_s32(wsrc + 4), vget_high_s16(pre_s16),
                               vget_high_s16(mask_narrow));

  // ROUND_POWER_OF_TWO_SIGNED(value, 12) breaks ties away from zero, whereas
  // the NEON rounding shifts break ties upwards. The two only disagree
  // exactly on the tie points of negative values, so adding the sign bit
  // (-1 for negative lanes, 0 otherwise) beforehand shifts those tie points
  // into the correct rounding region.
  err_lo = vsraq_n_s32(err_lo, err_lo, 31);
  err_hi = vsraq_n_s32(err_hi, err_hi, 31);

  // Sum of rounded differences: rounding shift and accumulate in one step.
  *sumv = vrsraq_n_s32(*sumv, err_lo, 12);
  *sumv = vrsraq_n_s32(*sumv, err_hi, 12);

  // Sum of squared rounded differences.
  const int32x4_t rounded_lo = vrshrq_n_s32(err_lo, 12);
  const int32x4_t rounded_hi = vrshrq_n_s32(err_hi, 12);
  *ssev = vmlaq_s32(*ssev, rounded_lo, rounded_lo);
  *ssev = vmlaq_s32(*ssev, rounded_hi, rounded_hi);
}

#if AOM_ARCH_AARCH64

// Byte-shuffle indices used with tbl to zero-extend 8-bit values directly to
// 32 bits in one instruction rather than two widening steps. Each group of
// four indices names one source byte followed by three out-of-range indices
// (255 here); tbl sets the result byte to zero for out-of-range indices.
// The 64 entries form four 16-byte index vectors covering source bytes 0-3,
// 4-7, 8-11 and 12-15 respectively.
DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
  0,  255, 255, 255, 1,  255, 255, 255, 2,  255, 255, 255, 3,  255, 255, 255,
  4,  255, 255, 255, 5,  255, 255, 255, 6,  255, 255, 255, 7,  255, 255, 255,
  8,  255, 255, 255, 9,  255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
  12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
};

// Accumulates OBMC error statistics for eight pixels using 32-bit multiplies.
//
// For each of the eight lanes this forms
//   diff = wsrc[i] - pre[i] * mask[i]
// rounds it right by 12 bits (ties away from zero), adds the rounded value to
// *sumv and adds its square to *ssev.
//
//   pre_lo, pre_hi  prediction samples 0-3 and 4-7, already widened to 32 bits.
//   wsrc            pointer to eight 32-bit values the products are
//                   subtracted from.
//   mask            pointer to eight 32-bit mask values.
//   ssev            running per-lane sum of squared rounded differences
//                   (in/out).
//   sumv            running per-lane sum of rounded differences (in/out).
static inline void obmc_variance_8x1_s32_neon(
    int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc,
    const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) {
  // err = wsrc - pre * mask.
  int32x4_t err_lo = vmlsq_s32(vld1q_s32(wsrc), pre_lo, vld1q_s32(mask));
  int32x4_t err_hi =
      vmlsq_s32(vld1q_s32(wsrc + 4), pre_hi, vld1q_s32(mask + 4));

  // ROUND_POWER_OF_TWO_SIGNED(value, 12) breaks ties away from zero, whereas
  // the NEON rounding shifts break ties upwards. The two only disagree
  // exactly on the tie points of negative values, so adding the sign bit
  // (-1 for negative lanes, 0 otherwise) beforehand shifts those tie points
  // into the correct rounding region.
  err_lo = vsraq_n_s32(err_lo, err_lo, 31);
  err_hi = vsraq_n_s32(err_hi, err_hi, 31);

  // Sum of rounded differences: rounding shift and accumulate in one step.
  *sumv = vrsraq_n_s32(*sumv, err_lo, 12);
  *sumv = vrsraq_n_s32(*sumv, err_hi, 12);

  // Sum of squared rounded differences.
  const int32x4_t rounded_lo = vrshrq_n_s32(err_lo, 12);
  const int32x4_t rounded_hi = vrshrq_n_s32(err_hi, 12);
  *ssev = vmlaq_s32(*ssev, rounded_lo, rounded_lo);
  *ssev = vmlaq_s32(*ssev, rounded_hi, rounded_hi);
}

static inline void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int width,
                                            int height, unsigned *sse,
                                            int *sum) {
  assert(width % 16 == 0);

  // Use tbl for doing a double-width zero extension from 8->32 bits since we
  // can do this in one instruction rather than two.
  uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
  uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
  uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
  uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);

  int32x4_t ssev = vdupq_n_s32(0);
  int32x4_t sumv = vdupq_n_s32(0);

  int h = height;
  do {
    int w = width;
    do {
      uint8x16_t pre_u8 = vld1q_u8(pre);

      int32x4_t pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx0));
      int32x4_t pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx1));
      obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[0], &mask[0],
                                 &ssev, &sumv);

      pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx2));
      pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx3));
      obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[8], &mask[8],
                                 &ssev, &sumv);

      wsrc += 16;
      mask += 16;
      pre += 16;
      w -= 16;
    } while (w != 0);

    pre += pre_stride - width;
  } while (--h != 0);

  *sse = horizontal_add_s32x4(ssev);
  *sum = horizontal_add_s32x4(sumv);
}

#else  // !AOM_ARCH_AARCH64

static inline void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int width,
                                            int height, unsigned *sse,
                                            int *sum) {
  // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the
  // widening version of the core kernel instead.

  assert(width % 16 == 0);

  int32x4_t ssev = vdupq_n_s32(0);
  int32x4_t sumv = vdupq_n_s32(0);

  int h = height;
  do {
    int w = width;
    do {
      uint8x16_t pre_u8 = vld1q_u8(pre);

      int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pre_u8)));
      obmc_variance_8x1_s16_neon(pre_s16, &wsrc[0], &mask[0], &ssev, &sumv);

      pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pre_u8)));
      obmc_variance_8x1_s16_neon(pre_s16, &wsrc[8], &mask[8], &ssev, &sumv);

      wsrc += 16;
      mask += 16;
      pre += 16;
      w -= 16;
    } while (w != 0);

    pre += pre_stride - width;
  } while (--h != 0);

  *sse = horizontal_add_s32x4(ssev);
  *sum = horizontal_add_s32x4(sumv);
}

#endif  // AOM_ARCH_AARCH64

// Width-128 entry point: forwards to obmc_variance_large_neon with the width
// fixed at 128 and h as the height. The name follows the
// obmc_variance_neon_<W>xh pattern that OBMC_VARIANCE_WXH_NEON pastes
// together.
static inline void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int h,
                                            unsigned *sse, int *sum) {
  const int block_width = 128;
  obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h, sse,
                           sum);
}

// Width-64 entry point: forwards to obmc_variance_large_neon with the width
// fixed at 64 and h as the height. The name follows the
// obmc_variance_neon_<W>xh pattern that OBMC_VARIANCE_WXH_NEON pastes
// together.
static inline void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int h,
                                           unsigned *sse, int *sum) {
  const int block_width = 64;
  obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h, sse,
                           sum);
}

// Width-32 entry point: forwards to obmc_variance_large_neon with the width
// fixed at 32 and h as the height. The name follows the
// obmc_variance_neon_<W>xh pattern that OBMC_VARIANCE_WXH_NEON pastes
// together.
static inline void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int h,
                                           unsigned *sse, int *sum) {
  const int block_width = 32;
  obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h, sse,
                           sum);
}

// Width-16 entry point: forwards to obmc_variance_large_neon with the width
// fixed at 16 and h as the height. The name follows the
// obmc_variance_neon_<W>xh pattern that OBMC_VARIANCE_WXH_NEON pastes
// together.
static inline void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int h,
                                           unsigned *sse, int *sum) {
  const int block_width = 16;
  obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h, sse,
                           sum);
}

static inline void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int h,
                                          unsigned *sse, int *sum) {
  int32x4_t ssev = vdupq_n_s32(0);
  int32x4_t sumv = vdupq_n_s32(0);

  do {
    uint8x8_t pre_u8 = vld1_u8(pre);
    int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8));

    obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv);

    pre += pre_stride;
    wsrc += 8;
    mask += 8;
  } while (--h != 0);

  *sse = horizontal_add_s32x4(ssev);
  *sum = horizontal_add_s32x4(sumv);
}

static inline void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int h,
                                          unsigned *sse, int *sum) {
  assert(h % 2 == 0);

  int32x4_t ssev = vdupq_n_s32(0);
  int32x4_t sumv = vdupq_n_s32(0);

  do {
    uint8x8_t pre_u8 = load_unaligned_u8(pre, pre_stride);
    int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8));

    obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv);

    pre += 2 * pre_stride;
    wsrc += 8;
    mask += 8;
    h -= 2;
  } while (h != 0);

  *sse = horizontal_add_s32x4(ssev);
  *sum = horizontal_add_s32x4(sumv);
}

// Defines aom_obmc_variance<W>x<H>_neon(). The generated function calls
// obmc_variance_neon_<W>xh() with height H, which stores the sum of squared
// differences in *sse and produces the sum of differences. The return value
// is *sse minus the squared sum divided by the pixel count W * H; the square
// is formed in 64 bits before the division.
#define OBMC_VARIANCE_WXH_NEON(W, H)                                \
  unsigned aom_obmc_variance##W##x##H##_neon(                       \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,      \
      const int32_t *mask, unsigned *sse) {                         \
    int diff_sum;                                                   \
    obmc_variance_neon_##W##xh(pre, pre_stride, wsrc, mask, H, sse, \
                               &diff_sum);                          \
    const int64_t diff_sum_sq = (int64_t)diff_sum * diff_sum;       \
    return *sse - (unsigned)(diff_sum_sq / (W * H));                \
  }

// Emit one aom_obmc_variance<W>x<H>_neon function per (W, H) pair below. Each
// W must match one of the obmc_variance_neon_<W>xh helpers defined above
// (4, 8, 16, 32, 64 or 128).
OBMC_VARIANCE_WXH_NEON(4, 4)
OBMC_VARIANCE_WXH_NEON(4, 8)
OBMC_VARIANCE_WXH_NEON(8, 4)
OBMC_VARIANCE_WXH_NEON(8, 8)
OBMC_VARIANCE_WXH_NEON(8, 16)
OBMC_VARIANCE_WXH_NEON(16, 8)
OBMC_VARIANCE_WXH_NEON(16, 16)
OBMC_VARIANCE_WXH_NEON(16, 32)
OBMC_VARIANCE_WXH_NEON(32, 16)
OBMC_VARIANCE_WXH_NEON(32, 32)
OBMC_VARIANCE_WXH_NEON(32, 64)
OBMC_VARIANCE_WXH_NEON(64, 32)
OBMC_VARIANCE_WXH_NEON(64, 64)
OBMC_VARIANCE_WXH_NEON(64, 128)
OBMC_VARIANCE_WXH_NEON(128, 64)
OBMC_VARIANCE_WXH_NEON(128, 128)
OBMC_VARIANCE_WXH_NEON(4, 16)
OBMC_VARIANCE_WXH_NEON(16, 4)
OBMC_VARIANCE_WXH_NEON(8, 32)
OBMC_VARIANCE_WXH_NEON(32, 8)
OBMC_VARIANCE_WXH_NEON(16, 64)
OBMC_VARIANCE_WXH_NEON(64, 16)

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.13 Sekunden (vorverarbeitet am 2026-04-27) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.