SSL highbd_intrapred_neon.c

Sprache: C

/*
* Copyright (c) 2022, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// DC

static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
                                       uint16x4_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1_u16(dst + i * stride, dc);
  }
}

static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
                                       uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
  }
}

static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
                                        uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
    vst1q_u16(dst + i * stride + 8, dc);
  }
}

static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
                                        uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
    vst1q_u16(dst + i * stride + 8, dc);
    vst1q_u16(dst + i * stride + 16, dc);
    vst1q_u16(dst + i * stride + 24, dc);
  }
}

static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
                                        uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
    vst1q_u16(dst + i * stride + 8, dc);
    vst1q_u16(dst + i * stride + 16, dc);
    vst1q_u16(dst + i * stride + 24, dc);
    vst1q_u16(dst + i * stride + 32, dc);
    vst1q_u16(dst + i * stride + 40, dc);
    vst1q_u16(dst + i * stride + 48, dc);
    vst1q_u16(dst + i * stride + 56, dc);
  }
}

static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
  // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so
  // promote first.
  const uint32x4_t b = vpaddlq_u16(a);
#if AOM_ARCH_AARCH64
  const uint32x4_t c = vpaddq_u32(b, b);
  return vpaddq_u32(c, c);
#else
  const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
  const uint32x2_t d = vpadd_u32(c, c);
  return vcombine_u32(d, d);
#endif
}

static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
  // Nothing to do since sum is already one vector, but saves needing to
  // special case w=4 or h=4 cases. The combine will be zero cost for a sane
  // compiler since vld1 already sets the top half of a vector to zero as part
  // of the operation.
  return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
}

static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
  // Nothing to do since sum is already one vector, but saves needing to
  // special case w=8 or h=8 cases.
  return vld1q_u16(left);
}

static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
  const uint16x8_t a1 = vld1q_u16(left + 8);
  return vaddq_u16(a0, a1);  // up to 13 bits
}

static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
  const uint16x8_t a1 = vld1q_u16(left + 8);
  const uint16x8_t a2 = vld1q_u16(left + 16);
  const uint16x8_t a3 = vld1q_u16(left + 24);
  const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
  const uint16x8_t b1 = vaddq_u16(a2, a3);
  return vaddq_u16(b0, b1);  // up to 14 bits
}

static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
  const uint16x8_t a1 = vld1q_u16(left + 8);
  const uint16x8_t a2 = vld1q_u16(left + 16);
  const uint16x8_t a3 = vld1q_u16(left + 24);
  const uint16x8_t a4 = vld1q_u16(left + 32);
  const uint16x8_t a5 = vld1q_u16(left + 40);
  const uint16x8_t a6 = vld1q_u16(left + 48);
  const uint16x8_t a7 = vld1q_u16(left + 56);
  const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
  const uint16x8_t b1 = vaddq_u16(a2, a3);
  const uint16x8_t b2 = vaddq_u16(a4, a5);
  const uint16x8_t b3 = vaddq_u16(a6, a7);
  const uint16x8_t c0 = vaddq_u16(b0, b1);  // up to 14 bits
  const uint16x8_t c1 = vaddq_u16(b2, b3);
  return vaddq_u16(c0, c1);  // up to 15 bits
}

#define HIGHBD_DC_PREDICTOR(w, h, shift)                               \
  void aom_highbd_dc_predictor_##w##x##h##_neon(                       \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,          \
      const uint16_t *left, int bd) {                                  \
    (void)bd;                                                          \
    const uint16x8_t a = highbd_dc_load_partial_sum_##w(above);        \
    const uint16x8_t l = highbd_dc_load_partial_sum_##h(left);         \
    const uint32x4_t sum =                                             \
        horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l));      \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, shift);                   \
    highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
  }

void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // In the rectangular cases we simply extend the shorter vector to uint16x8
  // in order to accumulate, however in the 4x4 case there is no shorter vector
  // to extend so it is beneficial to do the whole calculation in uint16x4
  // instead.
  (void)bd;
  const uint16x4_t a = vld1_u16(above);  // up to 12 bits
  const uint16x4_t l = vld1_u16(left);
  uint16x4_t sum = vpadd_u16(a, l);  // up to 13 bits
  sum = vpadd_u16(sum, sum);         // up to 14 bits
  sum = vpadd_u16(sum, sum);
  const uint16x4_t dc = vrshr_n_u16(sum, 3);
  highbd_dc_store_4xh(dst, stride, 4, dc);
}

HIGHBD_DC_PREDICTOR(8, 8, 4)
HIGHBD_DC_PREDICTOR(16, 16, 5)
HIGHBD_DC_PREDICTOR(32, 32, 6)
HIGHBD_DC_PREDICTOR(64, 64, 7)

#undef HIGHBD_DC_PREDICTOR

static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int interm = num >> shift1;
  return interm * multiplier >> shift2;
}

#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
#define HIGHBD_DC_SHIFT2 17

static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
                                           uint32_t multiplier) {
  return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
                                     HIGHBD_DC_SHIFT2);
}

#undef HIGHBD_DC_SHIFT2

#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult)                  \
  void aom_highbd_dc_predictor_##w##x##h##_neon(                        \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,           \
      const uint16_t *left, int bd) {                                   \
    (void)bd;                                                           \
    uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above);       \
    uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left);         \
    uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above);                \
    int sum = horizontal_add_u16x8(sum_vec);                            \
    int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0));    \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
#else
HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DC_PREDICTOR_RECT
#undef HIGHBD_DC_MULTIPLIER_1X2
#undef HIGHBD_DC_MULTIPLIER_1X4

// -----------------------------------------------------------------------------
// DC_128

#define HIGHBD_DC_PREDICTOR_128(w, h, q)                        \
  void aom_highbd_dc_128_predictor_##w##x##h##_neon(            \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)above;                                                \
    (void)bd;                                                   \
    (void)left;                                                 \
    highbd_dc_store_##w##xh(dst, stride, (h),                   \
                            vdup##q##_n_u16(0x80 << (bd - 8))); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DC_PREDICTOR_128(4, 4, )
HIGHBD_DC_PREDICTOR_128(4, 8, )
HIGHBD_DC_PREDICTOR_128(4, 16, )
HIGHBD_DC_PREDICTOR_128(8, 4, q)
HIGHBD_DC_PREDICTOR_128(8, 8, q)
HIGHBD_DC_PREDICTOR_128(8, 16, q)
HIGHBD_DC_PREDICTOR_128(8, 32, q)
HIGHBD_DC_PREDICTOR_128(16, 4, q)
HIGHBD_DC_PREDICTOR_128(16, 8, q)
HIGHBD_DC_PREDICTOR_128(16, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 32, q)
HIGHBD_DC_PREDICTOR_128(16, 64, q)
HIGHBD_DC_PREDICTOR_128(32, 8, q)
HIGHBD_DC_PREDICTOR_128(32, 16, q)
HIGHBD_DC_PREDICTOR_128(32, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 64, q)
HIGHBD_DC_PREDICTOR_128(64, 16, q)
HIGHBD_DC_PREDICTOR_128(64, 32, q)
HIGHBD_DC_PREDICTOR_128(64, 64, q)
#else
HIGHBD_DC_PREDICTOR_128(4, 4, )
HIGHBD_DC_PREDICTOR_128(4, 8, )
HIGHBD_DC_PREDICTOR_128(8, 4, q)
HIGHBD_DC_PREDICTOR_128(8, 8, q)
HIGHBD_DC_PREDICTOR_128(8, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 8, q)
HIGHBD_DC_PREDICTOR_128(16, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 16, q)
HIGHBD_DC_PREDICTOR_128(32, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 64, q)
HIGHBD_DC_PREDICTOR_128(64, 32, q)
HIGHBD_DC_PREDICTOR_128(64, 64, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DC_PREDICTOR_128

// -----------------------------------------------------------------------------
// DC_LEFT

static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
  const uint16x4_t a = vld1_u16(left);   // up to 12 bits
  const uint16x4_t b = vpadd_u16(a, a);  // up to 13 bits
  return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
}

static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
}

static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(
      highbd_dc_load_partial_sum_16(left));
}

static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(
      highbd_dc_load_partial_sum_32(left));
}

static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(
      highbd_dc_load_partial_sum_64(left));
}

#define DC_PREDICTOR_LEFT(w, h, shift, q)                                  \
  void aom_highbd_dc_left_predictor_##w##x##h##_neon(                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    (void)above;                                                           \
    (void)bd;                                                              \
    const uint32x4_t sum = highbd_dc_load_sum_##h(left);                   \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_LEFT(4, 4, 2, )
DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(8, 4, 2, q)
DC_PREDICTOR_LEFT(8, 8, 3, q)
DC_PREDICTOR_LEFT(8, 16, 4, q)
DC_PREDICTOR_LEFT(8, 32, 5, q)
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 16, 4, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 32, 5, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(64, 64, 6, q)
#else
DC_PREDICTOR_LEFT(4, 4, 2, )
DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, q)
DC_PREDICTOR_LEFT(8, 8, 3, q)
DC_PREDICTOR_LEFT(8, 16, 4, q)
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 16, 4, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 32, 5, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(64, 64, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_LEFT

// -----------------------------------------------------------------------------
// DC_TOP

#define DC_PREDICTOR_TOP(w, h, shift, q)                                   \
  void aom_highbd_dc_top_predictor_##w##x##h##_neon(                       \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    (void)bd;                                                              \
    (void)left;                                                            \
    const uint32x4_t sum = highbd_dc_load_sum_##w(above);                  \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(4, 4, 2, )
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(8, 4, 3, q)
DC_PREDICTOR_TOP(8, 8, 3, q)
DC_PREDICTOR_TOP(8, 16, 3, q)
DC_PREDICTOR_TOP(8, 32, 3, q)
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 16, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 32, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
DC_PREDICTOR_TOP(64, 32, 6, q)
DC_PREDICTOR_TOP(64, 64, 6, q)
#else
DC_PREDICTOR_TOP(4, 4, 2, )
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(8, 4, 3, q)
DC_PREDICTOR_TOP(8, 8, 3, q)
DC_PREDICTOR_TOP(8, 16, 3, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 16, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 32, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 32, 6, q)
DC_PREDICTOR_TOP(64, 64, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_TOP

// -----------------------------------------------------------------------------
// V_PRED

#define HIGHBD_V_NXM(W, H)                                    \
  void aom_highbd_v_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    (void)left;                                               \
    (void)bd;                                                 \
    vertical##W##xh_neon(dst, stride, above, H);              \
  }

static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
  uint16x8x2_t x;
  // Clang/gcc uses ldp here.
  x.val[0] = vld1q_u16(ptr);
  x.val[1] = vld1q_u16(ptr + 8);
  return x;
}

static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
  vst1q_u16(ptr, x.val[0]);
  vst1q_u16(ptr + 8, x.val[1]);
}

static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x4_t row = vld1_u16(above);
  int y = height;
  do {
    vst1_u16(dst, row);
    vst1_u16(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x8_t row = vld1q_u16(above);
  int y = height;
  do {
    vst1q_u16(dst, row);
    vst1q_u16(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x2_t row = load_uint16x8x2(above);
  int y = height;
  do {
    store_uint16x8x2(dst, row);
    store_uint16x8x2(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
  uint16x8x4_t x;
  // Clang/gcc uses ldp here.
  x.val[0] = vld1q_u16(ptr);
  x.val[1] = vld1q_u16(ptr + 8);
  x.val[2] = vld1q_u16(ptr + 16);
  x.val[3] = vld1q_u16(ptr + 24);
  return x;
}

static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
  vst1q_u16(ptr, x.val[0]);
  vst1q_u16(ptr + 8, x.val[1]);
  vst1q_u16(ptr + 16, x.val[2]);
  vst1q_u16(ptr + 24, x.val[3]);
}

static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t row = load_uint16x8x4(above);
  int y = height;
  do {
    store_uint16x8x4(dst, row);
    store_uint16x8x4(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  uint16_t *dst32 = dst + 32;
  const uint16x8x4_t row = load_uint16x8x4(above);
  const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
  int y = height;
  do {
    store_uint16x8x4(dst, row);
    store_uint16x8x4(dst32, row32);
    store_uint16x8x4(dst + stride, row);
    store_uint16x8x4(dst32 + stride, row32);
    dst += stride << 1;
    dst32 += stride << 1;
    y -= 2;
  } while (y != 0);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)
HIGHBD_V_NXM(4, 16)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)
HIGHBD_V_NXM(8, 32)

HIGHBD_V_NXM(16, 4)
HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)
HIGHBD_V_NXM(16, 64)

HIGHBD_V_NXM(32, 8)
HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 16)
HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
#else
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)

HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)

HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// -----------------------------------------------------------------------------
// H_PRED

static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                      uint16x4_t left) {
  vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
  vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
  vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
  vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
}

static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
                                      uint16x4_t left) {
  vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
  vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
  vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
  vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
  vst1q_u16(dst + 0, left);
  vst1q_u16(dst + 8, left);
}

static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
                                       uint16x4_t left) {
  highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
  highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
  highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
  highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
  vst1q_u16(dst + 0, left);
  vst1q_u16(dst + 8, left);
  vst1q_u16(dst + 16, left);
  vst1q_u16(dst + 24, left);
}

static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
                                       uint16x4_t left) {
  highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
  highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
  highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
  highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
  vst1q_u16(dst + 0, left);
  vst1q_u16(dst + 8, left);
  vst1q_u16(dst + 16, left);
  vst1q_u16(dst + 24, left);
  vst1q_u16(dst + 32, left);
  vst1q_u16(dst + 40, left);
  vst1q_u16(dst + 48, left);
  vst1q_u16(dst + 56, left);
}

static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
                                       uint16x4_t left) {
  highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
  highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
  highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
  highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  highbd_h_store_4x4(dst, stride, vld1_u16(left));
}

void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
}

void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  highbd_h_store_8x4(dst, stride, vld1_u16(left));
}

void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  highbd_h_store_16x4(dst, stride, vld1_u16(left));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// For cases where height >= 16 we use pairs of loads to get LDP instructions.
#define HIGHBD_H_WXH_LARGE(w, h)                                            \
  void aom_highbd_h_predictor_##w##x##h##_neon(                             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
      const uint16_t *left, int bd) {                                       \
    (void)above;                                                            \
    (void)bd;                                                               \
    for (int i = 0; i < (h) / 16; ++i) {                                    \
      uint16x8_t l0 = vld1q_u16(left + 0);                                  \
      uint16x8_t l1 = vld1q_u16(left + 8);                                  \
      highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0));   \
      highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0));  \
      highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1));   \
      highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
      left += 16;                                                           \
      dst += 16 * stride;                                                   \
    }                                                                       \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_H_WXH_LARGE(4, 16)
HIGHBD_H_WXH_LARGE(8, 16)
HIGHBD_H_WXH_LARGE(8, 32)
HIGHBD_H_WXH_LARGE(16, 16)
HIGHBD_H_WXH_LARGE(16, 32)
HIGHBD_H_WXH_LARGE(16, 64)
HIGHBD_H_WXH_LARGE(32, 16)
HIGHBD_H_WXH_LARGE(32, 32)
HIGHBD_H_WXH_LARGE(32, 64)
HIGHBD_H_WXH_LARGE(64, 16)
HIGHBD_H_WXH_LARGE(64, 32)
HIGHBD_H_WXH_LARGE(64, 64)
#else
HIGHBD_H_WXH_LARGE(8, 16)
HIGHBD_H_WXH_LARGE(16, 16)
HIGHBD_H_WXH_LARGE(16, 32)
HIGHBD_H_WXH_LARGE(32, 16)
HIGHBD_H_WXH_LARGE(32, 32)
HIGHBD_H_WXH_LARGE(32, 64)
HIGHBD_H_WXH_LARGE(64, 32)
HIGHBD_H_WXH_LARGE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_H_WXH_LARGE

// -----------------------------------------------------------------------------
// PAETH

static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
                                              const uint16_t *const top_row,
                                              const uint16_t *const left_column,
                                              int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint16x8_t top;
  if (width == 4) {
    top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
  } else {  // width == 8
    top = vld1q_u16(top_row);
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);

    const uint16x8_t left_dist = vabdq_u16(top, top_left);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    const uint16x8_t top_left_dist =
        vabdq_u16(vaddq_u16(top, left), top_left_x2);

    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);

    // if (left_dist <= top_dist && left_dist <= top_left_dist)
    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
    //   dest[x] = left_column[y];
    // Fill all the unused spaces with 'top'. They will be overwritten when
    // the positions for top_left are known.
    uint16x8_t result = vbslq_u16(left_mask, left, top);
    // else if (top_dist <= top_left_dist)
    //   dest[x] = top_row[x];
    // Add these values to the mask. They were already set.
    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
    // else
    //   dest[x] = top_left;
    result = vbslq_u16(left_or_top_mask, result, top_left);

    if (width == 4) {
      vst1_u16(dest, vget_low_u16(result));
    } else {  // width == 8
      vst1q_u16(dest, result);
    }
    dest += stride;
  }
}

#define HIGHBD_PAETH_NXM(W, H)                                  \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(4, 16)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
HIGHBD_PAETH_NXM(8, 32)
#else
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Select the closest values and collect them.
static inline uint16x8_t select_paeth(const uint16x8_t top,
                                      const uint16x8_t left,
                                      const uint16x8_t top_left,
                                      const uint16x8_t left_le_top,
                                      const uint16x8_t left_le_top_left,
                                      const uint16x8_t top_le_top_left) {
  // if (left_dist <= top_dist && left_dist <= top_left_dist)
  const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
  //   dest[x] = left_column[y];
  // Fill all the unused spaces with 'top'. They will be overwritten when
  // the positions for top_left are known.
  const uint16x8_t result = vbslq_u16(left_mask, left, top);
  // else if (top_dist <= top_left_dist)
  //   dest[x] = top_row[x];
  // Add these values to the mask. They were already set.
  const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
  // else
  //   dest[x] = top_left;
  return vbslq_u16(left_or_top_mask, result, top_left);
}

#define PAETH_PREDICTOR(num)                                                  \
  do {                                                                        \
    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
    const uint16x8_t top_left_dist =                                          \
        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
    const uint16x8_t result =                                                 \
        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
                     top_le_top_left);                                        \
    vst1q_u16(dest + (num * 8), result);                                      \
  } while (0)

#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))

static inline void highbd_paeth16_plus_x_h_neon(
    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
    const uint16_t *const left_column, int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint16x8_t top[8];
  top[0] = LOAD_TOP_ROW(0);
  top[1] = LOAD_TOP_ROW(1);
  if (width > 16) {
    top[2] = LOAD_TOP_ROW(2);
    top[3] = LOAD_TOP_ROW(3);
    if (width == 64) {
      top[4] = LOAD_TOP_ROW(4);
      top[5] = LOAD_TOP_ROW(5);
      top[6] = LOAD_TOP_ROW(6);
      top[7] = LOAD_TOP_ROW(7);
    }
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    PAETH_PREDICTOR(0);
    PAETH_PREDICTOR(1);
    if (width > 16) {
      PAETH_PREDICTOR(2);
      PAETH_PREDICTOR(3);
      if (width == 64) {
        PAETH_PREDICTOR(4);
        PAETH_PREDICTOR(5);
        PAETH_PREDICTOR(6);
        PAETH_PREDICTOR(7);
      }
    }
    dest += stride;
  }
}

#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
      const uint16_t *left, int bd) {                             \
    (void)bd;                                                     \
    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_PAETH_NXM_WIDE(16, 4)
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(16, 64)
HIGHBD_PAETH_NXM_WIDE(32, 8)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 16)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
#else
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// -----------------------------------------------------------------------------
// SMOOTH

// 256 - v = vneg_s8(v)
static inline uint16x4_t negate_s8(const uint16x4_t v) {
  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
}

static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *const top_row,
                                          const uint16_t *const left_column,
                                          const int height) {
  const uint16_t top_right = top_row[3];
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_v = vld1_u16(top_row);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
  const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
  const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);

  for (int y = 0; y < height; ++y) {
    // Each variable in the running summation is named for the last item to be
    // accumulated.
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
    const uint32x4_t weighted_bl =
        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);

    const uint16x4_t pred =
        vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
    vst1_u16(dst, pred);
    dst += stride;
  }
}

// Common code between 8xH and [16|32|64]xH.
static inline void highbd_calculate_pred8(
    uint16_t *dst, const uint32x4_t weighted_corners_low,
    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
    const uint16x4x2_t weights_x, const uint16_t left_y,
    const uint16_t weight_y) {
  // Each variable in the running summation is named for the last item to be
  // accumulated.
  const uint32x4_t weighted_top_low =
      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
  const uint32x4_t weighted_edges_low =
      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);

  const uint16x4_t pred_low =
      vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
  vst1_u16(dst, pred_low);

  const uint32x4_t weighted_top_high =
      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
  const uint32x4_t weighted_edges_high =
      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);

  const uint16x4_t pred_high =
      vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
  vst1_u16(dst + 4, pred_high);
}

static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *const top_row,
                                   const uint16_t *const left_column,
                                   const int height) {
  const uint16_t top_right = top_row[7];
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4x2_t top_vals = { { vld1_u16(top_row),
                                    vld1_u16(top_row + 4) } };
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
                                     vld1_u16(smooth_weights_u16 + 8) } };
  const uint32x4_t weighted_tr_low =
      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    const uint32x4_t weighted_corners_low =
        vaddq_u32(weighted_bl, weighted_tr_low);
    const uint32x4_t weighted_corners_high =
        vaddq_u32(weighted_bl, weighted_tr_high);
    highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
                           top_vals, weights_x, left_column[y], weights_y[y]);
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_NXM(W, H)                                 \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(4, 16)
HIGHBD_SMOOTH_NXM(8, 16)
HIGHBD_SMOOTH_NXM(8, 32)
#else
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_NXM

// For width 16 and above.
#define HIGHBD_SMOOTH_PREDICTOR(W)                                             \
  static void highbd_smooth_##W##xh_neon(                                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,          \
      const uint16_t *const left_column, const int height) {                   \
    const uint16_t top_right = top_row[(W)-1];                                 \
    const uint16_t bottom_left = left_column[height - 1];                      \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;         \
                                                                               \
    /* Precompute weighted values that don't vary with |y|. */                 \
    uint32x4_t weighted_tr_low[(W) >> 3];                                      \
    uint32x4_t weighted_tr_high[(W) >> 3];                                     \
    for (int i = 0; i < (W) >> 3; ++i) {                                       \
      const int x = i << 3;                                                    \
      const uint16x4_t weights_x_low =                                         \
          vld1_u16(smooth_weights_u16 + (W)-4 + x);                            \
      weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right);   \
      const uint16x4_t weights_x_high =                                        \
          vld1_u16(smooth_weights_u16 + (W) + x);                              \
      weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
    }                                                                          \
                                                                               \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                  \
    for (int y = 0; y < height; ++y) {                                         \
      const uint32x4_t weighted_bl =                                           \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                      \
      uint16_t *dst_x = dst;                                                   \
      for (int i = 0; i < (W) >> 3; ++i) {                                     \
        const int x = i << 3;                                                  \
        const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),               \
                                          vld1_u16(top_row + x + 4) } };       \
        const uint32x4_t weighted_corners_low =                                \
            vaddq_u32(weighted_bl, weighted_tr_low[i]);                        \
        const uint32x4_t weighted_corners_high =                               \
            vaddq_u32(weighted_bl, weighted_tr_high[i]);                       \
        /* Accumulate weighted edge values and store. */                       \
        const uint16x4x2_t weights_x = {                                       \
          { vld1_u16(smooth_weights_u16 + (W)-4 + x),                          \
            vld1_u16(smooth_weights_u16 + (W) + x) }                           \
        };                                                                     \
        highbd_calculate_pred8(dst_x, weighted_corners_low,                    \
                               weighted_corners_high, top_vals, weights_x,     \
                               left_column[y], weights_y[y]);                  \
        dst_x += 8;                                                            \
      }                                                                        \
      dst += stride;                                                           \
    }                                                                          \
  }

HIGHBD_SMOOTH_PREDICTOR(16)
HIGHBD_SMOOTH_PREDICTOR(32)
HIGHBD_SMOOTH_PREDICTOR(64)

#undef HIGHBD_SMOOTH_PREDICTOR

#define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_NXM_WIDE

static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_v = vld1_u16(top_row);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));

    dst += stride;
  }
}

static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_low = vld1_u16(top_row);
  const uint16x4_t top_high = vld1_u16(top_row + 4);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);

    const uint32x4_t weighted_top_low =
        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));

    const uint32x4_t weighted_top_high =
        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
    vst1_u16(dst + 4,
             vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_V_NXM(W, H)                                \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(4, 16)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
HIGHBD_SMOOTH_V_NXM(8, 32)
#else
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_V_NXM

// For width 16 and above.
#define HIGHBD_SMOOTH_V_PREDICTOR(W)                                         \
  static void highbd_smooth_v_##W##xh_neon(                                  \
      uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row,  \
      const uint16_t *const left_column, const int height) {                 \
    const uint16_t bottom_left = left_column[height - 1];                    \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;       \
                                                                             \
    uint16x4x2_t top_vals[(W) >> 3];                                         \
    for (int i = 0; i < (W) >> 3; ++i) {                                     \
      const int x = i << 3;                                                  \
      top_vals[i].val[0] = vld1_u16(top_row + x);                            \
      top_vals[i].val[1] = vld1_u16(top_row + x + 4);                        \
    }                                                                        \
                                                                             \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                \
    for (int y = 0; y < height; ++y) {                                       \
      const uint32x4_t weighted_bl =                                         \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                    \
                                                                             \
      uint16_t *dst_x = dst;                                                 \
      for (int i = 0; i < (W) >> 3; ++i) {                                   \
        const uint32x4_t weighted_top_low =                                  \
            vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);      \
        vst1_u16(dst_x,                                                      \
                 vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                             \
        const uint32x4_t weighted_top_high =                                 \
            vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);      \
        vst1_u16(dst_x + 4,                                                  \
                 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
        dst_x += 8;                                                          \
      }                                                                      \
      dst += stride;                                                         \
    }                                                                        \
  }

HIGHBD_SMOOTH_V_PREDICTOR(16)
HIGHBD_SMOOTH_V_PREDICTOR(32)
HIGHBD_SMOOTH_V_PREDICTOR(64)

#undef HIGHBD_SMOOTH_V_PREDICTOR

#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_V_NXM_WIDE

static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *const top_row,
                                            const uint16_t *const left_column,
                                            const int height) {
  const uint16_t top_right = top_row[3];

  const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
  const uint16x4_t scaled_weights_x = negate_s8(weights_x);

  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *const top_row,
                                            const uint16_t *const left_column,
                                            const int height) {
  const uint16_t top_right = top_row[7];

  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
                                     vld1_u16(smooth_weights_u16 + 8) } };

  const uint32x4_t weighted_tr_low =
      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint16_t left_y = left_column[y];
    const uint32x4_t weighted_left_low =
        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
    vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));

    const uint32x4_t weighted_left_high =
        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
    vst1_u16(dst + 4,
             vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_H_NXM(W, H)                                \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(4, 16)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
HIGHBD_SMOOTH_H_NXM(8, 32)
#else
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_H_NXM

// For width 16 and above.
#define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
  static void highbd_smooth_h_##W##xh_neon(                                   \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
      const uint16_t *const left_column, const int height) {                  \
    const uint16_t top_right = top_row[(W)-1];                                \
                                                                              \
    uint16x4_t weights_x_low[(W) >> 3];                                       \
    uint16x4_t weights_x_high[(W) >> 3];                                      \
    uint32x4_t weighted_tr_low[(W) >> 3];                                     \
    uint32x4_t weighted_tr_high[(W) >> 3];                                    \
    for (int i = 0; i < (W) >> 3; ++i) {                                      \
      const int x = i << 3;                                                   \
      weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x);            \
      weighted_tr_low[i] =                                                    \
          vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
      weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
      weighted_tr_high[i] =                                                   \
          vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
    }                                                                         \
                                                                              \
    for (int y = 0; y < height; ++y) {                                        \
      uint16_t *dst_x = dst;                                                  \
      const uint16_t left_y = left_column[y];                                 \
      for (int i = 0; i < (W) >> 3; ++i) {                                    \
        const uint32x4_t weighted_left_low =                                  \
            vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
        vst1_u16(dst_x,                                                       \
                 vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                              \
        const uint32x4_t weighted_left_high =                                 \
            vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
        vst1_u16(dst_x + 4,                                                   \
                 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
        dst_x += 8;                                                           \
      }                                                                       \
      dst += stride;                                                          \
    }                                                                         \
  }

HIGHBD_SMOOTH_H_PREDICTOR(16)
HIGHBD_SMOOTH_H_PREDICTOR(32)
HIGHBD_SMOOTH_H_PREDICTOR(64)

#undef HIGHBD_SMOOTH_H_PREDICTOR

#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_H_NXM_WIDE

// -----------------------------------------------------------------------------
// Z1

static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };

static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
                                                               uint16x4_t a1,
                                                               int shift) {
  // The C implementation of the z1 predictor uses (32 - shift) and a right
  // shift by 5, however we instead double shift to avoid an unnecessary right
  // shift by 1.
  uint32x4_t res = vmull_n_u16(a1, shift);
  res = vmlal_n_u16(res, a0, 64 - shift);
  return vrshrn_n_u32(res, 6);
}

static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
                                                               uint16x8_t a1,
                                                               int shift) {
  return vcombine_u16(
      highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
      highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
}

// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
  14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
  12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
  10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
   8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
   6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
   4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
   2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
};
// clang-format on

static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
                                             int shuffle_idx) {
  uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
  uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
#if AOM_ARCH_AARCH64
  return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
#else
  uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
  uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
  uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
  return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
#endif
}

static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh,
                                                   const uint16_t *above,
                                                   int dx) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx > 0);

  const int max_base_x = (bw + bh) - 1;
  const int above_max = above[max_base_x];

  const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
  const int16x4_t iota1x4 = vget_low_s16(iota1x8);

  int x = dx;
  int r = 0;
  do {
    const int base = x >> 6;
    if (base >= max_base_x) {
      for (int i = r; i < bh; ++i) {
        aom_memset16(dst, above_max, bw);
        dst += stride;
      }
      return;
    }

    // The C implementation of the z1 predictor when not upsampling uses:
    // ((x & 0x3f) >> 1)
    // The right shift is unnecessary here since we instead shift by +1 later,
    // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
    const int shift = x & 0x3e;

    if (bw == 4) {
      const uint16x4_t a0 = vld1_u16(&above[base]);
      const uint16x4_t a1 = vld1_u16(&above[base + 1]);
      const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
      const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
      const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
      vst1_u16(dst, res);
    } else {
      int c = 0;
      do {
        uint16x8_t a0;
        uint16x8_t a1;
        if (base + c >= max_base_x) {
          a0 = a1 = vdupq_n_u16(above_max);
        } else {
          if (base + c + 7 >= max_base_x) {
            int shuffle_idx = max_base_x - base - c;
            a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
          } else {
            a0 = vld1q_u16(above + base + c);
          }
          if (base + c + 8 >= max_base_x) {
            int shuffle_idx = max_base_x - base - c - 1;
            a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
          } else {
            a1 = vld1q_u16(above + base + c + 1);
          }
        }

        vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
        c += 8;
      } while (c < bw);
    }

    dst += stride;
    x += dx;
  } while (++r < bh);
}

static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh,
                                                   const uint16_t *above,
                                                   int dx) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx > 0);

  const int max_base_x = ((bw + bh) - 1) << 1;
  const int above_max = above[max_base_x];

  const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
  const int16x4_t iota2x4 = vget_low_s16(iota2x8);

  int x = dx;
  int r = 0;
  do {
    const int base = x >> 5;
    if (base >= max_base_x) {
      for (int i = r; i < bh; ++i) {
        aom_memset16(dst, above_max, bw);
        dst += stride;
      }
      return;
    }

    // The C implementation of the z1 predictor when upsampling uses:
    // (((x << 1) & 0x3f) >> 1)
    // The right shift is unnecessary here since we instead shift by +1 later,
    // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
    const int shift = (x << 1) & 0x3e;

    if (bw == 4) {
      const uint16x4x2_t a01 = vld2_u16(&above[base]);
      const uint16x4_t val =
          highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
      const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
      const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
      vst1_u16(dst, res);
    } else {
      int c = 0;
      do {
        const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
        const uint16x8_t val =
            highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
        const uint16x8_t cmp =
            vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
        const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
        vst1q_u16(dst + c, res);
        c += 8;
      } while (c < bw);
    }

    dst += stride;
    x += dx;
  } while (++r < bh);
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int dx, int dy, int bd) {
  (void)left;
  (void)dy;
  (void)bd;
  assert(dy == 1);

  if (upsample_above) {
    highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
  } else {
    highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
  }
}

// -----------------------------------------------------------------------------
// Z2

#if AOM_ARCH_AARCH64
// Incrementally shift more elements from `above` into the result, merging with
// existing `left` elements.
// X0, X1, X2, X3
// Y0, X0, X1, X2
// Y0, Y1, X0, X1
// Y0, Y1, Y2, X0
// Y0, Y1, Y2, Y3
// clang-format off
static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
  {  8,  9, 10, 11, 12, 13, 14, 15 },
  {  0,  1,  8,  9, 10, 11, 12, 13 },
  {  0,  1,  2,  3,  8,  9, 10, 11 },
  {  0,  1,  2,  3,  4,  5,  8,  9 },
  {  0,  1,  2,  3,  4,  5,  6,  7 },
};
// clang-format on

// Incrementally shift more elements from `above` into the result, merging with
// existing `left` elements.
// X0, X1, X2, X3, X4, X5, X6, X7
// Y0, X0, X1, X2, X3, X4, X5, X6
// Y0, Y1, X0, X1, X2, X3, X4, X5
// Y0, Y1, Y2, X0, X1, X2, X3, X4
// Y0, Y1, Y2, Y3, X0, X1, X2, X3
// Y0, Y1, Y2, Y3, Y4, X0, X1, X2
// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
// clang-format off
static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
  { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
  {  0,  1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
  {  0,  1,  2,  3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
  {  0,  1,  2,  3,  4,  5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
  {  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23 },
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 16, 17, 18, 19, 20, 21 },
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 16, 17, 18, 19 },
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17 },
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
};
// clang-format on

// clang-format off
static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
  {      0U,      0U,      0U,      0U },
  { 0xffffU,      0U,      0U,      0U },
  { 0xffffU, 0xffffU,      0U,      0U },
  { 0xffffU, 0xffffU, 0xffffU,      0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
};
// clang-format on

// clang-format off
static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
  {      0U,      0U,      0U,      0U,      0U,      0U,      0U,      0U },
  { 0xffffU,      0U,      0U,      0U,      0U,      0U,      0U,      0U },
  { 0xffffU, 0xffffU,      0U,      0U,      0U,      0U,      0U,      0U },
  { 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U,      0U,      0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U,      0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
};
// clang-format on

static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
    const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x8_t left_indices =
      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
  left_indices = vtrn1_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
  const uint16x4_t ret = vreinterpret_u16_u8(
      vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
  return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
}

static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
    const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x8_t left_indices =
      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
  left_indices = vtrn1_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
  uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
                             vreinterpretq_u8_u16(left_data.val[1]) } };
  const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
  return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
}

static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
    const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x16_t left_indices =
      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
  left_indices = vtrn1q_u8(left_indices, left_indices);
  left_indices = vaddq_u8(left_indices, left_indices);
  left_indices =
      vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
  const uint16x8_t ret = vreinterpretq_u16_u8(
      vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
  return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
}

static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
    const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x16_t left_indices =
      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
  left_indices = vtrn1q_u8(left_indices, left_indices);
  left_indices = vaddq_u8(left_indices, left_indices);
  left_indices =
      vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
  uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
                             vreinterpretq_u8_u16(left_data.val[1]) } };
  const uint16x8_t ret =
      vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
  return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
}
#endif  // AOM_ARCH_AARCH64

static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
    const uint16_t *left, const int16x4_t indices, int n) {
  assert(n > 0);
  assert(n <= 4);
  // Load two elements at a time and then uzp them into separate vectors, to
  // reduce the number of memory accesses.
  uint32x2_t ret0_u32 = vdup_n_u32(0);
  uint32x2_t ret1_u32 = vdup_n_u32(0);

  // Use a single vget_lane_u64 to minimize vector to general purpose register
  // transfers and then mask off the bits we actually want.
  const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
  const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
  const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
  const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
  const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);

  // At time of writing both Clang and GCC produced better code with these
  // nested if-statements compared to a switch statement with fallthrough.
  load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
  if (n > 1) {
    load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
    if (n > 2) {
      load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
      if (n > 3) {
        load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
      }
    }
  }
  return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
                  vreinterpret_u16_u32(ret1_u32));
}

static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
    const uint16_t *left, const int16x8_t indices, int n) {
  assert(n > 0);
  assert(n <= 8);
  // Load two elements at a time and then uzp them into separate vectors, to
  // reduce the number of memory accesses.
  uint32x4_t ret0_u32 = vdupq_n_u32(0);
  uint32x4_t ret1_u32 = vdupq_n_u32(0);

  // Use a pair of vget_lane_u64 to minimize vector to general purpose register
  // transfers and then mask off the bits we actually want.
  const uint64_t indices0123 =
      vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
  const uint64_t indices4567 =
      vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
  const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
  const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
  const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
  const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
  const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
  const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
  const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
  const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);

  // At time of writing both Clang and GCC produced better code with these
  // nested if-statements compared to a switch statement with fallthrough.
  load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
  if (n > 1) {
    load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
    if (n > 2) {
      load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
      if (n > 3) {
        load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
        if (n > 4) {
          load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
          if (n > 5) {
            load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
            if (n > 6) {
              load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
              if (n > 7) {
                load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
              }
            }
          }
        }
      }
    }
  }
  return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
                   vreinterpretq_u16_u32(ret1_u32));
}

static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
    uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
  assert(base_shift >= 0);
  assert(base_shift <= 4);
  // On AArch64 we can permute the data from the `above` and `left` vectors
  // into a single vector in a single load (of the permute vector) + tbl.
#if AOM_ARCH_AARCH64
  const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
                                 vreinterpret_u8_u16(out_x) } };
  return vreinterpret_u16_u8(
      vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
#else
  uint16x4_t out = out_y;
  for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
    out[c2] = out_x[x_idx];
  }
  return out;
#endif
}

static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
    uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
  assert(base_shift >= 0);
  assert(base_shift <= 8);
  // On AArch64 we can permute the data from the `above` and `left` vectors
  // into a single vector in a single load (of the permute vector) + tbl.
#if AOM_ARCH_AARCH64
  const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
                                  vreinterpretq_u8_u16(out_x) } };
  return vreinterpretq_u16_u8(
      vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
#else
  uint16x8_t out = out_y;
  for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
    out[c2] = out_x[x_idx];
  }
  return out;
#endif
}

static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
    uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
  uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
  res =
      vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
  return vrshrn_n_u32(res, 5);
}

static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
    uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
  return vcombine_u16(
      highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
                                             vget_low_s16(shift)),
      highbd_dr_prediction_z2_apply_shift_x4(
          vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
}

static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
    const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
    const uint16_t *left, int dx, int dy, int r, int c) {
  const int16x4_t iota = vld1_s16(iota1_s16);

  const int x0 = (c << 6) - (r + 1) * dx;
  const int y0 = (r << 6) - (c + 1) * dy;

  const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
  const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
  const int16x4_t shift_x0123 =
      vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
  const int16x4_t shift_y0123 =
      vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
  const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);

  const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;

  // Based on the value of `base_shift` there are three possible cases to
  // compute the result:
  // 1) base_shift <= 0: We can load and operate entirely on data from the
  //                     `above` input vector.
  // 2) base_shift < vl: We can load from `above[-1]` and shift
  //                     `vl - base_shift` elements across to the end of the
  //                     vector, then compute the remainder from `left`.
  // 3) base_shift >= vl: We can load and operate entirely on data from the
  //                      `left` input vector.

  if (base_shift <= 0) {
    const int base_x = x0 >> 6;
    const uint16x4_t a0 = vld1_u16(above + base_x);
    const uint16x4_t a1 = vld1_u16(above + base_x + 1);
    return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
  } else if (base_shift < 4) {
    const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
        left + 1, base_y0123, base_shift);
    const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
        l01.val[0], l01.val[1], shift_y0123);

    // No need to reload from above in the loop, just use pre-loaded constants.
    const uint16x4_t out16_x =
        highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);

    return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
  } else {
    const uint16x4x2_t l01 =
        highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
    return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
                                                  shift_y0123);
  }
}

static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
    const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
    const uint16_t *left, int dx, int dy, int r, int c) {
  const int16x8_t iota = vld1q_s16(iota1_s16);

  const int x0 = (c << 6) - (r + 1) * dx;
  const int y0 = (r << 6) - (c + 1) * dy;

  const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
  const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
  const int16x8_t shift_x01234567 =
      vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
  const int16x8_t shift_y01234567 =
      vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
  const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);

  const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;

  // Based on the value of `base_shift` there are three possible cases to
  // compute the result:
  // 1) base_shift <= 0: We can load and operate entirely on data from the
  //                     `above` input vector.
  // 2) base_shift < vl: We can load from `above[-1]` and shift
  //                     `vl - base_shift` elements across to the end of the
  //                     vector, then compute the remainder from `left`.
  // 3) base_shift >= vl: We can load and operate entirely on data from the
  //                      `left` input vector.

  if (base_shift <= 0) {
    const int base_x = x0 >> 6;
    const uint16x8_t a0 = vld1q_u16(above + base_x);
    const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
    return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
  } else if (base_shift < 8) {
    const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
        left + 1, base_y01234567, base_shift);
    const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
        l01.val[0], l01.val[1], shift_y01234567);

    // No need to reload from above in the loop, just use pre-loaded constants.
    const uint16x8_t out16_x =
        highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);

    return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
  } else {
    const uint16x8x2_t l01 =
        highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
    return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
                                                  shift_y01234567);
  }
}

// Left array is accessed from -1 through `bh - 1` inclusive.
// Above array is accessed from -1 through `bw - 1` inclusive.
#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh)                                 \
  static void highbd_dr_prediction_z2_##bw##x##bh##_neon(                  \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int upsample_above, int upsample_left, int dx, \
      int dy, int bd) {                                                    \
    (void)bd;                                                              \
    (void)upsample_above;                                                  \
    (void)upsample_left;                                                   \
    assert(!upsample_above);                                               \
    assert(!upsample_left);                                                \
    assert(bw % 4 == 0);                                                   \
    assert(bh % 4 == 0);                                                   \
    assert(dx > 0);                                                        \
    assert(dy > 0);                                                        \
                                                                           \
    uint16_t left_data[bh + 1];                                            \
    memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t));              \
                                                                           \
    uint16x8_t a0, a1;                                                     \
    if (bw == 4) {                                                         \
      a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0));               \
      a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0));               \
    } else {                                                               \
      a0 = vld1q_u16(above - 1);                                           \
      a1 = vld1q_u16(above + 0);                                           \
    }                                                                      \
                                                                           \
    int r = 0;                                                             \
    do {                                                                   \
      if (bw == 4) {                                                       \
        vst1_u16(dst, highbd_dr_prediction_z2_step_x4(                     \
                          above, vget_low_u16(a0), vget_low_u16(a1),       \
                          left_data, dx, dy, r, 0));                       \
      } else {                                                             \
        int c = 0;                                                         \
        do {                                                               \
          vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8(              \
                                 above, a0, a1, left_data, dx, dy, r, c)); \
          c += 8;                                                          \
        } while (c < bw);                                                  \
      }                                                                    \
      dst += stride;                                                       \
    } while (++r < bh);                                                    \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
#else
HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DR_PREDICTOR_Z2_WXH

typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left,
                                            int upsample_above,
                                            int upsample_left, int dx, int dy,
                                            int bd);

static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 6 inclusive from `left`.
  // else we only need -1 through 3 inclusive.

#if AOM_ARCH_AARCH64
  uint16x8_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16(left - 2);
    left_data1 = vld1q_u16(left - 1);
  } else {
    left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
    left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
  }
#endif

  const int16x4_t iota0123 = vld1_s16(iota1_s16);
  const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);

  for (int r = 0; r < 4; ++r) {
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      uint16x4_t a0, a1;
      int16x4_t shift_x0123;
      if (upsample_above) {
        const uint16x4x2_t a01 = vld2_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
      } else {
        a0 = vld1_u16(above + base_x0);
        a1 = vld1_u16(above + base_x0 + 1);
        shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
      }
      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
    } else if (base_shift < 4) {
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
                                                       left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
                                                       left_data_base, y_iters);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x4_t out_y =
          highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);

      // Calculate X component from `above`.
      const int16x4_t shift_x0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
          1);
      uint16x4_t a0, a1;
      if (upsample_above) {
        const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
      } else {
        a0 = vld1_u16(above - 1);
        a1 = vld1_u16(above + 0);
      }
      const uint16x4_t out_x =
          highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);

      // Combine X and Y vectors.
      const uint16x4_t out =
          highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
      vst1_u16(dst, out);
    } else {
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
                                                       left_data_base, 4);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
                                                       left_data_base, 4);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif
      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
    }
    dst += stride;
  }
}

static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 14 inclusive from `left`.
  // else we only need -1 through 6 inclusive.

#if AOM_ARCH_AARCH64
  uint16x8x2_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16_x2(left - 2);
    left_data1 = vld1q_u16_x2(left - 1);
  } else {
    left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
    left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
  }
#endif

  const int16x4_t iota0123 = vld1_s16(iota1_s16);
  const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);

  for (int r = 0; r < 8; ++r) {
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      uint16x4_t a0, a1;
      int16x4_t shift_x0123;
      if (upsample_above) {
        const uint16x4x2_t a01 = vld2_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
      } else {
        a0 = vld1_u16(above + base_x0);
        a1 = vld1_u16(above + base_x0 + 1);
        shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
      }
      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
    } else if (base_shift < 4) {
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);

      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
          left_data0, base_y0123, left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
          left_data1, base_y0123, left_data_base, y_iters);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x4_t out_y =
          highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);

      // Calculate X component from `above`.
      uint16x4_t a0, a1;
      int16x4_t shift_x0123;
      if (upsample_above) {
        const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
      } else {
        a0 = vld1_u16(above - 1);
        a1 = vld1_u16(above + 0);
        shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
      }
      const uint16x4_t out_x =
          highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);

      // Combine X and Y vectors.
      const uint16x4_t out =
          highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
      vst1_u16(dst, out);
    } else {
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);

      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
                                                        left_data_base, 4);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
                                                        left_data_base, 4);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
    }
    dst += stride;
  }
}

static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 6 inclusive from `left`.
  // else we only need -1 through 3 inclusive.

#if AOM_ARCH_AARCH64
  uint16x8_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16(left - 2);
    left_data1 = vld1q_u16(left - 1);
  } else {
    left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
    left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
  }
#endif

  const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
  const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);

  for (int r = 0; r < 4; ++r) {
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x8_t x01234567 =
        vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above + base_x0);
        a1 = vld1q_u16(above + base_x0 + 1);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
    } else if (base_shift < 8) {
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data0, base_y01234567, left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data1, base_y01234567, left_data_base, y_iters);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x8_t out_y =
          highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);

      // Calculate X component from `above`.
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        const uint16x8x2_t a01 =
            vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above - 1);
        a1 = vld1q_u16(above + 0);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      const uint16x8_t out_x =
          highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);

      // Combine X and Y vectors.
      const uint16x8_t out =
          highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
      vst1q_u16(dst, out);
    } else {
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data0, base_y01234567, left_data_base, 8);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data1, base_y01234567, left_data_base, 8);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
    }
    dst += stride;
  }
}

static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 14 inclusive from `left`.
  // else we only need -1 through 6 inclusive.

#if AOM_ARCH_AARCH64
  uint16x8x2_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16_x2(left - 2);
    left_data1 = vld1q_u16_x2(left - 1);
  } else {
    left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
    left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
  }
#endif

  const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
  const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);

  for (int r = 0; r < 8; ++r) {
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x8_t x01234567 =
        vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above + base_x0);
        a1 = vld1q_u16(above + base_x0 + 1);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
    } else if (base_shift < 8) {
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data0, base_y01234567, left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data1, base_y01234567, left_data_base, y_iters);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x8_t out_y =
          highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);

      // Calculate X component from `above`.
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        const uint16x8x2_t a01 =
            vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above - 1);
        a1 = vld1q_u16(above + 0);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      const uint16x8_t out_x =
          highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);

      // Combine X and Y vectors.
      const uint16x8_t out =
          highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
      vst1q_u16(dst, out);
    } else {
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data0, base_y01234567, left_data_base, 8);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data1, base_y01234567, left_data_base, 8);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
    }
    dst += stride;
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
    &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
    NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
    &highbd_dr_prediction_z2_8x32_neon, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
    &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
    &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
  { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
    &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
    &highbd_dr_prediction_z2_32x64_neon },
  { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
    &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
};
#else
static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
    &highbd_dr_prediction_z2_4x8_neon, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, NULL,
    NULL },
  { NULL, NULL, NULL, &highbd_dr_prediction_z2_16x8_neon,
    &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon,
    NULL },
  { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_32x32_neon,
    &highbd_dr_prediction_z2_32x64_neon },
  { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x32_neon,
    &highbd_dr_prediction_z2_64x64_neon },
};
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Directional prediction, zone 2: 90 < angle < 180
void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy,
                                      int bd) {
  highbd_dr_prediction_z2_ptr f =
      dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
  assert(f != NULL);
  f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
}

// -----------------------------------------------------------------------------
// Z3

// Both the lane to the use and the shift amount must be immediates.
#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
                                       lane, shift)                       \
  do {                                                                    \
    uint32x4_t val = vmull_lane_u16((in0), (s0), (lane));                 \
    val = vmlal_lane_u16(val, (in1), (s1), (lane));                       \
    const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base));            \
    const uint16x4_t res = vrshrn_n_u32(val, (shift));                    \
    *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res,         \
                      vdup_n_u16(left_max));                              \
  } while (0)

#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
                                       lane, shift)                       \
  do {                                                                    \
    uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane));  \
    val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane));     \
    uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
    val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane));    \
    *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)),                  \
                          vrshrn_n_u32(val_hi, (shift)));                 \
  } while (0)

static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
                                             int max_ofs) {
  uint16x8_t r0;
  uint16x8_t r1;
  if (ofs + 7 >= max_ofs) {
    int shuffle_idx = max_ofs - ofs;
    r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
  } else {
    r0 = vld1q_u16(left0 + ofs);
  }
  if (ofs + 8 >= max_ofs) {
    int shuffle_idx = max_ofs - ofs - 1;
    r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
  } else {
    r1 = vld1q_u16(left0 + ofs + 1);
  }
  return (uint16x8x2_t){ { r0, r1 } };
}

static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh, const uint16_t *left,
                                                   int dy) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dy > 0);

  // Factor out left + 1 to give the compiler a better chance of recognising
  // that the offsets used for the loads from left and left + 1 are otherwise
  // identical.
  const uint16_t *left1 = left + 1;

  const int max_base_y = (bw + bh - 1);
  const int left_max = left[max_base_y];
  const int frac_bits = 6;

  const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
  const uint16x4_t iota1x4 = vget_low_u16(iota1x8);

  // The C implementation of the z3 predictor when not upsampling uses:
  // ((y & 0x3f) >> 1)
  // The right shift is unnecessary here since we instead shift by +1 later,
  // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
  const uint16x4_t shift_mask = vdup_n_u16(0x3e);

  if (bh == 4) {
    int y = dy;
    int c = 0;
    do {
      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
      // multiply instructions.
      const uint16x4_t shifts1 =
          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
      const int base0 = (y + 0 * dy) >> frac_bits;
      const int base1 = (y + 1 * dy) >> frac_bits;
      const int base2 = (y + 2 * dy) >> frac_bits;
      const int base3 = (y + 3 * dy) >> frac_bits;
      uint16x4_t out[4];
      if (base0 >= max_base_y) {
        out[0] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l00 = vld1_u16(left + base0);
        const uint16x4_t l01 = vld1_u16(left1 + base0);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
                                       shifts0, shifts1, 0, 6);
      }
      if (base1 >= max_base_y) {
        out[1] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l10 = vld1_u16(left + base1);
        const uint16x4_t l11 = vld1_u16(left1 + base1);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
                                       shifts0, shifts1, 1, 6);
      }
      if (base2 >= max_base_y) {
        out[2] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l20 = vld1_u16(left + base2);
        const uint16x4_t l21 = vld1_u16(left1 + base2);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
                                       shifts0, shifts1, 2, 6);
      }
      if (base3 >= max_base_y) {
        out[3] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l30 = vld1_u16(left + base3);
        const uint16x4_t l31 = vld1_u16(left1 + base3);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
                                       shifts0, shifts1, 3, 6);
      }
      transpose_array_inplace_u16_4x4(out);
      for (int r2 = 0; r2 < 4; ++r2) {
        vst1_u16(dst + r2 * stride + c, out[r2]);
      }
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  } else {
    int y = dy;
    int c = 0;
    do {
      int r = 0;
      do {
        // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
        // multiply instructions.
        const uint16x4_t shifts1 =
            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
        const int base0 = ((y + 0 * dy) >> frac_bits) + r;
        const int base1 = ((y + 1 * dy) >> frac_bits) + r;
        const int base2 = ((y + 2 * dy) >> frac_bits) + r;
        const int base3 = ((y + 3 * dy) >> frac_bits) + r;
        uint16x8_t out[4];
        if (base0 >= max_base_y) {
          out[0] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
                                         l0.val[1], shifts0, shifts1, 0, 6);
        }
        if (base1 >= max_base_y) {
          out[1] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
                                         l1.val[1], shifts0, shifts1, 1, 6);
        }
        if (base2 >= max_base_y) {
          out[2] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
                                         l2.val[1], shifts0, shifts1, 2, 6);
        }
        if (base3 >= max_base_y) {
          out[3] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
                                         l3.val[1], shifts0, shifts1, 3, 6);
        }
        transpose_array_inplace_u16_4x8(out);
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
        }
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
        }
        r += 8;
      } while (r < bh);
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  }
}

static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh, const uint16_t *left,
                                                   int dy) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dy > 0);

  const int max_base_y = (bw + bh - 1) << 1;
  const int left_max = left[max_base_y];
  const int frac_bits = 5;

  const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
  const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
  const uint16x4_t iota2x4 = vget_low_u16(iota2x8);

  // The C implementation of the z3 predictor when upsampling uses:
  // (((x << 1) & 0x3f) >> 1)
  // The two shifts are unnecessary here since the lowest bit is guaranteed to
  // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
  // needing the shifts at all.
  const uint16x4_t shift_mask = vdup_n_u16(0x1F);

  if (bh == 4) {
    int y = dy;
    int c = 0;
    do {
      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
      // multiply instructions.
      const uint16x4_t shifts1 =
          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
      const int base0 = (y + 0 * dy) >> frac_bits;
      const int base1 = (y + 1 * dy) >> frac_bits;
      const int base2 = (y + 2 * dy) >> frac_bits;
      const int base3 = (y + 3 * dy) >> frac_bits;
      const uint16x4x2_t l0 = vld2_u16(left + base0);
      const uint16x4x2_t l1 = vld2_u16(left + base1);
      const uint16x4x2_t l2 = vld2_u16(left + base2);
      const uint16x4x2_t l3 = vld2_u16(left + base3);
      uint16x4_t out[4];
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
                                     l0.val[1], shifts0, shifts1, 0, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
                                     l1.val[1], shifts0, shifts1, 1, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
                                     l2.val[1], shifts0, shifts1, 2, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
                                     l3.val[1], shifts0, shifts1, 3, 5);
      transpose_array_inplace_u16_4x4(out);
      for (int r2 = 0; r2 < 4; ++r2) {
        vst1_u16(dst + r2 * stride + c, out[r2]);
      }
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  } else {
    assert(bh % 8 == 0);

    int y = dy;
    int c = 0;
    do {
      int r = 0;
      do {
        // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
        // multiply instructions.
        const uint16x4_t shifts1 =
            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
        const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
        const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
        const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
        const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
        const uint16x8x2_t l0 = vld2q_u16(left + base0);
        const uint16x8x2_t l1 = vld2q_u16(left + base1);
        const uint16x8x2_t l2 = vld2q_u16(left + base2);
        const uint16x8x2_t l3 = vld2q_u16(left + base3);
        uint16x8_t out[4];
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
                                       l0.val[1], shifts0, shifts1, 0, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
                                       l1.val[1], shifts0, shifts1, 1, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
                                       l2.val[1], shifts0, shifts1, 2, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
                                       l3.val[1], shifts0, shifts1, 3, 5);
        transpose_array_inplace_u16_4x8(out);
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
        }
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
        }
        r += 8;
      } while (r < bh);
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  }
}

// Directional prediction, zone 3: 180 < angle < 270
void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_left,
                                      int dx, int dy, int bd) {
  (void)above;
  (void)dx;
  (void)bd;
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx == 1);
  assert(dy > 0);

  if (upsample_left) {
    highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
  } else {
    highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
  }
}

#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.71 Sekunden (vorverarbeitet am 2026-04-26) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.