/*
 * Source listing: highbd_fwd_txfm_neon.c (C) — from Mozilla Firefox 136.0.1,
 * third_party/aom/av1/encoder/arm/; file dated 2025-02-10, approx. 95 kB.
 */

 
/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */


#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"

static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
                                                        int32x4_t *out) {
  // This is not quite the same as the other transposes defined in
  // transpose_neon.h: only the low 64x32 sub-matrix is written, since the
  // remaining rows are unused by the following row transform. The 4x4 tiles
  // are independent, so the iteration order over them is arbitrary.
  for (int col = 0; col < 16; ++col) {
    for (int row = 0; row < 8; ++row) {
      transpose_arrays_s32_4x4(in + 64 * col + 4 * row,
                               out + 64 * row + 4 * col);
    }
  }
}

// A note on butterfly helper naming:
//
// butterfly_[weight_indices]_neon
// e.g. butterfly_0312_neon
//                ^ Weights are applied as indices 0, 3, 2, 1
//                  (see more detail below)
//
// Weight indices are treated as an index into the 4-tuple of the weight
// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
// This is then represented in the helper naming by referring to the lane index
// in the loaded tuple that each multiply is performed with:
//
//         in0   in1
//      /------------
// out0 |  w[0]  w[1]   ==>  out0 = in0 * w[0] + in1 * w[1]
// out1 |  w[2]  w[3]   ==>  out1 = in0 * w[2] + in1 * w[3]
//
// So for indices 0321 from the earlier example, we end up with:
//
//          in0       in1
//      /------------------
// out0 | (lane 0) (lane 3)   ==>  out0 = in0 *  w0 + in1 * (w0-1)
// out1 | (lane 2) (lane 1)   ==>  out1 = in0 * -w0 + in1 * (1-w0)

// One half of a weighted butterfly:
//   *out = round_shift(in0 * t[lane0] + in1 * t[lane1], bit)
// where t = (w0, w1, -w0, -w1) and (w0, w1) is the pair held in `wvec`.
// `wvecs` packs (w0, w1) and its negation so that lane/2 selects the
// vector and lane%2 the element, realizing the 4-entry tuple described in
// the naming note above. NOTE: function-like macro — arguments `wvec`,
// `in0`, `in1` are evaluated once, but `lane0`/`lane1` must be constants
// (they feed vmulq_lane_s32/vmlaq_lane_s32 immediates).
#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit)   \
  do {                                                                  \
    int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } };                   \
    int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
    x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2);        \
    *out = vrshlq_s32(x, v_bit);                                        \
  } while (false)

static AOM_FORCE_INLINE void butterfly_0112_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  // With (w0, w1) = cospi[2*widx0 .. 2*widx0+1]:
  //   *out0 = round_shift(n0 * w0 + n1 * w1, bit)
  //   *out1 = round_shift(n0 * w1 - n1 * w0, bit)
  // The two half-butterflies are independent, so their order is arbitrary.
  const int32x2_t weights = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(weights, 1, 2, n0, n1, out1, v_bit);
  butterfly_half_neon(weights, 0, 1, n0, n1, out0, v_bit);
}

static AOM_FORCE_INLINE void butterfly_2312_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  // With (w0, w1) = cospi[2*widx0 .. 2*widx0+1]:
  //   *out0 = round_shift(-n0 * w0 - n1 * w1, bit)
  //   *out1 = round_shift( n0 * w1 - n1 * w0, bit)
  const int32x2_t weights = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(weights, 1, 2, n0, n1, out1, v_bit);
  butterfly_half_neon(weights, 2, 3, n0, n1, out0, v_bit);
}

static AOM_FORCE_INLINE void butterfly_0332_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  // With (w0, w1) = cospi[2*widx0 .. 2*widx0+1]:
  //   *out0 = round_shift( n0 * w0 - n1 * w1, bit)
  //   *out1 = round_shift(-n0 * w1 - n1 * w0, bit)
  const int32x2_t weights = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(weights, 3, 2, n0, n1, out1, v_bit);
  butterfly_half_neon(weights, 0, 3, n0, n1, out0, v_bit);
}

static AOM_FORCE_INLINE void butterfly_0130_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  // With (w0, w1) = cospi[2*widx0 .. 2*widx0+1]:
  //   *out0 = round_shift( n0 * w0 + n1 * w1, bit)
  //   *out1 = round_shift(-n0 * w1 + n1 * w0, bit)
  const int32x2_t weights = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(weights, 3, 0, n0, n1, out1, v_bit);
  butterfly_half_neon(weights, 0, 1, n0, n1, out0, v_bit);
}

static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  // Specialization with w0 = cospi[2*32]:
  //   *out0 = round_shift(w0 * (n0 + n1), bit)
  //   *out1 = round_shift(w0 * (n0 - n1), bit)
  const int32x2_t weights = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(weights, 0, 2, n0, n1, out1, v_bit);
  butterfly_half_neon(weights, 0, 0, n0, n1, out0, v_bit);
}

static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  // Specialization with w0 = cospi[2*32]:
  //   *out0 = round_shift( w0 * (n0 - n1), bit)
  //   *out1 = round_shift(-w0 * (n0 + n1), bit)
  const int32x2_t weights = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(weights, 2, 2, n0, n1, out1, v_bit);
  butterfly_half_neon(weights, 0, 2, n0, n1, out0, v_bit);
}

// Apply the rectangular-transform *sqrt(2) correction to `size` vectors:
// output[i] = round_shift(input[i] * NewSqrt2, NewSqrt2Bits).
// `input` and `output` may be the same array (pure element-wise map).
static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
                                                       int32x4_t *output,
                                                       const int size) {
  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
  // A for loop instead of do/while: the original dereferenced input[0] even
  // when size == 0 (out-of-bounds read); identical for all size >= 1.
  for (int i = 0; i < size; ++i) {
    const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
  }
}

// As round_rect_array_s32_neon, but with an additional rounding right shift
// by 2 applied first: output[i] = round_shift(round_shift(input[i], 2) *
// NewSqrt2, NewSqrt2Bits). `input` and `output` may alias.
static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
    const int32x4_t *input, int32x4_t *output, const int size) {
  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
  // A for loop instead of do/while: the original dereferenced input[0] even
  // when size == 0 (out-of-bounds read); identical for all size >= 1.
  for (int i = 0; i < size; ++i) {
    const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
    const int32x4_t r1 = vmulq_s32(r0, sqrt2);
    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
  }
}

// Defines load_buffer_4x##h(): loads h rows of 4 int16 samples, optionally
// reversed left-to-right (fliplr), widening each to int32 with a left shift
// of 2 (the forward-transform input up-scaling used for 4-wide blocks).
#define LOAD_BUFFER_4XH(h)                                           \
  static AOM_FORCE_INLINE void load_buffer_4x##h(                    \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        a = vrev64_s16(a);                                           \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    }                                                                \
  }

// AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to
// avoid the expression even though the compiler can prove that the code path
// is never taken if `shift == 0`.
// The inner `(shift) == 0 ? 1 : (shift)` keeps the dead vshll_n_s16 arm
// compiling with a valid immediate; only vmovl_s16 is executed in that case.
#define shift_left_long_s16(a, shift) \
  ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))

// Defines load_buffer_##w##x##h(): loads an h-row by w-column tile of int16
// samples, widening to int32 with a left shift of `shift`. Output layout is
// column-grouped: in[i + h*(2*j + k)] holds row i, columns 8*j + 4*k ..
// 8*j + 4*k + 3 (k = 0 or 1), i.e. h consecutive vectors per 4-column group.
// With fliplr, each 8-wide load is reversed via vrev64q_s16 (which reverses
// within each 64-bit half), then the halves are swapped and the group index
// mirrored (j2 = w/8 - j - 1) to produce a full left-right reversal.
#define LOAD_BUFFER_WXH(w, h, shift)                                 \
  static AOM_FORCE_INLINE void load_buffer_##w##x##h(                \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    assert(w >= 8);                                                  \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          a = vrev64q_s16(a);                                        \
          int j2 = (w) / 8 - j - 1;                                  \
          in[i + (h) * (2 * j2 + 0)] =                               \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
          in[i + (h) * (2 * j2 + 1)] =                               \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
        }                                                            \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          in[i + (h) * (2 * j + 0)] =                                \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
          in[i + (h) * (2 * j + 1)] =                                \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
        }                                                            \
      }                                                              \
    }                                                                \
  }

// Instantiate the loaders used by the transforms in this file. The third
// LOAD_BUFFER_WXH argument is the input up-shift, which varies per block
// size (0 for the 64-point cases below, 2 otherwise).
LOAD_BUFFER_4XH(4)
LOAD_BUFFER_4XH(8)
LOAD_BUFFER_4XH(16)
LOAD_BUFFER_4XH(32)
LOAD_BUFFER_WXH(8, 8, 2)
LOAD_BUFFER_WXH(16, 16, 2)
LOAD_BUFFER_WXH(32, 64, 0)
LOAD_BUFFER_WXH(64, 32, 2)
LOAD_BUFFER_WXH(64, 64, 0)

#if !CONFIG_REALTIME_ONLY
LOAD_BUFFER_WXH(16, 64, 0)
LOAD_BUFFER_WXH(64, 16, 2)
#endif  // !CONFIG_REALTIME_ONLY

// Defines store_buffer_##w##x##h(): writes a w-row by h-column tile from the
// column-grouped register layout (in[i + j*w] holds elements (i, 4j..4j+3))
// back to a row-major int32 coefficient buffer with the given stride.
#define STORE_BUFFER_WXH(w, h)                                \
  static AOM_FORCE_INLINE void store_buffer_##w##x##h(        \
      const int32x4_t *in, int32_t *out, int stride) {        \
    for (int i = 0; i < (w); ++i) {                           \
      for (int j = 0; j < (h) / 4; ++j) {                     \
        vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
      }                                                       \
    }                                                         \
  }

// Instantiate the stores used by the transforms in this file.
STORE_BUFFER_WXH(4, 4)
STORE_BUFFER_WXH(8, 4)
STORE_BUFFER_WXH(8, 8)
STORE_BUFFER_WXH(16, 4)
STORE_BUFFER_WXH(16, 16)
STORE_BUFFER_WXH(32, 4)
STORE_BUFFER_WXH(32, 32)
STORE_BUFFER_WXH(64, 32)

#if !CONFIG_REALTIME_ONLY
STORE_BUFFER_WXH(16, 32)
STORE_BUFFER_WXH(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 4-point forward DCT on four lanes in parallel (one column per lane).
// `bit` selects the cospi table precision and the final rounding shift.
// `in` and `out` may alias: every read of `in` happens before any write.
static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
  const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // Stage 1: butterflies of mirrored inputs.
  const int32x4_t sum03 = vaddq_s32(in[0], in[3]);
  const int32x4_t dif03 = vsubq_s32(in[0], in[3]);
  const int32x4_t sum12 = vaddq_s32(in[1], in[2]);
  const int32x4_t dif12 = vsubq_s32(in[1], in[2]);

  // Stage 2: weight by cospi32 / cospi48 (lane 1 of cospi16_48).
  const int32x4_t t0 = vmulq_s32(sum03, cospi32);
  const int32x4_t t1 = vmulq_lane_s32(dif03, cospi16_48, 1);
  const int32x4_t t2 = vmulq_s32(sum12, cospi32);
  const int32x4_t t3 = vmulq_lane_s32(dif12, cospi16_48, 1);

  const int32x4_t e0 = vaddq_s32(t0, t2);
  const int32x4_t e1 = vsubq_s32(t0, t2);
  const int32x4_t e2 = vmlaq_lane_s32(t3, dif03, cospi16_48, 0);
  const int32x4_t e3 = vmlsq_lane_s32(t1, dif12, cospi16_48, 0);

  // Round-shift and write out in bit-reversed coefficient order.
  out[0] = vrshlq_s32(e0, v_bit);
  out[1] = vrshlq_s32(e2, v_bit);
  out[2] = vrshlq_s32(e1, v_bit);
  out[3] = vrshlq_s32(e3, v_bit);
}

// 4-point forward ADST on four lanes in parallel. `in` and `out` may
// alias: every read of `in` happens before any write to `out`.
static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  // sinpi_arr(bit) + 1 gives four sinpi weights; low/high halves split them
  // into two lane pairs (presumably sinpi[1..2] and sinpi[3..4] — matches
  // the lane indexing below).
  const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);
  const int32x2_t sinpi_lo = vget_low_s32(sinpi);
  const int32x2_t sinpi_hi = vget_high_s32(sinpi);

  const int32x4_t x01 = vaddq_s32(in[0], in[1]);
  const int32x4_t s0 = vmulq_lane_s32(in[0], sinpi_lo, 0);
  const int32x4_t s1 = vmulq_lane_s32(in[0], sinpi_hi, 1);
  const int32x4_t s2 = vmulq_lane_s32(in[2], sinpi_hi, 0);

  const int32x4_t t0 = vmlaq_lane_s32(s0, in[1], sinpi_lo, 1);
  const int32x4_t t1 = vmlsq_lane_s32(s1, in[1], sinpi_lo, 0);
  const int32x4_t t2 = vsubq_s32(x01, in[3]);

  const int32x4_t u0 = vmlaq_lane_s32(t0, in[3], sinpi_hi, 1);
  const int32x4_t u1 = vmlaq_lane_s32(t1, in[3], sinpi_lo, 1);
  const int32x4_t u2 = vmulq_lane_s32(t2, sinpi_hi, 0);

  const int32x4_t y0 = vaddq_s32(u0, s2);
  const int32x4_t y2 = vsubq_s32(u1, s2);
  const int32x4_t y3 = vaddq_s32(vsubq_s32(u1, u0), s2);

  const int32x4_t v_bit = vdupq_n_s32(-bit);
  out[0] = vrshlq_s32(y0, v_bit);
  out[1] = vrshlq_s32(u2, v_bit);
  out[2] = vrshlq_s32(y2, v_bit);
  out[3] = vrshlq_s32(y3, v_bit);
}

// Identity transform for 4-point blocks: scale by sqrt(2) in fixed point,
// i.e. out[i] = round_shift(in[i] * NewSqrt2, NewSqrt2Bits). `bit` is
// unused (kept for signature parity with the other 1D kernels).
static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  (void)bit;
  const int32x4_t scale = vdupq_n_s32(NewSqrt2);

  for (int c = 0; c < 4; ++c) {
    out[c] = vrshrq_n_s32(vmulq_s32(in[c], scale), NewSqrt2Bits);
  }
}

// 4x4 high-bitdepth forward 2D transform: column transform, transpose, row
// transform, store. The 16 tx_type cases all share that pipeline and differ
// only in the two 1D kernels, the rounding bits they are given, and whether
// the input is mirrored left-right, so the dispatch is factored out here.
void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
                             int input_stride, TX_TYPE tx_type, int bd) {
  (void)bd;

  typedef void (*tx_1d_fn)(const int32x4_t *, int32x4_t *, int);
  const int col_bit = av1_fwd_cos_bit_col[0][0];
  const int row_bit = av1_fwd_cos_bit_row[0][0];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);

  // Per-case selections. The bit arguments reproduce the original per-case
  // table lookups exactly, including the cases that used the row-bit table
  // for the first pass (H_*, V_FLIPADST) — presumably both tables hold the
  // same value for 4x4, but the original indexing is preserved regardless.
  tx_1d_fn first_tx, second_tx;
  int first_bit = col_bit;
  int second_bit = row_bit;
  int fliplr = 0;
  switch (tx_type) {
    case DCT_DCT:
      first_tx = highbd_fdct4_x4_neon;
      second_tx = highbd_fdct4_x4_neon;
      break;
    case ADST_DCT:
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fdct4_x4_neon;
      break;
    case DCT_ADST:
      first_tx = highbd_fdct4_x4_neon;
      second_tx = highbd_fadst4_x4_neon;
      break;
    case ADST_ADST:
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fadst4_x4_neon;
      break;
    case FLIPADST_DCT:
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fdct4_x4_neon;
      break;
    case DCT_FLIPADST:
      fliplr = 1;
      first_tx = highbd_fdct4_x4_neon;
      second_tx = highbd_fadst4_x4_neon;
      break;
    case FLIPADST_FLIPADST:
      fliplr = 1;
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fadst4_x4_neon;
      break;
    case ADST_FLIPADST:
      fliplr = 1;
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fadst4_x4_neon;
      break;
    case FLIPADST_ADST:
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fadst4_x4_neon;
      break;
    case IDTX:
      first_tx = highbd_fidentity4_x4_neon;
      second_tx = highbd_fidentity4_x4_neon;
      break;
    case V_DCT:
      first_tx = highbd_fdct4_x4_neon;
      second_tx = highbd_fidentity4_x4_neon;
      break;
    case H_DCT:
      first_tx = highbd_fidentity4_x4_neon;
      first_bit = row_bit;
      second_tx = highbd_fdct4_x4_neon;
      second_bit = col_bit;
      break;
    case V_ADST:
      first_tx = highbd_fadst4_x4_neon;
      second_tx = highbd_fidentity4_x4_neon;
      break;
    case H_ADST:
      first_tx = highbd_fidentity4_x4_neon;
      first_bit = row_bit;
      second_tx = highbd_fadst4_x4_neon;
      second_bit = col_bit;
      break;
    case V_FLIPADST:
      first_tx = highbd_fadst4_x4_neon;
      first_bit = row_bit;
      second_tx = highbd_fidentity4_x4_neon;
      break;
    case H_FLIPADST:
      fliplr = 1;
      first_tx = highbd_fidentity4_x4_neon;
      first_bit = row_bit;
      second_tx = highbd_fadst4_x4_neon;
      second_bit = row_bit;
      break;
    default: assert(0); return;
  }

  // Workspace for column/row-wise transforms.
  int32x4_t buf[4];

  load_buffer_4x4(input, buf, input_stride, fliplr);
  first_tx(buf, buf, first_bit);
  transpose_arrays_s32_4x4(buf, buf);
  second_tx(buf, buf, second_bit);
  store_buffer_4x4(buf, coeff, /*stride=*/4);
}

// Butterfly pre-processing:
// e.g. n=4:
//   out[0] = in[0] + in[3]
//   out[1] = in[1] + in[2]
//   out[2] = in[1] - in[2]
//   out[3] = in[0] - in[3]

// First n/2 outputs are mirrored sums, last n/2 are mirrored differences.
// NOTE(review): `output` must not alias `input` — the second loop reads
// input[n/2 - i - 1], which the first loop would already have overwritten.
// All callers in this file pass distinct arrays.
static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
                                               int32x4_t *output, int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vaddq_s32(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

// Butterfly post-processing:
// e.g. n=8:
//   out[0] = in0[0] + in1[3];
//   out[1] = in0[1] + in1[2];
//   out[2] = in0[1] - in1[2];
//   out[3] = in0[0] - in1[3];
//   out[4] = in0[7] - in1[4];
//   out[5] = in0[6] - in1[5];
//   out[6] = in0[6] + in1[5];
//   out[7] = in0[7] + in1[4];

// `in0` and `in1` may be the same array (several callers pass e.g. v + 8 for
// both), but `output` must be distinct from both inputs since later loops
// re-read input elements whose indices earlier loops wrote to.
static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
                                                const int32x4_t *in1,
                                                int32x4_t *output, int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

// 8-point forward DCT applied to 4 lanes in parallel (4 columns at a time).
// `bit` selects the cospi table precision and the rounding applied inside
// each weighted butterfly. `in` and `out` may alias: `in` is only read in
// stage 1, before any write to `out`.
static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // stage 1
  int32x4_t a[8];
  butterfly_dct_pre(in, a, 8);

  // stage 2
  int32x4_t b[8];
  butterfly_dct_pre(a, b, 4);
  butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);

  // stage 3
  int32x4_t c[8];
  butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
  butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
  butterfly_dct_post(a + 4, b + 4, c + 4, 4);

  // stage 4-5
  butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
  butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);

  // Even-index outputs come straight from stage 3 (bit-reversed positions).
  out[0] = c[0];
  out[2] = c[2];
  out[4] = c[1];
  out[6] = c[3];
}

// 8-point forward ADST applied to 4 lanes in parallel. `in` and `out` may
// alias: `in` is fully read in stage 0-1 before any write to `out`.
static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;

  // stage 0-1: input permutation.
  u0 = in[0];
  u1 = in[7];
  u2 = in[3];
  u3 = in[4];
  u4 = in[1];
  u5 = in[6];
  u6 = in[2];
  u7 = in[5];

  // stage 2: cospi32 butterflies on the middle pairs.
  v0 = u0;
  v1 = u1;
  butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
  v4 = u4;
  v5 = u5;
  butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);

  // stage 3: unweighted butterflies.
  u0 = vaddq_s32(v0, v2);
  u1 = vsubq_s32(v3, v1);
  u2 = vsubq_s32(v0, v2);
  u3 = vaddq_s32(v1, v3);
  u4 = vsubq_s32(v6, v4);
  u5 = vaddq_s32(v5, v7);
  u6 = vaddq_s32(v4, v6);
  u7 = vsubq_s32(v5, v7);

  // stage 4: cospi16 butterflies on the upper half.
  v0 = u0;
  v1 = u1;
  v2 = u2;
  v3 = u3;

  butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);

  // stage 5: unweighted butterflies across the halves.
  u0 = vaddq_s32(v0, v4);
  u1 = vaddq_s32(v1, v5);
  u2 = vaddq_s32(v2, v6);
  u3 = vsubq_s32(v7, v3);
  u4 = vsubq_s32(v0, v4);
  u5 = vsubq_s32(v1, v5);
  u6 = vsubq_s32(v2, v6);
  u7 = vaddq_s32(v3, v7);

  // stage 6: final odd-cospi butterflies.
  butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
  butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
  butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);

  // stage 7: output permutation.
  out[0] = v1;
  out[1] = v6;
  out[2] = v3;
  out[3] = v4;
  out[4] = v5;
  out[5] = v2;
  out[6] = v7;
  out[7] = v0;
}

// Identity transform for 8-point blocks: the scale factor is exactly 2, so
// it reduces to a left shift by one bit. `bit` is unused (kept for
// signature parity with the other 1D kernels).
static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  (void)bit;

  for (int i = 0; i < 8; ++i) {
    out[i] = vshlq_n_s32(in[i], 1);
  }
}

// Apply the 8-point DCT to `howmany` adjacent 4-lane groups (stride of 8
// vectors per group). Executes at least once, like the original do/while.
static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit,
                                                  int howmany) {
  const int32x4_t *src = in;
  int32x4_t *dst = out;
  int remaining = howmany;
  do {
    highbd_fdct8_x4_neon(src, dst, bit);
    src += 8;
    dst += 8;
  } while (--remaining > 0);
}

// Apply the 8-point ADST to `howmany` adjacent 4-lane groups (stride of 8
// vectors per group). Executes at least once, like the original do/while.
static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit,
                                                   int howmany) {
  const int32x4_t *src = in;
  int32x4_t *dst = out;
  int remaining = howmany;
  do {
    highbd_fadst8_x4_neon(src, dst, bit);
    src += 8;
    dst += 8;
  } while (--remaining > 0);
}

// Apply the 8-point identity transform to `howmany` adjacent 4-lane groups
// (stride of 8 vectors per group). `bit` is ignored by the kernel but kept
// for signature parity. Executes at least once, like the original do/while.
static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
                                                       int32x4_t *out, int bit,
                                                       int howmany) {
  (void)bit;
  const int32x4_t *src = in;
  int32x4_t *dst = out;
  int remaining = howmany;
  do {
    highbd_fidentity8_x4_neon(src, dst, bit);
    src += 8;
    dst += 8;
  } while (--remaining > 0);
}

// 8x8 high-bitdepth forward 2D transform: column transform, intermediate
// rounding shift, transpose, row transform, store. All 16 tx_type cases
// share that pipeline and differ only in the two 1D kernels, the rounding
// bits they are given, and whether the input is mirrored left-right.
void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;

  typedef void (*tx_1d_fn)(const int32x4_t *, int32x4_t *, int, int);
  const int col_bit = av1_fwd_cos_bit_col[1][1];
  const int row_bit = av1_fwd_cos_bit_row[1][1];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Per-case selections. The bit arguments reproduce the original per-case
  // table lookups exactly: the IDTX/V_*/H_* cases used the column table for
  // the row pass as well — presumably both tables hold the same value for
  // 8x8, but the original indexing is preserved regardless.
  tx_1d_fn col_fn, row_fn;
  int fliplr = 0;
  int rbit = row_bit;
  switch (tx_type) {
    case DCT_DCT:
      col_fn = highbd_fdct8_xn_neon;
      row_fn = highbd_fdct8_xn_neon;
      break;
    case ADST_DCT:
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fdct8_xn_neon;
      break;
    case DCT_ADST:
      col_fn = highbd_fdct8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      break;
    case ADST_ADST:
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      break;
    case FLIPADST_DCT:
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fdct8_xn_neon;
      break;
    case DCT_FLIPADST:
      fliplr = 1;
      col_fn = highbd_fdct8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      break;
    case FLIPADST_FLIPADST:
      fliplr = 1;
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      break;
    case ADST_FLIPADST:
      fliplr = 1;
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      break;
    case FLIPADST_ADST:
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      break;
    case IDTX:
      col_fn = highbd_fidentity8_xn_neon;
      row_fn = highbd_fidentity8_xn_neon;
      rbit = col_bit;
      break;
    case V_DCT:
      col_fn = highbd_fdct8_xn_neon;
      row_fn = highbd_fidentity8_xn_neon;
      rbit = col_bit;
      break;
    case H_DCT:
      col_fn = highbd_fidentity8_xn_neon;
      row_fn = highbd_fdct8_xn_neon;
      rbit = col_bit;
      break;
    case V_ADST:
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fidentity8_xn_neon;
      rbit = col_bit;
      break;
    case H_ADST:
      col_fn = highbd_fidentity8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      rbit = col_bit;
      break;
    case V_FLIPADST:
      col_fn = highbd_fadst8_xn_neon;
      row_fn = highbd_fidentity8_xn_neon;
      rbit = col_bit;
      break;
    case H_FLIPADST:
      fliplr = 1;
      col_fn = highbd_fidentity8_xn_neon;
      row_fn = highbd_fadst8_xn_neon;
      rbit = col_bit;
      break;
    default: assert(0); return;
  }

  // Workspaces for column/row-wise transforms (2 groups of 4 lanes each).
  int32x4_t buf0[16], buf1[16];

  load_buffer_8x8(input, buf0, stride, fliplr);
  col_fn(buf0, buf0, col_bit, 2);
  shift_right_1_round_s32_x4(buf0, buf0, 16);
  transpose_arrays_s32_8x8(buf0, buf1);
  row_fn(buf1, buf1, rbit, 2);
  store_buffer_8x8(buf1, coeff, /*stride=*/8);
}

// 16-point forward DCT applied to 4 lanes in parallel. `bit` selects the
// cospi table precision and the rounding applied inside each weighted
// butterfly. `in` and `out` may alias: `in` is only read in stage 1.
// Note the in-place-style butterfly_dct_post calls below pass the same
// array for both inputs (e.g. v + 8, v + 8) with a distinct output.
static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                  int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 1
  butterfly_dct_pre(in, u, 16);

  // stage 2
  butterfly_dct_pre(u, v, 8);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
  butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
  v[14] = u[14];
  v[15] = u[15];

  // stage 3
  butterfly_dct_pre(v, u, 4);
  u[4] = v[4];
  butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
  u[7] = v[7];
  butterfly_dct_post(v + 8, v + 8, u + 8, 8);

  // stage 4
  butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
  butterfly_dct_post(u + 4, u + 4, v + 4, 4);
  v[8] = u[8];
  butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
  butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
  v[11] = u[11];
  v[12] = u[12];
  v[15] = u[15];

  // stage 5
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];
  butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
  butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
  butterfly_dct_post(v + 8, v + 8, u + 8, 4);
  butterfly_dct_post(v + 12, v + 12, u + 12, 4);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];
  butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
  butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
  butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
  butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);

  // Final write-out in bit-reversed coefficient order.
  out[0] = v[0];
  out[1] = v[8];
  out[2] = v[4];
  out[3] = v[12];
  out[4] = v[2];
  out[5] = v[10];
  out[6] = v[6];
  out[7] = v[14];
  out[8] = v[1];
  out[9] = v[9];
  out[10] = v[5];
  out[11] = v[13];
  out[12] = v[3];
  out[13] = v[11];
  out[14] = v[7];
  out[15] = v[15];
}

// Forward 16-point ADST applied to four columns at once (one column per lane
// of each int32x4_t; in/out are arrays of 16 vectors).
// `bit` selects the cosine table precision and the rounding shift used inside
// the butterfly helpers (negated for vshlq).
static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                   int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // Ping-pong workspaces between stages.
  int32x4_t u[16], v[16];

  // stage 0-1: input permutation feeding the butterfly network.
  u[0] = in[0];
  u[1] = in[15];
  u[2] = in[7];
  u[3] = in[8];
  u[4] = in[3];
  u[5] = in[12];
  u[6] = in[4];
  u[7] = in[11];
  u[8] = in[1];
  u[9] = in[14];
  u[10] = in[6];
  u[11] = in[9];
  u[12] = in[2];
  u[13] = in[13];
  u[14] = in[5];
  u[15] = in[10];

  // stage 2
  v[0] = u[0];
  v[1] = u[1];
  butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
  v[4] = u[4];
  v[5] = u[5];
  butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
  v[12] = u[12];
  v[13] = u[13];
  butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);

  // stage 3
  u[0] = vaddq_s32(v[0], v[2]);
  u[1] = vsubq_s32(v[3], v[1]);
  u[2] = vsubq_s32(v[0], v[2]);
  u[3] = vaddq_s32(v[1], v[3]);
  u[4] = vsubq_s32(v[6], v[4]);
  u[5] = vaddq_s32(v[5], v[7]);
  u[6] = vaddq_s32(v[4], v[6]);
  u[7] = vsubq_s32(v[5], v[7]);
  u[8] = vsubq_s32(v[10], v[8]);
  u[9] = vaddq_s32(v[9], v[11]);
  u[10] = vaddq_s32(v[8], v[10]);
  u[11] = vsubq_s32(v[9], v[11]);
  u[12] = vaddq_s32(v[12], v[14]);
  u[13] = vsubq_s32(v[15], v[13]);
  u[14] = vsubq_s32(v[12], v[14]);
  u[15] = vaddq_s32(v[13], v[15]);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
  butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 5
  u[0] = vaddq_s32(v[0], v[4]);
  u[1] = vaddq_s32(v[1], v[5]);
  u[2] = vaddq_s32(v[2], v[6]);
  u[3] = vsubq_s32(v[7], v[3]);
  u[4] = vsubq_s32(v[0], v[4]);
  u[5] = vsubq_s32(v[1], v[5]);
  u[6] = vsubq_s32(v[2], v[6]);
  u[7] = vaddq_s32(v[3], v[7]);
  u[8] = vaddq_s32(v[8], v[12]);
  u[9] = vaddq_s32(v[9], v[13]);
  u[10] = vsubq_s32(v[14], v[10]);
  u[11] = vaddq_s32(v[11], v[15]);
  u[12] = vsubq_s32(v[8], v[12]);
  u[13] = vsubq_s32(v[9], v[13]);
  u[14] = vaddq_s32(v[10], v[14]);
  u[15] = vsubq_s32(v[11], v[15]);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
  butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);

  // stage 7
  u[0] = vaddq_s32(v[0], v[8]);
  u[1] = vaddq_s32(v[1], v[9]);
  u[2] = vaddq_s32(v[2], v[10]);
  u[3] = vaddq_s32(v[3], v[11]);
  u[4] = vaddq_s32(v[4], v[12]);
  u[5] = vaddq_s32(v[5], v[13]);
  u[6] = vaddq_s32(v[6], v[14]);
  u[7] = vsubq_s32(v[15], v[7]);
  u[8] = vsubq_s32(v[0], v[8]);
  u[9] = vsubq_s32(v[1], v[9]);
  u[10] = vsubq_s32(v[2], v[10]);
  u[11] = vsubq_s32(v[3], v[11]);
  u[12] = vsubq_s32(v[4], v[12]);
  u[13] = vsubq_s32(v[5], v[13]);
  u[14] = vsubq_s32(v[6], v[14]);
  u[15] = vaddq_s32(v[7], v[15]);

  // stage 8: final rotations by the odd cospi constants.
  butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
  butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
  butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
  butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 9: output permutation.
  out[0] = v[1];
  out[1] = v[14];
  out[2] = v[3];
  out[3] = v[12];
  out[4] = v[5];
  out[5] = v[10];
  out[6] = v[7];
  out[7] = v[8];
  out[8] = v[9];
  out[9] = v[6];
  out[10] = v[11];
  out[11] = v[4];
  out[12] = v[13];
  out[13] = v[2];
  out[14] = v[15];
  out[15] = v[0];
}

// Identity 16-point transform on four columns: a pure fixed-point scale,
// out[i] = (in[i] * 2*NewSqrt2 + round) >> NewSqrt2Bits. The `bit` argument
// exists only to match the other 1D kernel signatures and is ignored.
static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit) {
  (void)bit;
  const int32x4_t scale = vdupq_n_s32(2 * NewSqrt2);
  const int32x4_t rounding = vdupq_n_s32(1 << (NewSqrt2Bits - 1));

  int i = 0;
  do {
    const int32x4_t scaled = vmulq_s32(in[i], scale);
    out[i] = vshrq_n_s32(vaddq_s32(scaled, rounding), NewSqrt2Bits);
  } while (++i < 16);
}

// Run the 4-column 16-point DCT on `howmany` consecutive 4-column groups.
// Each group occupies 16 int32x4_t vectors in both `in` and `out`.
static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                  const int howmany) {
  const int32x4_t *src = in;
  int32x4_t *dst = out;
  int remaining = howmany;
  do {
    highbd_fdct16_x4_neon(src, dst, bit);
    src += 16;
    dst += 16;
  } while (--remaining > 0);
}

// Run the 4-column 16-point ADST on `howmany` consecutive 4-column groups.
// Each group occupies 16 int32x4_t vectors in both `in` and `out`.
static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                   int howmany) {
  const int32x4_t *src = in;
  int32x4_t *dst = out;
  int remaining = howmany;
  do {
    highbd_fadst16_x4_neon(src, dst, bit);
    src += 16;
    dst += 16;
  } while (--remaining > 0);
}

// Run the 4-column 16-point identity transform on `howmany` consecutive
// 4-column groups (16 int32x4_t vectors per group in `in` and `out`).
static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit, int howmany) {
  const int32x4_t *src = in;
  int32x4_t *dst = out;
  int remaining = howmany;
  do {
    highbd_fidentity16_x4_neon(src, dst, bit);
    src += 16;
    dst += 16;
  } while (--remaining > 0);
}

// High-bitdepth 16x16 forward 2D transform. Every case follows the same
// pipeline: load (optionally lr-flipped), column transform over 4 groups of
// 4 columns, rounding shift right by 2, 16x16 transpose, row transform, then
// store to `coeff` with stride 16. `bd` is unused here.
void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Vertical (up/down) flip is handled up front by adjusting the input
  // pointer/stride; horizontal (lr) flip is the final argument to
  // load_buffer_16x16 in the cases below.
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[64], buf1[64];

  // The column kernel is chosen by the vertical component of tx_type and the
  // row kernel by the horizontal component; [2][2] indexes the 16x16 cos bits.
  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case DCT_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case IDTX:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    default: assert(0);
  }
}

// Column kernel: load int16 pixels (stride `stride`, optionally lr-flipped)
// and produce 32-bit intermediate coefficients.
typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
                                          int stride, int bit, int lr_flip);
// As above, but processes `howmany` 4-column groups, advancing the output by
// `hm_stride` vectors per group.
typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
                                               int32x4_t *out, int stride,
                                               int bit, int lr_flip,
                                               int howmany, int hm_stride);

// Row kernel: transform 32-bit intermediates and store final int32
// coefficients with output stride `stride`.
typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
                                          int bit, int stride);
// As above, but processes `howmany` groups, advancing the input by
// `hm_stride` vectors per group.
typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
                                               int32_t *out, int bit,
                                               int howmany, int hm_stride,
                                               int stride);

// Construct component kernels that include the load_buffer and store_buffer
// stages to avoid the need to spill loaded data to the stack between these and
// the txfm kernel calls.
// The TRANSFORM_*_ONE cases are only ever called in situations where the
// howmany parameter would be one, so no need for the loop at all in these
// cases.

// Column kernel, single 4-column group: load a 4xN tile and transform it.
#define TRANSFORM_COL_ONE(name, n)                                    \
  static void highbd_##name##_col_neon(const int16_t *input,          \
                                       int32x4_t *output, int stride, \
                                       int cos_bit, int lr_flip) {    \
    int32x4_t buf0[n];                                                \
    load_buffer_4x##n(input, buf0, stride, lr_flip);                  \
    highbd_##name##_x4_neon(buf0, output, cos_bit);                   \
  }

// Column kernel, `howmany` 4-column groups; output steps by hm_stride
// vectors per group.
#define TRANSFORM_COL_MANY(name, n)                                     \
  static void highbd_##name##_col_many_neon(                            \
      const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
      int lr_flip, int howmany, int hm_stride) {                        \
    int i = 0;                                                          \
    do {                                                                \
      int32x4_t buf0[n];                                                \
      load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip);          \
      highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit);   \
    } while (++i < howmany);                                            \
  }

// Row kernel, single group: transform then store an Nx4 tile.
#define TRANSFORM_ROW_ONE(name, n)                                        \
  static void highbd_##name##_row_neon(                                   \
      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
    int32x4_t buf0[n];                                                    \
    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
    store_buffer_##n##x4(buf0, output, stride);                           \
  }

// Row kernel, single group, with the extra 1/sqrt(2) rounding pass applied
// to rectangular (non-square) transform sizes before the store.
#define TRANSFORM_ROW_RECT_ONE(name, n)                                   \
  static void highbd_##name##_row_rect_neon(                              \
      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
    int32x4_t buf0[n];                                                    \
    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
    round_rect_array_s32_neon(buf0, buf0, (n));                           \
    store_buffer_##n##x4(buf0, output, stride);                           \
  }

// Row kernel, `howmany` groups; input steps by hm_stride vectors and output
// by 4 elements per group.
#define TRANSFORM_ROW_MANY(name, n)                                      \
  static void highbd_##name##_row_many_neon(                             \
      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
      int hm_stride, int stride) {                                       \
    int i = 0;                                                           \
    do {                                                                 \
      int32x4_t buf0[n];                                                 \
      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
      store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
    } while (++i < howmany);                                             \
  }

// As TRANSFORM_ROW_MANY, plus the rectangular rounding pass per group.
#define TRANSFORM_ROW_RECT_MANY(name, n)                                 \
  static void highbd_##name##_row_rect_many_neon(                        \
      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
      int hm_stride, int stride) {                                       \
    int i = 0;                                                           \
    do {                                                                 \
      int32x4_t buf0[n];                                                 \
      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
      round_rect_array_s32_neon(buf0, buf0, (n));                        \
      store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
    } while (++i < howmany);                                             \
  }

// Instantiate the concrete column/row kernels referenced by the dispatch
// tables below. Plain (non-rect) ROW_MANY kernels are only needed by
// non-realtime builds, hence the CONFIG_REALTIME_ONLY guard.
TRANSFORM_COL_ONE(fdct8, 8)
TRANSFORM_COL_ONE(fadst8, 8)
TRANSFORM_COL_ONE(fidentity8, 8)

TRANSFORM_COL_MANY(fdct4, 4)
TRANSFORM_COL_MANY(fdct8, 8)
TRANSFORM_COL_MANY(fdct16, 16)
TRANSFORM_COL_MANY(fadst4, 4)
TRANSFORM_COL_MANY(fadst8, 8)
TRANSFORM_COL_MANY(fadst16, 16)
TRANSFORM_COL_MANY(fidentity4, 4)
TRANSFORM_COL_MANY(fidentity8, 8)
TRANSFORM_COL_MANY(fidentity16, 16)

TRANSFORM_ROW_ONE(fdct16, 16)
TRANSFORM_ROW_ONE(fadst16, 16)
TRANSFORM_ROW_ONE(fidentity16, 16)

TRANSFORM_ROW_RECT_ONE(fdct8, 8)
TRANSFORM_ROW_RECT_ONE(fadst8, 8)
TRANSFORM_ROW_RECT_ONE(fidentity8, 8)

#if !CONFIG_REALTIME_ONLY
TRANSFORM_ROW_MANY(fdct4, 4)
TRANSFORM_ROW_MANY(fdct8, 8)
TRANSFORM_ROW_MANY(fadst4, 4)
TRANSFORM_ROW_MANY(fadst8, 8)
TRANSFORM_ROW_MANY(fidentity4, 4)
TRANSFORM_ROW_MANY(fidentity8, 8)
#endif

TRANSFORM_ROW_RECT_MANY(fdct4, 4)
TRANSFORM_ROW_RECT_MANY(fdct8, 8)
TRANSFORM_ROW_RECT_MANY(fdct16, 16)
TRANSFORM_ROW_RECT_MANY(fadst4, 4)
TRANSFORM_ROW_RECT_MANY(fadst8, 8)
TRANSFORM_ROW_RECT_MANY(fadst16, 16)
TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
TRANSFORM_ROW_RECT_MANY(fidentity16, 16)

// Column-kernel dispatch tables indexed by TX_TYPE. In every table the entry
// is selected by the *vertical* component of the 2D transform type (e.g.
// ADST_DCT uses the ADST column kernel; H_DCT runs DCT on rows only, so its
// column kernel is identity).

// 8-point column kernels, multi-group variant.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_col_many_neon,       // DCT_DCT
      highbd_fadst8_col_many_neon,      // ADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_ADST
      highbd_fadst8_col_many_neon,      // ADST_ADST
      highbd_fadst8_col_many_neon,      // FLIPADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_col_many_neon,      // ADST_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_col_many_neon,  // IDTX
      highbd_fdct8_col_many_neon,       // V_DCT
      highbd_fidentity8_col_many_neon,  // H_DCT
      highbd_fadst8_col_many_neon,      // V_ADST
      highbd_fidentity8_col_many_neon,  // H_ADST
      highbd_fadst8_col_many_neon,      // V_FLIPADST
      highbd_fidentity8_col_many_neon   // H_FLIPADST
    };

// 8-point column kernels, single-group variant.
static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_col_neon,       // DCT_DCT
  highbd_fadst8_col_neon,      // ADST_DCT
  highbd_fdct8_col_neon,       // DCT_ADST
  highbd_fadst8_col_neon,      // ADST_ADST
  highbd_fadst8_col_neon,      // FLIPADST_DCT
  highbd_fdct8_col_neon,       // DCT_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_col_neon,      // ADST_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_ADST
  highbd_fidentity8_col_neon,  // IDTX
  highbd_fdct8_col_neon,       // V_DCT
  highbd_fidentity8_col_neon,  // H_DCT
  highbd_fadst8_col_neon,      // V_ADST
  highbd_fidentity8_col_neon,  // H_ADST
  highbd_fadst8_col_neon,      // V_FLIPADST
  highbd_fidentity8_col_neon   // H_FLIPADST
};

// 16-point column kernels, multi-group variant.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_col_many_neon,       // DCT_DCT
      highbd_fadst16_col_many_neon,      // ADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_ADST
      highbd_fadst16_col_many_neon,      // ADST_ADST
      highbd_fadst16_col_many_neon,      // FLIPADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_col_many_neon,      // ADST_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_col_many_neon,  // IDTX
      highbd_fdct16_col_many_neon,       // V_DCT
      highbd_fidentity16_col_many_neon,  // H_DCT
      highbd_fadst16_col_many_neon,      // V_ADST
      highbd_fidentity16_col_many_neon,  // H_ADST
      highbd_fadst16_col_many_neon,      // V_FLIPADST
      highbd_fidentity16_col_many_neon   // H_FLIPADST
    };

// 4-point column kernels, multi-group variant.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_col_many_neon,       // DCT_DCT
      highbd_fadst4_col_many_neon,      // ADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_ADST
      highbd_fadst4_col_many_neon,      // ADST_ADST
      highbd_fadst4_col_many_neon,      // FLIPADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_col_many_neon,      // ADST_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_col_many_neon,  // IDTX
      highbd_fdct4_col_many_neon,       // V_DCT
      highbd_fidentity4_col_many_neon,  // H_DCT
      highbd_fadst4_col_many_neon,      // V_ADST
      highbd_fidentity4_col_many_neon,  // H_ADST
      highbd_fadst4_col_many_neon,      // V_FLIPADST
      highbd_fidentity4_col_many_neon   // H_FLIPADST
    };

static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
  highbd_fdct16_row_neon,       // DCT_DCT
  highbd_fdct16_row_neon,       // ADST_DCT
  highbd_fadst16_row_neon,      // DCT_ADST
  highbd_fadst16_row_neon,      // ADST_ADST
  highbd_fdct16_row_neon,       // FLIPADST_DCT
  highbd_fadst16_row_neon,      // DCT_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_FLIPADST
  highbd_fadst16_row_neon,      // ADST_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_ADST
  highbd_fidentity16_row_neon,  // IDTX
  highbd_fidentity16_row_neon,  // V_DCT
  highbd_fdct16_row_neon,       // H_DCT
  highbd_fidentity16_row_neon,  // V_ADST
  highbd_fadst16_row_neon,      // H_ADST
  highbd_fidentity16_row_neon,  // V_FLIPADST
  highbd_fadst16_row_neon       // H_FLIPADST
};

static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_row_rect_many_neon,       // DCT_DCT
      highbd_fdct16_row_rect_many_neon,       // ADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_ADST
      highbd_fadst16_row_rect_many_neon,      // ADST_ADST
      highbd_fdct16_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_row_rect_many_neon,  // IDTX
      highbd_fidentity16_row_rect_many_neon,  // V_DCT
      highbd_fdct16_row_rect_many_neon,       // H_DCT
      highbd_fidentity16_row_rect_many_neon,  // V_ADST
      highbd_fadst16_row_rect_many_neon,      // H_ADST
      highbd_fidentity16_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst16_row_rect_many_neon       // H_FLIPADST
    };

#if !CONFIG_REALTIME_ONLY
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_many_neon,       // DCT_DCT
      highbd_fdct8_row_many_neon,       // ADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_ADST
      highbd_fadst8_row_many_neon,      // ADST_ADST
      highbd_fdct8_row_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_many_neon,  // IDTX
      highbd_fidentity8_row_many_neon,  // V_DCT
      highbd_fdct8_row_many_neon,       // H_DCT
      highbd_fidentity8_row_many_neon,  // V_ADST
      highbd_fadst8_row_many_neon,      // H_ADST
      highbd_fidentity8_row_many_neon,  // V_FLIPADST
      highbd_fadst8_row_many_neon       // H_FLIPADST
    };
#endif

static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_rect_many_neon,       // DCT_DCT
      highbd_fdct8_row_rect_many_neon,       // ADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_ADST
      highbd_fadst8_row_rect_many_neon,      // ADST_ADST
      highbd_fdct8_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_rect_many_neon,  // IDTX
      highbd_fidentity8_row_rect_many_neon,  // V_DCT
      highbd_fdct8_row_rect_many_neon,       // H_DCT
      highbd_fidentity8_row_rect_many_neon,  // V_ADST
      highbd_fadst8_row_rect_many_neon,      // H_ADST
      highbd_fidentity8_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst8_row_rect_many_neon       // H_FLIPADST
    };

static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_row_rect_neon,       // DCT_DCT
  highbd_fdct8_row_rect_neon,       // ADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_ADST
  highbd_fadst8_row_rect_neon,      // ADST_ADST
  highbd_fdct8_row_rect_neon,       // FLIPADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // ADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_ADST
  highbd_fidentity8_row_rect_neon,  // IDTX
  highbd_fidentity8_row_rect_neon,  // V_DCT
  highbd_fdct8_row_rect_neon,       // H_DCT
  highbd_fidentity8_row_rect_neon,  // V_ADST
  highbd_fadst8_row_rect_neon,      // H_ADST
  highbd_fidentity8_row_rect_neon,  // V_FLIPADST
  highbd_fadst8_row_rect_neon       // H_FLIPADST
};

#if !CONFIG_REALTIME_ONLY
// Dispatch table selecting the 1D row (horizontal) transform for each of the
// TX_TYPES transform types, using the 4-point "many" kernels (the non-"rect"
// variants, so no rectangular scaling is folded in). Horizontal DCT
// components map to fdct4, ADST/FLIPADST components to fadst4, and IDTX plus
// the vertical-only types (V_*) to the identity kernel. Only compiled for
// non-realtime builds.
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_many_neon,       // DCT_DCT
      highbd_fdct4_row_many_neon,       // ADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_ADST
      highbd_fadst4_row_many_neon,      // ADST_ADST
      highbd_fdct4_row_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_many_neon,  // IDTX
      highbd_fidentity4_row_many_neon,  // V_DCT
      highbd_fdct4_row_many_neon,       // H_DCT
      highbd_fidentity4_row_many_neon,  // V_ADST
      highbd_fadst4_row_many_neon,      // H_ADST
      highbd_fidentity4_row_many_neon,  // V_FLIPADST
      highbd_fadst4_row_many_neon       // H_FLIPADST
    };
#endif

// Dispatch table selecting the 1D row (horizontal) transform for each of the
// TX_TYPES transform types, using the 4-point "rect many" kernels. The entry
// pattern mirrors row_highbd_txfm4_xn_arr above: horizontal DCT components
// map to fdct4, ADST/FLIPADST components to fadst4, and IDTX plus the
// vertical-only types (V_*) to the identity kernel.
// NOTE(review): the "_rect_" kernels presumably fold in the
// rectangular-block scaling factor -- confirm against the kernel definitions
// earlier in this file.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_rect_many_neon,       // DCT_DCT
      highbd_fdct4_row_rect_many_neon,       // ADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_ADST
      highbd_fadst4_row_rect_many_neon,      // ADST_ADST
      highbd_fdct4_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_rect_many_neon,  // IDTX
      highbd_fidentity4_row_rect_many_neon,  // V_DCT
      highbd_fdct4_row_rect_many_neon,       // H_DCT
      highbd_fidentity4_row_rect_many_neon,  // V_ADST
      highbd_fadst4_row_rect_many_neon,      // H_ADST
      highbd_fidentity4_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst4_row_rect_many_neon       // H_FLIPADST
    };

static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
                                  int cos_bit) {
  const int32_t *const cospi = cospi_arr_s32(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // Workspaces for intermediate transform steps.
  int32x4_t buf0[32];
  int32x4_t buf1[32];

  // stage 1
  butterfly_dct_pre(input, buf1, 32);

  // stage 2
  butterfly_dct_pre(buf1, buf0, 16);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
                      v_cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  butterfly_dct_pre(buf0, buf1, 8);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
                      v_cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);

  // stage 4
  butterfly_dct_pre(buf1, buf0, 4);
  buf0[4] = buf1[4];
  butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
                      v_cos_bit);
  buf0[7] = buf1[7];
  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
                      v_cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
                      v_cos_bit);
  butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
  buf1[8] = buf0[8];
  butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
                      v_cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];

  butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
                      v_cos_bit);
  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
  butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
  buf0[16] = buf1[16];
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];

  butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
                      v_cos_bit);
  butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
                      v_cos_bit);

  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
                      v_cos_bit);
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=87 H=90 G=88

¤ Dauer der Verarbeitung: 0.22 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.