/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"
static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
                                                        int32x4_t *out) {
  // Unlike the transposes in transpose_neon.h, only the low 64x32 sub-matrix
  // is produced: the following row transform never reads the remainder.
  // Each 4x4 tile is independent, so the tile iteration order is free.
  for (int i = 0; i < 16; ++i) {
    for (int j = 0; j < 8; ++j) {
      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
    }
  }
}
// A note on butterfly helper naming:
//
// butterfly_[weight_indices]_neon
// e.g. butterfly_0321_neon
// ^ Weights are applied as indices 0, 3, 2, 1
// (see more detail below)
//
// Weight indices are treated as an index into the 4-tuple of the weight
// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
// This is then represented in the helper naming by referring to the lane index
// in the loaded tuple that each multiply is performed with:
//
// in0 in1
// /------------
// out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1]
// out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3]
//
// So for indices 0321 from the earlier example, we end up with:
//
// in0 in1
// /------------------
// out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1)
// out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0)
#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \
do { \
int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \
int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \
*out = vrshlq_s32(x, v_bit); \
} while (false )
// With (w0, w1) = cospi[2 * widx0], cospi[2 * widx0 + 1]:
//   *out0 = round_shift(n0 * w0 + n1 * w1)
//   *out1 = round_shift(n0 * w1 - n1 * w0)
static AOM_FORCE_INLINE void butterfly_0112_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}
// With (w0, w1) = cospi[2 * widx0], cospi[2 * widx0 + 1]:
//   *out0 = round_shift(-n0 * w0 - n1 * w1)
//   *out1 = round_shift(n0 * w1 - n1 * w0)
static AOM_FORCE_INLINE void butterfly_2312_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}
// With (w0, w1) = cospi[2 * widx0], cospi[2 * widx0 + 1]:
//   *out0 = round_shift(n0 * w0 - n1 * w1)
//   *out1 = round_shift(-n0 * w1 - n1 * w0)
static AOM_FORCE_INLINE void butterfly_0332_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
}
// With (w0, w1) = cospi[2 * widx0], cospi[2 * widx0 + 1]:
//   *out0 = round_shift(n0 * w0 + n1 * w1)
//   *out1 = round_shift(n1 * w0 - n0 * w1)
static AOM_FORCE_INLINE void butterfly_0130_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
}
// Fixed-weight butterfly with w32 = cospi[2 * 32]:
//   *out0 = round_shift((n0 + n1) * w32)
//   *out1 = round_shift((n0 - n1) * w32)
static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
}
// Fixed-weight butterfly with w32 = cospi[2 * 32]:
//   *out0 = round_shift((n0 - n1) * w32)
//   *out1 = round_shift(-(n0 + n1) * w32)
static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
}
static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
                                                       int32x4_t *output,
                                                       const int size) {
  // Rectangular-transform scaling: multiply each vector by NewSqrt2, then
  // rounding-shift right by NewSqrt2Bits. `size` must be >= 1.
  const int32x4_t v_newsqrt2 = vdupq_n_s32(NewSqrt2);
  int idx = 0;
  do {
    const int32x4_t scaled = vmulq_s32(input[idx], v_newsqrt2);
    output[idx] = vrshrq_n_s32(scaled, NewSqrt2Bits);
  } while (++idx < size);
}
static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
    const int32x4_t *input, int32x4_t *output, const int size) {
  // Like round_rect_array_s32_neon, but each input vector is first
  // rounding-shifted right by 2. `size` must be >= 1.
  const int32x4_t v_newsqrt2 = vdupq_n_s32(NewSqrt2);
  int idx = 0;
  do {
    const int32x4_t shifted = vrshrq_n_s32(input[idx], 2);
    const int32x4_t scaled = vmulq_s32(shifted, v_newsqrt2);
    output[idx] = vrshrq_n_s32(scaled, NewSqrt2Bits);
  } while (++idx < size);
}
#define LOAD_BUFFER_4XH(h) \
static AOM_FORCE_INLINE void load_buffer_4x## h( \
const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
if (fliplr) { \
for (int i = 0; i < (h); ++i) { \
int16x4_t a = vld1_s16(input + i * stride); \
a = vrev64_s16(a); \
in[i] = vshll_n_s16(a, 2); \
} \
} else { \
for (int i = 0; i < (h); ++i) { \
int16x4_t a = vld1_s16(input + i * stride); \
in[i] = vshll_n_s16(a, 2); \
} \
} \
}
// AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to
// avoid the expression even though the compiler can prove that the code path
// is never taken if `shift == 0`. The inner `(shift) == 0 ? 1 : (shift)`
// keeps the intrinsic's immediate in its valid range even in the dead branch.
#define shift_left_long_s16(a, shift) \
  ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
#define LOAD_BUFFER_WXH(w, h, shift) \
static AOM_FORCE_INLINE void load_buffer_## w## x## h( \
const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
assert(w >= 8); \
if (fliplr) { \
for (int i = 0; i < (h); ++i) { \
for (int j = 0; j < (w) / 8; ++j) { \
int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
a = vrev64q_s16(a); \
int j2 = (w) / 8 - j - 1; \
in[i + (h) * (2 * j2 + 0)] = \
shift_left_long_s16(vget_high_s16(a), (shift)); \
in[i + (h) * (2 * j2 + 1)] = \
shift_left_long_s16(vget_low_s16(a), (shift)); \
} \
} \
} else { \
for (int i = 0; i < (h); ++i) { \
for (int j = 0; j < (w) / 8; ++j) { \
int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
in[i + (h) * (2 * j + 0)] = \
shift_left_long_s16(vget_low_s16(a), (shift)); \
in[i + (h) * (2 * j + 1)] = \
shift_left_long_s16(vget_high_s16(a), (shift)); \
} \
} \
} \
}
// Instantiate the load helpers for each block size used by the transforms
// in this file.
LOAD_BUFFER_4XH(4)
LOAD_BUFFER_4XH(8)
LOAD_BUFFER_4XH(16)
LOAD_BUFFER_4XH(32)
LOAD_BUFFER_WXH(8, 8, 2)
LOAD_BUFFER_WXH(16, 16, 2)
LOAD_BUFFER_WXH(32, 64, 0)
LOAD_BUFFER_WXH(64, 32, 2)
LOAD_BUFFER_WXH(64, 64, 0)
#if !CONFIG_REALTIME_ONLY
LOAD_BUFFER_WXH(16, 64, 0)
LOAD_BUFFER_WXH(64, 16, 2)
#endif  // !CONFIG_REALTIME_ONLY
#define STORE_BUFFER_WXH(w, h) \
static AOM_FORCE_INLINE void store_buffer_## w## x## h( \
const int32x4_t *in, int32_t *out, int stride) { \
for (int i = 0; i < (w); ++i) { \
for (int j = 0; j < (h) / 4; ++j) { \
vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
} \
} \
}
// Instantiate the store helpers for each output size used by the transforms
// in this file.
STORE_BUFFER_WXH(4, 4)
STORE_BUFFER_WXH(8, 4)
STORE_BUFFER_WXH(8, 8)
STORE_BUFFER_WXH(16, 4)
STORE_BUFFER_WXH(16, 16)
STORE_BUFFER_WXH(32, 4)
STORE_BUFFER_WXH(32, 32)
STORE_BUFFER_WXH(64, 32)
#if !CONFIG_REALTIME_ONLY
STORE_BUFFER_WXH(16, 32)
STORE_BUFFER_WXH(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
// Forward 4-point DCT over four interleaved columns (one column per lane).
// Results are rounding-shifted right by `bit`.
static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
  // Weight pair { cospi[2 * 16], cospi[2 * 16 + 1] }.
  const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);

  // Butterfly stage: sums / differences of mirrored input pairs.
  const int32x4_t a0 = vaddq_s32(in[0], in[3]);
  const int32x4_t a1 = vsubq_s32(in[0], in[3]);
  const int32x4_t a2 = vaddq_s32(in[1], in[2]);
  const int32x4_t a3 = vsubq_s32(in[1], in[2]);

  // Rotations by the cospi weights.
  const int32x4_t b0 = vmulq_s32(a0, cospi32);
  const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
  const int32x4_t b2 = vmulq_s32(a2, cospi32);
  const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);

  const int32x4_t c0 = vaddq_s32(b0, b2);
  const int32x4_t c1 = vsubq_s32(b0, b2);
  const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
  const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);

  // Round: vrshlq with a negative shift is a rounding shift right by `bit`.
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t d0 = vrshlq_s32(c0, v_bit);
  const int32x4_t d1 = vrshlq_s32(c1, v_bit);
  const int32x4_t d2 = vrshlq_s32(c2, v_bit);
  const int32x4_t d3 = vrshlq_s32(c3, v_bit);

  // Note the d1/d2 swap in the output ordering.
  out[0] = d0;
  out[1] = d2;
  out[2] = d1;
  out[3] = d3;
}
// Forward 4-point ADST over four interleaved columns, using entries 1..4 of
// the sinpi weight table. Results are rounding-shifted right by `bit`.
static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  // sinpi = { table[1], table[2], table[3], table[4] }.
  const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);

  const int32x4_t a0 = vaddq_s32(in[0], in[1]);
  const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
  const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
  const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);

  const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
  const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
  const int32x4_t b2 = vsubq_s32(a0, in[3]);

  const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
  const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
  const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);

  const int32x4_t d0 = vaddq_s32(c0, a3);
  const int32x4_t d1 = vsubq_s32(c1, a3);
  const int32x4_t d2 = vsubq_s32(c1, c0);

  const int32x4_t e0 = vaddq_s32(d2, a3);

  // Round: vrshlq with a negative shift is a rounding shift right by `bit`.
  const int32x4_t v_bit = vdupq_n_s32(-bit);
  out[0] = vrshlq_s32(d0, v_bit);
  out[1] = vrshlq_s32(c2, v_bit);
  out[2] = vrshlq_s32(d1, v_bit);
  out[3] = vrshlq_s32(e0, v_bit);
}
static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  // Identity transform for 4-point columns: scale each vector by
  // NewSqrt2 / 2^NewSqrt2Bits with rounding. `bit` is unused here.
  (void)bit;
  const int32x4_t scale = vdupq_n_s32(NewSqrt2);
  int idx = 0;
  do {
    const int32x4_t product = vmulq_s32(in[idx], scale);
    out[idx] = vrshrq_n_s32(product, NewSqrt2Bits);
  } while (++idx < 4);
}
// 4x4 forward transform, high bit-depth path.
//
// Applies the column transform selected by `tx_type`, transposes, applies
// the row transform, then stores the 4x4 result row-major into `coeff`.
// Vertical flips are realized up front via ud_adjust_input_and_stride();
// horizontal flips via the `fliplr` argument of load_buffer_4x4().
//
// NOTE(review): some cases below mix av1_fwd_cos_bit_row/col between the
// two passes (e.g. H_DCT, V_FLIPADST); this is only correct if both tables
// hold the same value for 4x4 — confirm against av1_fwd_cos_bit_* tables.
void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
                             int input_stride, TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);

  // Workspace for column/row-wise transforms.
  int32x4_t buf[4];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case IDTX:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    default: assert(0);
  }
}
// First butterfly stage of a DCT: sums of mirrored pairs go in the first
// half, inner-first differences in the second half. e.g. for n=4:
//   out[0] = in[0] + in[3]   out[2] = in[1] - in[2]
//   out[1] = in[1] + in[2]   out[3] = in[0] - in[3]
static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
                                               int32x4_t *output, int n) {
  const int half = n / 2;
  for (int k = 0; k < half; ++k) {
    output[k] = vaddq_s32(input[k], input[n - 1 - k]);
  }
  for (int k = 0; k < half; ++k) {
    output[half + k] = vsubq_s32(input[half - 1 - k], input[half + k]);
  }
}
// Butterfly post-processing stage, e.g. for n=8:
//   out[0] = in0[0] + in1[3]   out[4] = in0[7] - in1[4]
//   out[1] = in0[1] + in1[2]   out[5] = in0[6] - in1[5]
//   out[2] = in0[1] - in1[2]   out[6] = in0[6] + in1[5]
//   out[3] = in0[0] - in1[3]   out[7] = in0[7] + in1[4]
static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
                                                const int32x4_t *in1,
                                                int32x4_t *output, int n) {
  const int quarter = n / 4;
  const int half = n / 2;
  for (int k = 0; k < quarter; ++k) {
    output[k] = vaddq_s32(in0[k], in1[half - 1 - k]);
  }
  for (int k = 0; k < quarter; ++k) {
    output[quarter + k] = vsubq_s32(in0[quarter - 1 - k], in1[quarter + k]);
  }
  for (int k = 0; k < quarter; ++k) {
    output[half + k] = vsubq_s32(in0[n - 1 - k], in1[half + k]);
  }
  for (int k = 0; k < quarter; ++k) {
    output[3 * quarter + k] =
        vaddq_s32(in0[3 * quarter + k], in1[3 * quarter - 1 - k]);
  }
}
// Forward 8-point DCT over four interleaved columns. Even outputs come from
// the embedded 4-point sub-transform (c[0..3]); odd outputs from the
// stage-4/5 rotations of c[4..7]. Results are rounding-shifted by `bit`.
static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // stage 1
  int32x4_t a[8];
  butterfly_dct_pre(in, a, 8);

  // stage 2
  int32x4_t b[8];
  butterfly_dct_pre(a, b, 4);
  butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);

  // stage 3
  int32x4_t c[8];
  butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
  butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
  butterfly_dct_post(a + 4, b + 4, c + 4, 4);

  // stage 4-5
  butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
  butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);

  out[0] = c[0];
  out[2] = c[2];
  out[4] = c[1];
  out[6] = c[3];
}
// Forward 8-point ADST over four interleaved columns, computed in seven
// stages with u*/v* as alternating scratch. Results are rounding-shifted by
// `bit`; the final stage applies the ADST output permutation.
static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;

  // stage 0-1: input permutation.
  u0 = in[0];
  u1 = in[7];
  u2 = in[3];
  u3 = in[4];
  u4 = in[1];
  u5 = in[6];
  u6 = in[2];
  u7 = in[5];

  // stage 2
  v0 = u0;
  v1 = u1;
  butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
  v4 = u4;
  v5 = u5;
  butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);

  // stage 3
  u0 = vaddq_s32(v0, v2);
  u1 = vsubq_s32(v3, v1);
  u2 = vsubq_s32(v0, v2);
  u3 = vaddq_s32(v1, v3);
  u4 = vsubq_s32(v6, v4);
  u5 = vaddq_s32(v5, v7);
  u6 = vaddq_s32(v4, v6);
  u7 = vsubq_s32(v5, v7);

  // stage 4
  v0 = u0;
  v1 = u1;
  v2 = u2;
  v3 = u3;
  butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);

  // stage 5
  u0 = vaddq_s32(v0, v4);
  u1 = vaddq_s32(v1, v5);
  u2 = vaddq_s32(v2, v6);
  u3 = vsubq_s32(v7, v3);
  u4 = vsubq_s32(v0, v4);
  u5 = vsubq_s32(v1, v5);
  u6 = vsubq_s32(v2, v6);
  u7 = vaddq_s32(v3, v7);

  // stage 6: final rotations by the odd cospi weights.
  butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
  butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
  butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);

  // stage 7: output permutation.
  out[0] = v1;
  out[1] = v6;
  out[2] = v3;
  out[3] = v4;
  out[4] = v5;
  out[5] = v2;
  out[6] = v7;
  out[7] = v0;
}
static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  // Identity transform for 8-point columns: scale each vector by 2, i.e. a
  // single left shift. `bit` is unused here.
  (void)bit;
  for (int i = 0; i < 8; ++i) {
    out[i] = vshlq_n_s32(in[i], 1);
  }
}
static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit,
                                                  int howmany) {
  // Run the 8-point forward DCT on `howmany` consecutive 4-lane column
  // groups (each group spans 8 vectors). Requires howmany >= 1.
  const int group = 8;
  int k = 0;
  do {
    highbd_fdct8_x4_neon(in + k * group, out + k * group, bit);
  } while (++k < howmany);
}
static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit,
                                                   int howmany) {
  // Run the 8-point forward ADST on `howmany` consecutive 4-lane column
  // groups (each group spans 8 vectors). Requires howmany >= 1.
  const int group = 8;
  int k = 0;
  do {
    highbd_fadst8_x4_neon(in + k * group, out + k * group, bit);
  } while (++k < howmany);
}
static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
                                                       int32x4_t *out, int bit,
                                                       int howmany) {
  // Run the 8-point identity transform on `howmany` consecutive 4-lane
  // column groups (each group spans 8 vectors). Requires howmany >= 1.
  (void)bit;
  const int group = 8;
  int k = 0;
  do {
    highbd_fidentity8_x4_neon(in + k * group, out + k * group, bit);
  } while (++k < howmany);
}
// 8x8 forward transform, high bit-depth path.
//
// Column transform (2 groups of 4 lanes), mid rounding shift by 1,
// transpose, row transform, then row-major store into `coeff`. Vertical
// flips are realized via ud_adjust_input_and_stride(); horizontal flips via
// the `fliplr` argument of load_buffer_8x8().
//
// NOTE(review): the IDTX..H_FLIPADST cases pass av1_fwd_cos_bit_col for the
// post-transpose pass; this is only correct if the col/row tables hold the
// same value for 8x8 — confirm against the av1_fwd_cos_bit_* definitions.
void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[16], buf1[16];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case IDTX:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    default: assert(0);
  }
}
// Forward 16-point DCT over four interleaved columns. u[] and v[] alternate
// as scratch between stages; the final write-out reorders v[] by 4-bit
// bit-reversal (0, 8, 4, 12, 2, 10, ...). Results are rounding-shifted by
// `bit`.
static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                  int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 1
  butterfly_dct_pre(in, u, 16);

  // stage 2
  butterfly_dct_pre(u, v, 8);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
  butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
  v[14] = u[14];
  v[15] = u[15];

  // stage 3
  butterfly_dct_pre(v, u, 4);
  u[4] = v[4];
  butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
  u[7] = v[7];
  butterfly_dct_post(v + 8, v + 8, u + 8, 8);

  // stage 4
  butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
  butterfly_dct_post(u + 4, u + 4, v + 4, 4);
  v[8] = u[8];
  butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
  butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
  v[11] = u[11];
  v[12] = u[12];
  v[15] = u[15];

  // stage 5
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];
  butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
  butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
  butterfly_dct_post(v + 8, v + 8, u + 8, 4);
  butterfly_dct_post(v + 12, v + 12, u + 12, 4);

  // stage 6: final rotations by the odd cospi weights.
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];
  butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
  butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
  butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
  butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);

  // Bit-reversed output permutation.
  out[0] = v[0];
  out[1] = v[8];
  out[2] = v[4];
  out[3] = v[12];
  out[4] = v[2];
  out[5] = v[10];
  out[6] = v[6];
  out[7] = v[14];
  out[8] = v[1];
  out[9] = v[9];
  out[10] = v[5];
  out[11] = v[13];
  out[12] = v[3];
  out[13] = v[11];
  out[14] = v[7];
  out[15] = v[15];
}
// Forward 16-point ADST over four interleaved columns, computed in nine
// stages with u[]/v[] as alternating scratch. Results are rounding-shifted
// by `bit`; the final stage applies the ADST output permutation.
static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                   int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 0-1: input permutation.
  u[0] = in[0];
  u[1] = in[15];
  u[2] = in[7];
  u[3] = in[8];
  u[4] = in[3];
  u[5] = in[12];
  u[6] = in[4];
  u[7] = in[11];
  u[8] = in[1];
  u[9] = in[14];
  u[10] = in[6];
  u[11] = in[9];
  u[12] = in[2];
  u[13] = in[13];
  u[14] = in[5];
  u[15] = in[10];

  // stage 2
  v[0] = u[0];
  v[1] = u[1];
  butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
  v[4] = u[4];
  v[5] = u[5];
  butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
  v[12] = u[12];
  v[13] = u[13];
  butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);

  // stage 3
  u[0] = vaddq_s32(v[0], v[2]);
  u[1] = vsubq_s32(v[3], v[1]);
  u[2] = vsubq_s32(v[0], v[2]);
  u[3] = vaddq_s32(v[1], v[3]);
  u[4] = vsubq_s32(v[6], v[4]);
  u[5] = vaddq_s32(v[5], v[7]);
  u[6] = vaddq_s32(v[4], v[6]);
  u[7] = vsubq_s32(v[5], v[7]);
  u[8] = vsubq_s32(v[10], v[8]);
  u[9] = vaddq_s32(v[9], v[11]);
  u[10] = vaddq_s32(v[8], v[10]);
  u[11] = vsubq_s32(v[9], v[11]);
  u[12] = vaddq_s32(v[12], v[14]);
  u[13] = vsubq_s32(v[15], v[13]);
  u[14] = vsubq_s32(v[12], v[14]);
  u[15] = vaddq_s32(v[13], v[15]);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);
  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];
  butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
  butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 5
  u[0] = vaddq_s32(v[0], v[4]);
  u[1] = vaddq_s32(v[1], v[5]);
  u[2] = vaddq_s32(v[2], v[6]);
  u[3] = vsubq_s32(v[7], v[3]);
  u[4] = vsubq_s32(v[0], v[4]);
  u[5] = vsubq_s32(v[1], v[5]);
  u[6] = vsubq_s32(v[2], v[6]);
  u[7] = vaddq_s32(v[3], v[7]);
  u[8] = vaddq_s32(v[8], v[12]);
  u[9] = vaddq_s32(v[9], v[13]);
  u[10] = vsubq_s32(v[14], v[10]);
  u[11] = vaddq_s32(v[11], v[15]);
  u[12] = vsubq_s32(v[8], v[12]);
  u[13] = vsubq_s32(v[9], v[13]);
  u[14] = vaddq_s32(v[10], v[14]);
  u[15] = vsubq_s32(v[11], v[15]);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];
  butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
  butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);

  // stage 7
  u[0] = vaddq_s32(v[0], v[8]);
  u[1] = vaddq_s32(v[1], v[9]);
  u[2] = vaddq_s32(v[2], v[10]);
  u[3] = vaddq_s32(v[3], v[11]);
  u[4] = vaddq_s32(v[4], v[12]);
  u[5] = vaddq_s32(v[5], v[13]);
  u[6] = vaddq_s32(v[6], v[14]);
  u[7] = vsubq_s32(v[15], v[7]);
  u[8] = vsubq_s32(v[0], v[8]);
  u[9] = vsubq_s32(v[1], v[9]);
  u[10] = vsubq_s32(v[2], v[10]);
  u[11] = vsubq_s32(v[3], v[11]);
  u[12] = vsubq_s32(v[4], v[12]);
  u[13] = vsubq_s32(v[5], v[13]);
  u[14] = vsubq_s32(v[6], v[14]);
  u[15] = vaddq_s32(v[7], v[15]);

  // stage 8: final rotations by the odd cospi weights.
  butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
  butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
  butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
  butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 9: output permutation.
  out[0] = v[1];
  out[1] = v[14];
  out[2] = v[3];
  out[3] = v[12];
  out[4] = v[5];
  out[5] = v[10];
  out[6] = v[7];
  out[7] = v[8];
  out[8] = v[9];
  out[9] = v[6];
  out[10] = v[11];
  out[11] = v[4];
  out[12] = v[13];
  out[13] = v[2];
  out[14] = v[15];
  out[15] = v[0];
}
// Identity transform for 16-point columns: scale each vector by 2*sqrt(2),
// i.e. multiply by 2 * NewSqrt2 and rounding-shift right by NewSqrt2Bits.
// `bit` is unused here.
static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit) {
  (void)bit;
  const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
  for (int i = 0; i < 16; i++) {
    const int32x4_t a = vmulq_s32(in[i], fact);
    // vrshrq_n_s32 performs the add-offset-then-shift rounding in one step,
    // matching highbd_fidentity4_x4_neon and avoiding a separately computed
    // rounding offset that could wrap on addition near INT32_MAX.
    out[i] = vrshrq_n_s32(a, NewSqrt2Bits);
  }
}
static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                  const int howmany) {
  // Run the 16-point forward DCT on `howmany` consecutive 4-lane column
  // groups (each group spans 16 vectors). Requires howmany >= 1.
  const int group = 16;
  int k = 0;
  do {
    highbd_fdct16_x4_neon(in + k * group, out + k * group, bit);
  } while (++k < howmany);
}
static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                   int howmany) {
  // Run the 16-point forward ADST on `howmany` consecutive 4-lane column
  // groups (each group spans 16 vectors). Requires howmany >= 1.
  const int group = 16;
  int k = 0;
  do {
    highbd_fadst16_x4_neon(in + k * group, out + k * group, bit);
  } while (++k < howmany);
}
static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit, int howmany) {
  // Run the 16-point identity transform on `howmany` consecutive 4-lane
  // column groups (each group spans 16 vectors). Requires howmany >= 1.
  const int group = 16;
  int k = 0;
  do {
    highbd_fidentity16_x4_neon(in + k * group, out + k * group, bit);
  } while (++k < howmany);
}
void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;

  // Every 1D 16-point kernel shares one signature, so the 2D transform can be
  // expressed as a single pipeline with per-tx_type kernel selection instead
  // of a 16-way switch over structurally identical bodies.
  typedef void (*txfm16_xn_fn)(const int32x4_t *in, int32x4_t *out, int bit,
                               int howmany);
  static const txfm16_xn_fn col_txfm[TX_TYPES] = {
    highbd_fdct16_xn_neon,       // DCT_DCT
    highbd_fadst16_xn_neon,      // ADST_DCT
    highbd_fdct16_xn_neon,       // DCT_ADST
    highbd_fadst16_xn_neon,      // ADST_ADST
    highbd_fadst16_xn_neon,      // FLIPADST_DCT
    highbd_fdct16_xn_neon,       // DCT_FLIPADST
    highbd_fadst16_xn_neon,      // FLIPADST_FLIPADST
    highbd_fadst16_xn_neon,      // ADST_FLIPADST
    highbd_fadst16_xn_neon,      // FLIPADST_ADST
    highbd_fidentity16_xn_neon,  // IDTX
    highbd_fdct16_xn_neon,       // V_DCT
    highbd_fidentity16_xn_neon,  // H_DCT
    highbd_fadst16_xn_neon,      // V_ADST
    highbd_fidentity16_xn_neon,  // H_ADST
    highbd_fadst16_xn_neon,      // V_FLIPADST
    highbd_fidentity16_xn_neon,  // H_FLIPADST
  };
  static const txfm16_xn_fn row_txfm[TX_TYPES] = {
    highbd_fdct16_xn_neon,       // DCT_DCT
    highbd_fdct16_xn_neon,       // ADST_DCT
    highbd_fadst16_xn_neon,      // DCT_ADST
    highbd_fadst16_xn_neon,      // ADST_ADST
    highbd_fdct16_xn_neon,       // FLIPADST_DCT
    highbd_fadst16_xn_neon,      // DCT_FLIPADST
    highbd_fadst16_xn_neon,      // FLIPADST_FLIPADST
    highbd_fadst16_xn_neon,      // ADST_FLIPADST
    highbd_fadst16_xn_neon,      // FLIPADST_ADST
    highbd_fidentity16_xn_neon,  // IDTX
    highbd_fidentity16_xn_neon,  // V_DCT
    highbd_fdct16_xn_neon,       // H_DCT
    highbd_fidentity16_xn_neon,  // V_ADST
    highbd_fadst16_xn_neon,      // H_ADST
    highbd_fidentity16_xn_neon,  // V_FLIPADST
    highbd_fadst16_xn_neon,      // H_FLIPADST
  };

  assert(tx_type < TX_TYPES);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Up/down flipping is folded into the load by adjusting the input pointer
  // and stride; left/right flipping is handled inside load_buffer_16x16.
  // get_flip_cfg reports lr_flip for exactly the *_FLIPADST row-transform
  // types, which the original switch handled with an explicit flip=1 load.
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[64], buf1[64];
  load_buffer_16x16(input, buf0, stride, lr_flip);
  col_txfm[tx_type](buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
  shift_right_2_round_s32_x4(buf0, buf0, 64);
  transpose_arrays_s32_16x16(buf0, buf1);
  row_txfm[tx_type](buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
  store_buffer_16x16(buf1, coeff, /*stride=*/16);
}
// Function-pointer signatures for the 1D transform stages used by the
// rectangular 2D transforms below.
//
// Column kernel: loads int16 pixels itself (hence the int16_t input and
// stride), optionally applying a left/right flip at load time.
typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
                                          int stride, int bit, int lr_flip);
// As above, but processes `howmany` 4-wide strips, writing each strip
// `hm_stride` vectors apart in the output.
typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
                                               int32x4_t *out, int stride,
                                               int bit, int lr_flip,
                                               int howmany, int hm_stride);
// Row kernel: consumes the transposed column output and stores the final
// int32 coefficients itself (hence the int32_t output and stride).
typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
                                          int bit, int stride);
// As above, but processes `howmany` strips read `hm_stride` vectors apart.
typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
                                               int32_t *out, int bit,
                                               int howmany, int hm_stride,
                                               int stride);
// Construct component kernels that include the load_buffer and store_buffer
// stages to avoid the need to spill loaded data to the stack between these and
// the txfm kernel calls.
// The TRANSFORM_*_ONE cases are only ever called in situations where the
// howmany parameter would be one, so no need for the loop at all in these
// cases.
#define TRANSFORM_COL_ONE(name, n) \
static void highbd_## name## _col_neon(const int16_t *input, \
int32x4_t *output, int stride, \
int cos_bit, int lr_flip) { \
int32x4_t buf0[n]; \
load_buffer_4x## n(input, buf0, stride, lr_flip); \
highbd_## name## _x4_neon(buf0, output, cos_bit); \
}
#define TRANSFORM_COL_MANY(name, n) \
static void highbd_## name## _col_many_neon( \
const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
int lr_flip, int howmany, int hm_stride) { \
int i = 0; \
do { \
int32x4_t buf0[n]; \
load_buffer_4x## n(input + 4 * i, buf0, stride, lr_flip); \
highbd_## name## _x4_neon(buf0, output + i * hm_stride, cos_bit); \
} while (++i < howmany); \
}
#define TRANSFORM_ROW_ONE(name, n) \
static void highbd_## name## _row_neon( \
const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
int32x4_t buf0[n]; \
highbd_## name## _x4_neon(input, buf0, cos_bit); \
store_buffer_## n## x4(buf0, output, stride); \
}
#define TRANSFORM_ROW_RECT_ONE(name, n) \
static void highbd_## name## _row_rect_neon( \
const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
int32x4_t buf0[n]; \
highbd_## name## _x4_neon(input, buf0, cos_bit); \
round_rect_array_s32_neon(buf0, buf0, (n)); \
store_buffer_## n## x4(buf0, output, stride); \
}
#define TRANSFORM_ROW_MANY(name, n) \
static void highbd_## name## _row_many_neon( \
const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
int hm_stride, int stride) { \
int i = 0; \
do { \
int32x4_t buf0[n]; \
highbd_## name## _x4_neon(input + hm_stride * i, buf0, cos_bit); \
store_buffer_## n## x4(buf0, output + 4 * i, stride); \
} while (++i < howmany); \
}
#define TRANSFORM_ROW_RECT_MANY(name, n) \
static void highbd_## name## _row_rect_many_neon( \
const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
int hm_stride, int stride) { \
int i = 0; \
do { \
int32x4_t buf0[n]; \
highbd_## name## _x4_neon(input + hm_stride * i, buf0, cos_bit); \
round_rect_array_s32_neon(buf0, buf0, (n)); \
store_buffer_## n## x4(buf0, output + 4 * i, stride); \
} while (++i < howmany); \
}
// Instantiate the fused load+transform column kernels.
TRANSFORM_COL_ONE(fdct8, 8)
TRANSFORM_COL_ONE(fadst8, 8)
TRANSFORM_COL_ONE(fidentity8, 8)
TRANSFORM_COL_MANY(fdct4, 4)
TRANSFORM_COL_MANY(fdct8, 8)
TRANSFORM_COL_MANY(fdct16, 16)
TRANSFORM_COL_MANY(fadst4, 4)
TRANSFORM_COL_MANY(fadst8, 8)
TRANSFORM_COL_MANY(fadst16, 16)
TRANSFORM_COL_MANY(fidentity4, 4)
TRANSFORM_COL_MANY(fidentity8, 8)
TRANSFORM_COL_MANY(fidentity16, 16)
// Instantiate the fused transform+store row kernels (the _RECT_ variants add
// the rectangular-transform rounding step).
TRANSFORM_ROW_ONE(fdct16, 16)
TRANSFORM_ROW_ONE(fadst16, 16)
TRANSFORM_ROW_ONE(fidentity16, 16)
TRANSFORM_ROW_RECT_ONE(fdct8, 8)
TRANSFORM_ROW_RECT_ONE(fadst8, 8)
TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
// The non-rect many-strip row kernels are only needed by transforms compiled
// out of realtime-only builds.
#if !CONFIG_REALTIME_ONLY
TRANSFORM_ROW_MANY(fdct4, 4)
TRANSFORM_ROW_MANY(fdct8, 8)
TRANSFORM_ROW_MANY(fadst4, 4)
TRANSFORM_ROW_MANY(fadst8, 8)
TRANSFORM_ROW_MANY(fidentity4, 4)
TRANSFORM_ROW_MANY(fidentity8, 8)
#endif
TRANSFORM_ROW_RECT_MANY(fdct4, 4)
TRANSFORM_ROW_RECT_MANY(fdct8, 8)
TRANSFORM_ROW_RECT_MANY(fdct16, 16)
TRANSFORM_ROW_RECT_MANY(fadst4, 4)
TRANSFORM_ROW_RECT_MANY(fadst8, 8)
TRANSFORM_ROW_RECT_MANY(fadst16, 16)
TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
// Column (vertical) transform kernels for 8-point columns, many-strip
// variant, indexed by TX_TYPE.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_col_many_neon,       // DCT_DCT
      highbd_fadst8_col_many_neon,      // ADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_ADST
      highbd_fadst8_col_many_neon,      // ADST_ADST
      highbd_fadst8_col_many_neon,      // FLIPADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_col_many_neon,      // ADST_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_col_many_neon,  // IDTX
      highbd_fdct8_col_many_neon,       // V_DCT
      highbd_fidentity8_col_many_neon,  // H_DCT
      highbd_fadst8_col_many_neon,      // V_ADST
      highbd_fidentity8_col_many_neon,  // H_ADST
      highbd_fadst8_col_many_neon,      // V_FLIPADST
      highbd_fidentity8_col_many_neon   // H_FLIPADST
    };
// Column transform kernels for 8-point columns, single-strip variant,
// indexed by TX_TYPE.
static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_col_neon,       // DCT_DCT
  highbd_fadst8_col_neon,      // ADST_DCT
  highbd_fdct8_col_neon,       // DCT_ADST
  highbd_fadst8_col_neon,      // ADST_ADST
  highbd_fadst8_col_neon,      // FLIPADST_DCT
  highbd_fdct8_col_neon,       // DCT_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_col_neon,      // ADST_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_ADST
  highbd_fidentity8_col_neon,  // IDTX
  highbd_fdct8_col_neon,       // V_DCT
  highbd_fidentity8_col_neon,  // H_DCT
  highbd_fadst8_col_neon,      // V_ADST
  highbd_fidentity8_col_neon,  // H_ADST
  highbd_fadst8_col_neon,      // V_FLIPADST
  highbd_fidentity8_col_neon   // H_FLIPADST
};
// Column transform kernels for 16-point columns, many-strip variant,
// indexed by TX_TYPE.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_col_many_neon,       // DCT_DCT
      highbd_fadst16_col_many_neon,      // ADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_ADST
      highbd_fadst16_col_many_neon,      // ADST_ADST
      highbd_fadst16_col_many_neon,      // FLIPADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_col_many_neon,      // ADST_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_col_many_neon,  // IDTX
      highbd_fdct16_col_many_neon,       // V_DCT
      highbd_fidentity16_col_many_neon,  // H_DCT
      highbd_fadst16_col_many_neon,      // V_ADST
      highbd_fidentity16_col_many_neon,  // H_ADST
      highbd_fadst16_col_many_neon,      // V_FLIPADST
      highbd_fidentity16_col_many_neon   // H_FLIPADST
    };
// Column transform kernels for 4-point columns, many-strip variant,
// indexed by TX_TYPE.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_col_many_neon,       // DCT_DCT
      highbd_fadst4_col_many_neon,      // ADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_ADST
      highbd_fadst4_col_many_neon,      // ADST_ADST
      highbd_fadst4_col_many_neon,      // FLIPADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_col_many_neon,      // ADST_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_col_many_neon,  // IDTX
      highbd_fdct4_col_many_neon,       // V_DCT
      highbd_fidentity4_col_many_neon,  // H_DCT
      highbd_fadst4_col_many_neon,      // V_ADST
      highbd_fidentity4_col_many_neon,  // H_ADST
      highbd_fadst4_col_many_neon,      // V_FLIPADST
      highbd_fidentity4_col_many_neon   // H_FLIPADST
    };
// Row (horizontal) transform kernels for 16-point rows, single-strip
// variant, indexed by TX_TYPE.
static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
  highbd_fdct16_row_neon,       // DCT_DCT
  highbd_fdct16_row_neon,       // ADST_DCT
  highbd_fadst16_row_neon,      // DCT_ADST
  highbd_fadst16_row_neon,      // ADST_ADST
  highbd_fdct16_row_neon,       // FLIPADST_DCT
  highbd_fadst16_row_neon,      // DCT_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_FLIPADST
  highbd_fadst16_row_neon,      // ADST_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_ADST
  highbd_fidentity16_row_neon,  // IDTX
  highbd_fidentity16_row_neon,  // V_DCT
  highbd_fdct16_row_neon,       // H_DCT
  highbd_fidentity16_row_neon,  // V_ADST
  highbd_fadst16_row_neon,      // H_ADST
  highbd_fidentity16_row_neon,  // V_FLIPADST
  highbd_fadst16_row_neon       // H_FLIPADST
};
// Row transform kernels for 16-point rows of rectangular transforms
// (include the 1/sqrt(2) rounding step), many-strip variant.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_row_rect_many_neon,       // DCT_DCT
      highbd_fdct16_row_rect_many_neon,       // ADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_ADST
      highbd_fadst16_row_rect_many_neon,      // ADST_ADST
      highbd_fdct16_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_row_rect_many_neon,  // IDTX
      highbd_fidentity16_row_rect_many_neon,  // V_DCT
      highbd_fdct16_row_rect_many_neon,       // H_DCT
      highbd_fidentity16_row_rect_many_neon,  // V_ADST
      highbd_fadst16_row_rect_many_neon,      // H_ADST
      highbd_fidentity16_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst16_row_rect_many_neon       // H_FLIPADST
    };
#if !CONFIG_REALTIME_ONLY
// Row transform kernels for 8-point rows, many-strip variant, indexed by
// TX_TYPE. Only needed by transforms excluded from realtime-only builds.
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_many_neon,       // DCT_DCT
      highbd_fdct8_row_many_neon,       // ADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_ADST
      highbd_fadst8_row_many_neon,      // ADST_ADST
      highbd_fdct8_row_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_many_neon,  // IDTX
      highbd_fidentity8_row_many_neon,  // V_DCT
      highbd_fdct8_row_many_neon,       // H_DCT
      highbd_fidentity8_row_many_neon,  // V_ADST
      highbd_fadst8_row_many_neon,      // H_ADST
      highbd_fidentity8_row_many_neon,  // V_FLIPADST
      highbd_fadst8_row_many_neon       // H_FLIPADST
    };
#endif
// Row transform kernels for 8-point rows of rectangular transforms,
// many-strip variant, indexed by TX_TYPE.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_rect_many_neon,       // DCT_DCT
      highbd_fdct8_row_rect_many_neon,       // ADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_ADST
      highbd_fadst8_row_rect_many_neon,      // ADST_ADST
      highbd_fdct8_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_rect_many_neon,  // IDTX
      highbd_fidentity8_row_rect_many_neon,  // V_DCT
      highbd_fdct8_row_rect_many_neon,       // H_DCT
      highbd_fidentity8_row_rect_many_neon,  // V_ADST
      highbd_fadst8_row_rect_many_neon,      // H_ADST
      highbd_fidentity8_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst8_row_rect_many_neon       // H_FLIPADST
    };
// Row transform kernels for 8-point rows, single-strip variant.
// NOTE(review): despite the non-rect array name, the entries are the _rect_
// variants — presumably this table serves only rectangular (e.g. 4x8)
// transforms; confirm against the callers before renaming.
static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_row_rect_neon,       // DCT_DCT
  highbd_fdct8_row_rect_neon,       // ADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_ADST
  highbd_fadst8_row_rect_neon,      // ADST_ADST
  highbd_fdct8_row_rect_neon,       // FLIPADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // ADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_ADST
  highbd_fidentity8_row_rect_neon,  // IDTX
  highbd_fidentity8_row_rect_neon,  // V_DCT
  highbd_fdct8_row_rect_neon,       // H_DCT
  highbd_fidentity8_row_rect_neon,  // V_ADST
  highbd_fadst8_row_rect_neon,      // H_ADST
  highbd_fidentity8_row_rect_neon,  // V_FLIPADST
  highbd_fadst8_row_rect_neon       // H_FLIPADST
};
#if !CONFIG_REALTIME_ONLY
// Row transform kernels for 4-point rows, many-strip variant, indexed by
// TX_TYPE. Only needed by transforms excluded from realtime-only builds.
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_many_neon,       // DCT_DCT
      highbd_fdct4_row_many_neon,       // ADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_ADST
      highbd_fadst4_row_many_neon,      // ADST_ADST
      highbd_fdct4_row_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_many_neon,  // IDTX
      highbd_fidentity4_row_many_neon,  // V_DCT
      highbd_fdct4_row_many_neon,       // H_DCT
      highbd_fidentity4_row_many_neon,  // V_ADST
      highbd_fadst4_row_many_neon,      // H_ADST
      highbd_fidentity4_row_many_neon,  // V_FLIPADST
      highbd_fadst4_row_many_neon       // H_FLIPADST
    };
#endif
// Row transform kernels for 4-point rows of rectangular transforms,
// many-strip variant, indexed by TX_TYPE.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_rect_many_neon,       // DCT_DCT
      highbd_fdct4_row_rect_many_neon,       // ADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_ADST
      highbd_fadst4_row_rect_many_neon,      // ADST_ADST
      highbd_fdct4_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_rect_many_neon,  // IDTX
      highbd_fidentity4_row_rect_many_neon,  // V_DCT
      highbd_fdct4_row_rect_many_neon,       // H_DCT
      highbd_fidentity4_row_rect_many_neon,  // V_ADST
      highbd_fadst4_row_rect_many_neon,      // H_ADST
      highbd_fidentity4_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst4_row_rect_many_neon       // H_FLIPADST
    };
static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
int cos_bit) {
const int32_t *const cospi = cospi_arr_s32(cos_bit);
const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
// Workspaces for intermediate transform steps.
int32x4_t buf0[32];
int32x4_t buf1[32];
// stage 1
butterfly_dct_pre(input, buf1, 32);
// stage 2
butterfly_dct_pre(buf1, buf0, 16);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
buf0[18] = buf1[18];
buf0[19] = buf1[19];
butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
v_cos_bit);
butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
v_cos_bit);
butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
v_cos_bit);
butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
v_cos_bit);
buf0[28] = buf1[28];
buf0[29] = buf1[29];
buf0[30] = buf1[30];
buf0[31] = buf1[31];
// stage 3
butterfly_dct_pre(buf0, buf1, 8);
buf1[8] = buf0[8];
buf1[9] = buf0[9];
butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
v_cos_bit);
butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
v_cos_bit);
buf1[14] = buf0[14];
buf1[15] = buf0[15];
butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
// stage 4
butterfly_dct_pre(buf1, buf0, 4);
buf0[4] = buf1[4];
butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
v_cos_bit);
buf0[7] = buf1[7];
butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
v_cos_bit);
butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
v_cos_bit);
butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
v_cos_bit);
butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
v_cos_bit);
buf0[22] = buf1[22];
buf0[23] = buf1[23];
buf0[24] = buf1[24];
buf0[25] = buf1[25];
buf0[30] = buf1[30];
buf0[31] = buf1[31];
// stage 5
butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
v_cos_bit);
butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
v_cos_bit);
butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
buf1[8] = buf0[8];
butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
v_cos_bit);
butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
v_cos_bit);
buf1[11] = buf0[11];
buf1[12] = buf0[12];
buf1[15] = buf0[15];
butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
// stage 6
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
v_cos_bit);
butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
v_cos_bit);
butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
v_cos_bit);
butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
buf0[16] = buf1[16];
buf0[19] = buf1[19];
buf0[20] = buf1[20];
butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
v_cos_bit);
butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
v_cos_bit);
butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
v_cos_bit);
buf0[23] = buf1[23];
buf0[24] = buf1[24];
buf0[27] = buf1[27];
buf0[28] = buf1[28];
buf0[31] = buf1[31];
// stage 7
buf1[0] = buf0[0];
buf1[1] = buf0[1];
buf1[2] = buf0[2];
buf1[3] = buf0[3];
buf1[4] = buf0[4];
buf1[5] = buf0[5];
buf1[6] = buf0[6];
buf1[7] = buf0[7];
butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
v_cos_bit);
butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
v_cos_bit);
butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
v_cos_bit);
/* NOTE: non-source extraction residue removed here; the file continues with
   the remainder of highbd_fdct32_x4_neon beyond this chunk. */