Ziele Untersuchung
mit Columbo Integrität von
Datenbanken Interaktion und
Portierbarkeit Ergonomie der
Schnittstellen

Angebot Produkte Projekt Beratung

Mittel Analytik Modellierung Sprachen Algebra Logik Hardware Denken Kreativität

Zusammenhänge Gesellschaft Wirtschaft Branche Firma


products/Sources/formale Sprachen/C/Firefox/third_party/aom/av1/common/arm/ (Browser von der Mozilla Stiftung Version 136.0.1^©) Datei vom 10.2.2025 mit Größe 54 kB

Quelle selfguided_neon.c

Sprache: C

/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_common_int.h"
#include "av1/common/common.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"

// Constants used for right shift in final_filter calculation.
// NOTE(review): judging by the names these are presumably the shift amounts
// applied on even and odd rows respectively - confirm at the use sites.
#define NB_EVEN 5
#define NB_ODD 4

// Common tail of the "fast" A/B computation for a single 4x4 tile whose rows
// are buf_stride elements apart.
//
// Step 1: for k = 0..3 compute s_k * const_n_val - s_(k+4)^2, forcing lanes
//         where the square exceeds the product to 0, multiply by s_vec,
//         rounding-shift right by SGRPROJ_MTABLE_BITS and clamp to const_val.
// Step 2: store those values to src1, use each one as an index into
//         av1_x_by_xplus1 and write the looked-up values to dst_A16.
// Step 3: overwrite the tile at src2 with
//         sr_(k+4) * one_by_n_minus_1_vec * (sgrproj_sgr - lookup),
//         rounding-shifted right by SGRPROJ_RECIP_BITS.
static inline void calc_ab_fast_internal_common(
    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
    int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
    uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
    uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
    const int buf_stride) {
  const uint32x4_t sum_sq[4] = { s0, s1, s2, s3 };
  const uint32x4_t sum[4] = { s4, s5, s6, s7 };
  const int32x4_t raw_sum[4] = { sr4, sr5, sr6, sr7 };
  uint32x4_t table_idx[4];
  uint16x4_t looked_up[4];
  uint32x4_t result[4];

  for (int i = 0; i < 4; i++) {
    const uint32x4_t scaled = vmulq_u32(sum_sq[i], const_n_val);
    const uint32x4_t squared = vmulq_u32(sum[i], sum[i]);
    // All-ones mask in lanes where the subtraction below cannot go negative;
    // the remaining lanes are zeroed by the AND.
    const uint32x4_t non_negative = vcleq_u32(squared, scaled);
    uint32x4_t v = vandq_u32(non_negative, vsubq_u32(scaled, squared));
    v = vmulq_u32(v, s_vec);
    v = vrshrq_n_u32(v, SGRPROJ_MTABLE_BITS);
    table_idx[i] = vminq_u32(v, const_val);
  }

  // The table lookup is done with scalar code, so the indices go through
  // memory (src1) and the looked-up values are read back from dst_A16.
  store_u32_4x4((uint32_t *)src1, buf_stride, table_idx[0], table_idx[1],
                table_idx[2], table_idx[3]);
  for (int row = 0; row < 4; row++) {
    for (int col = 0; col < 4; col++) {
      const int pos = row * buf_stride + col;
      dst_A16[pos] = av1_x_by_xplus1[src1[pos]];
    }
  }
  load_u16_4x4(dst_A16, buf_stride, &looked_up[0], &looked_up[1],
               &looked_up[2], &looked_up[3]);

  for (int i = 0; i < 4; i++) {
    const uint32x4_t weight = vsubl_u16(sgrproj_sgr, looked_up[i]);
    uint32x4_t v =
        vmulq_u32(vreinterpretq_u32_s32(raw_sum[i]), one_by_n_minus_1_vec);
    v = vmulq_u32(v, weight);
    result[i] = vrshrq_n_u32(v, SGRPROJ_RECIP_BITS);
  }

  store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(result[0]),
                vreinterpretq_s32_u32(result[1]),
                vreinterpretq_s32_u32(result[2]),
                vreinterpretq_s32_u32(result[3]));
}
// Common tail of the A/B computation for a single tile of 4 rows by 8
// columns whose rows are buf_stride elements apart.
//
// s0..s3 hold columns 0-3 and s4..s7 columns 4-7 of the four rows;
// s16_4..s16_7 hold the (possibly pre-scaled) 16-bit sums of the same rows
// and s16_0..s16_3 the unscaled ones.
//
// Step 1: compute s_k * const_n_val - sum^2, forcing lanes where the square
//         exceeds the product to 0, multiply by s_vec, rounding-shift right
//         by SGRPROJ_MTABLE_BITS and clamp to const_val.
// Step 2: store those values to src1, use each one as an index into
//         av1_x_by_xplus1 and write the looked-up values to dst_A16.
// Step 3: write unscaled_sum * one_by_n_minus_1_vec * (sgrproj_sgr - lookup),
//         rounding-shifted right by SGRPROJ_RECIP_BITS, to dst2.
static inline void calc_ab_internal_common(
    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
    uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
    uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
    uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
    uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
    uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
  const uint32x4_t sum_sq[8] = { s0, s1, s2, s3, s4, s5, s6, s7 };
  const uint16x8_t raw_sum[4] = { s16_0, s16_1, s16_2, s16_3 };
  const uint16x8_t sum[4] = { s16_4, s16_5, s16_6, s16_7 };
  uint32x4_t table_idx[8];
  uint16x8_t weight[4];
  uint32x4_t result[8];

  // Vectors 0..3 pair with the low halves of the four row sums, vectors 4..7
  // with the high halves.
  for (int k = 0; k < 8; k++) {
    uint16x4_t half;
    if (k < 4) {
      half = vget_low_u16(sum[k]);
    } else {
      half = vget_high_u16(sum[k - 4]);
    }
    const uint32x4_t scaled = vmulq_u32(sum_sq[k], const_n_val);
    const uint32x4_t squared = vmull_u16(half, half);
    // All-ones mask in lanes where the subtraction below cannot go negative;
    // the remaining lanes are zeroed by the AND.
    const uint32x4_t non_negative = vcleq_u32(squared, scaled);
    uint32x4_t v = vandq_u32(non_negative, vsubq_u32(scaled, squared));
    v = vmulq_u32(v, s_vec);
    v = vrshrq_n_u32(v, SGRPROJ_MTABLE_BITS);
    table_idx[k] = vminq_u32(v, const_val);
  }

  // The table lookup is done with scalar code, so the indices go through
  // memory (src1) and the looked-up values are read back from dst_A16.
  store_u32_4x4((uint32_t *)src1, buf_stride, table_idx[0], table_idx[1],
                table_idx[2], table_idx[3]);
  store_u32_4x4((uint32_t *)src1 + 4, buf_stride, table_idx[4], table_idx[5],
                table_idx[6], table_idx[7]);
  for (int row = 0; row < 4; row++) {
    for (int col = 0; col < 8; col++) {
      const int pos = row * buf_stride + col;
      dst_A16[pos] = av1_x_by_xplus1[src1[pos]];
    }
  }
  load_u16_8x4(dst_A16, buf_stride, &weight[0], &weight[1], &weight[2],
               &weight[3]);

  for (int i = 0; i < 4; i++) {
    weight[i] = vsubq_u16(sgrproj_sgr, weight[i]);
  }

  for (int k = 0; k < 8; k++) {
    uint16x4_t raw_half;
    uint16x4_t weight_half;
    if (k < 4) {
      raw_half = vget_low_u16(raw_sum[k]);
      weight_half = vget_low_u16(weight[k]);
    } else {
      raw_half = vget_high_u16(raw_sum[k - 4]);
      weight_half = vget_high_u16(weight[k - 4]);
    }
    uint32x4_t v = vmull_u16(raw_half, one_by_n_minus_1_vec);
    v = vmulq_u32(v, vmovl_u16(weight_half));
    result[k] = vrshrq_n_u32(v, SGRPROJ_RECIP_BITS);
  }

  store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(result[0]),
                vreinterpretq_s32_u32(result[1]),
                vreinterpretq_s32_u32(result[2]),
                vreinterpretq_s32_u32(result[3]));
  store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(result[4]),
                vreinterpretq_s32_u32(result[5]),
                vreinterpretq_s32_u32(result[6]),
                vreinterpretq_s32_u32(result[7]));
}

// Squares eleven consecutive rows t1..t11 (widening to 32 bits) and returns
// four sums of five squared rows each, the windows starting two rows apart:
//   *r0 = rows 1..5, *r1 = rows 3..7, *r2 = rows 5..9, *r3 = rows 7..11.
static inline void boxsum2_square_sum_calc(
    int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
    int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
    int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
  // sq[i] holds the square of row t(i + 1).
  const int16x4_t rows[11] = { t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11 };
  int32x4_t sq[11];
  for (int i = 0; i < 11; i++) {
    sq[i] = vmull_s16(rows[i], rows[i]);
  }

  // Partial sums shared between the overlapping windows.
  const int32x4_t rows_1_2 = vaddq_s32(sq[0], sq[1]);
  const int32x4_t rows_3_to_5 = vaddq_s32(vaddq_s32(sq[2], sq[3]), sq[4]);
  const int32x4_t rows_6_7 = vaddq_s32(sq[5], sq[6]);
  const int32x4_t rows_8_9 = vaddq_s32(sq[7], sq[8]);
  const int32x4_t rows_10_11 = vaddq_s32(sq[9], sq[10]);
  const int32x4_t rows_6_to_9 = vaddq_s32(rows_6_7, rows_8_9);
  const int32x4_t rows_7_to_9 = vsubq_s32(rows_6_to_9, sq[5]);

  *r0 = vaddq_s32(rows_1_2, rows_3_to_5);
  *r1 = vaddq_s32(rows_6_7, rows_3_to_5);
  *r2 = vaddq_s32(sq[4], rows_6_to_9);
  *r3 = vaddq_s32(rows_7_to_9, rows_10_11);
}

// Two-pass box sum with a window of 5 rows by 5 columns, evaluated only on
// every second row.
//
// Pass 1 (vertical): for each 8-column strip of src, sums of 5 consecutive
// rows are written to dst16 (16-bit) and sums of the 5 squared rows to dst2
// (32-bit). Successive windows start 2 rows apart and are stored
// 2 * dst_stride apart, beginning at row 2 of the destination buffers.
// Pass 2 (horizontal): sums of 5 consecutive columns of the pass-1 results.
// The sample sums are widened to 32 bits and written to dst32; the squared
// sums are updated in place in dst2. Both outputs are written starting at
// column 2.
//
// src/src_stride: input samples and row stride.
// dst16: intermediate buffer for the 16-bit vertical sums.
// dst32: output buffer for the box sums of the samples.
// dst2: output buffer for the box sums of the squared samples.
// dst_stride: row stride (in elements) shared by dst16, dst32 and dst2.
// width, height: size of the processed area; the loops advance in steps of
// 8, so the buffers are presumably padded accordingly - TODO confirm at the
// call sites.
static inline void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
                           int32_t *dst32, int32_t *dst2, const int dst_stride,
                           const int width, const int height) {
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
  assert(height > 2 * SGRPROJ_BORDER_VERT);

  int16_t *dst1_16_ptr, *src_ptr;
  int32_t *dst2_ptr;
  int h, w, count = 0;
  const int dst_stride_2 = (dst_stride << 1);
  const int dst_stride_8 = (dst_stride << 3);

  dst1_16_ptr = dst16;
  dst2_ptr = dst2;
  src_ptr = src;
  w = width;
  // Pass 1: vertical sums, one 8-column strip per outer iteration.
  {
    int16x8_t t1, t2, t3, t4, t5, t6, t7;
    int16x8_t t8, t9, t10, t11, t12;

    int16x8_t q12345, q56789, q34567, q7891011;
    int16x8_t q12, q34, q67, q89, q1011;
    int16x8_t q345, q6789, q789;

    int32x4_t r12345, r56789, r34567, r7891011;

    do {
      h = height;
      // count indexes the current strip; each strip is 8 columns wide.
      dst1_16_ptr = dst16 + (count << 3);
      dst2_ptr = dst2 + (count << 3);
      src_ptr = src + (count << 3);

      // Output starts at row 2 of the destination buffers.
      dst1_16_ptr += dst_stride_2;
      dst2_ptr += dst_stride_2;
      do {
        // Load 12 rows but advance src_ptr by only 8, so the last 4 rows are
        // loaded again as the first 4 rows of the next iteration. t12 is
        // loaded but not used by the sums below.
        load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
        src_ptr += 4 * src_stride;
        load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
        src_ptr += 4 * src_stride;
        load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);

        // Build the four overlapping 5-row sums from shared partial sums.
        q12 = vaddq_s16(t1, t2);
        q34 = vaddq_s16(t3, t4);
        q67 = vaddq_s16(t6, t7);
        q89 = vaddq_s16(t8, t9);
        q1011 = vaddq_s16(t10, t11);
        q345 = vaddq_s16(q34, t5);
        q6789 = vaddq_s16(q67, q89);
        q789 = vaddq_s16(q89, t7);
        q12345 = vaddq_s16(q12, q345);
        q34567 = vaddq_s16(q67, q345);
        q56789 = vaddq_s16(t5, q6789);
        q7891011 = vaddq_s16(q789, q1011);

        // The four sums go to every second destination row.
        store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
                      q7891011);
        dst1_16_ptr += dst_stride_8;

        // Same windows for the squared samples, columns 0-3 of the strip.
        boxsum2_square_sum_calc(
            vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
            vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
            vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
            vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
            &r7891011);

        store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);

        // Squared samples, columns 4-7 of the strip.
        boxsum2_square_sum_calc(
            vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
            vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
            vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
            vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
            &r7891011);

        store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
                      r7891011);
        dst2_ptr += (dst_stride_8);
        h -= 8;
      } while (h > 0);
      w -= 8;
      count++;
    } while (w > 0);

    // memset needed for row pixels as 2nd stage of boxsum filter uses
    // first 2 rows of dst16, dst2 buffer which is not filled in first stage.
    for (int x = 0; x < 2; x++) {
      memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
      memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
    }

    // memset needed for extra columns as 2nd stage of boxsum filter uses
    // last 2 columns of dst16, dst2 buffer which is not filled in first stage.
    for (int x = 2; x < height + 2; x++) {
      int dst_offset = x * dst_stride + width + 2;
      memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
      memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
    }
  }

  // Pass 2: horizontal sums. Each outer iteration handles a group of 8 buffer
  // rows, of which only every second row (4 rows) is read and written.
  {
    int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
    int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
    int32x4_t q12345, q34567, q23456, q45678;
    int32x4_t q23, q45, q67;
    int32x4_t q2345, q4567;

    int32x4_t r12345, r34567, r23456, r45678;
    int32x4_t r23, r45, r67;
    int32x4_t r2345, r4567;

    int32_t *src2_ptr, *dst1_32_ptr;
    int16_t *src1_ptr;
    count = 0;
    h = height;
    do {
      // All four pointers start at row 2 of the current 8-row group. The
      // squared sums are read from and written back to dst2.
      dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
      dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
      src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
      src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
      w = width;

      // Output is written starting at column 2.
      dst1_32_ptr += 2;
      dst2_ptr += 2;
      // Prime the pipeline with the first 4 columns. After the transpose
      // s1..s4 / d1..d4 each hold one column of the 4 processed rows.
      load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
      transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4);
      load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
      transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4);
      do {
        // Fetch the next 4 columns (s5..s8 / d5..d8).
        src1_ptr += 4;
        src2_ptr += 4;
        load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
        transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8);
        load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
        transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8);
        // Four 5-column sums starting at columns 1, 2, 3 and 4, widened to
        // 32 bits.
        q23 = vaddl_s16(s2, s3);
        q45 = vaddl_s16(s4, s5);
        q67 = vaddl_s16(s6, s7);
        q2345 = vaddq_s32(q23, q45);
        q4567 = vaddq_s32(q45, q67);
        q12345 = vaddq_s32(vmovl_s16(s1), q2345);
        q23456 = vaddq_s32(q2345, vmovl_s16(s6));
        q34567 = vaddq_s32(q4567, vmovl_s16(s3));
        q45678 = vaddq_s32(q4567, vmovl_s16(s8));

        // Transpose back to row order before storing.
        transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678);
        store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
                      q45678);
        dst1_32_ptr += 4;
        // The newest 4 columns become the oldest 4 of the next iteration.
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;

        // Same 5-column sums for the squared values.
        r23 = vaddq_s32(d2, d3);
        r45 = vaddq_s32(d4, d5);
        r67 = vaddq_s32(d6, d7);
        r2345 = vaddq_s32(r23, r45);
        r4567 = vaddq_s32(r45, r67);
        r12345 = vaddq_s32(d1, r2345);
        r23456 = vaddq_s32(r2345, d6);
        r34567 = vaddq_s32(r4567, d3);
        r45678 = vaddq_s32(r4567, d8);

        // In-place update of dst2: the columns overwritten here have already
        // been loaded into d1..d8 above.
        transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678);
        store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
        dst2_ptr += 4;
        d1 = d5;
        d2 = d6;
        d3 = d7;
        d4 = d8;
        w -= 4;
      } while (w > 0);
      h -= 8;
      count++;
    } while (h > 0);
  }
}

// Runs calc_ab_internal_common() over the buffers in tiles of 4 rows by
// 8 columns (low bit-depth variant: the 16-bit sums are passed unscaled for
// both of the helper's 16-bit inputs).
//
// A is read (32-bit values) and overwritten with the table indices, A16
// receives the looked-up values, B16 supplies the 16-bit sums and B receives
// the 32-bit results. All buffers share buf_stride.
// n = (2 * r + 1)^2; s is broadcast as the multiplier used by the helper.
// Tile rows start every 4 buffer rows while the remaining-height counter is
// reduced by ht_inc * 4 per pass.
static inline void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
                                        uint16_t *B16, int32_t *B,
                                        const int buf_stride, const int width,
                                        const int height, const int r,
                                        const int s, const int ht_inc) {
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  const uint32x4_t n_vec = vdupq_n_u32(n);
  const uint32x4_t strength_vec = vdupq_n_u32(s);
  const uint32x4_t max_index = vdupq_n_u32(255);
  const uint16x4_t recip_n = vdup_n_u16(av1_one_by_x[n - 1]);
  const uint16x8_t sgr_one = vdupq_n_u16(SGRPROJ_SGR);
  const int rows_per_pass = ht_inc * 4;

  int tile_row = 0;
  int rows_left = height;
  do {
    const int row_offset = (tile_row << 2) * buf_stride;
    int col = 0;
    do {
      int32_t *const a_tile = A + row_offset + col;
      uint16_t *const a16_tile = A16 + row_offset + col;
      uint16_t *const b16_tile = B16 + row_offset + col;
      int32_t *const b_tile = B + row_offset + col;
      uint32x4_t sq[8];
      uint16x8_t sums[4];

      // Columns 0-3 go to sq[0..3], columns 4-7 to sq[4..7].
      load_u32_4x4((uint32_t *)a_tile, buf_stride, &sq[0], &sq[1], &sq[2],
                   &sq[3]);
      load_u32_4x4((uint32_t *)a_tile + 4, buf_stride, &sq[4], &sq[5], &sq[6],
                   &sq[7]);
      load_u16_8x4(b16_tile, buf_stride, &sums[0], &sums[1], &sums[2],
                   &sums[3]);

      calc_ab_internal_common(sq[0], sq[1], sq[2], sq[3], sq[4], sq[5], sq[6],
                              sq[7], sums[0], sums[1], sums[2], sums[3],
                              sums[0], sums[1], sums[2], sums[3], n_vec,
                              strength_vec, max_index, recip_n, sgr_one,
                              a_tile, a16_tile, b_tile, buf_stride);

      col += 8;
    } while (col < width);
    tile_row++;
    rows_left -= rows_per_pass;
  } while (rows_left > 0);
}

#if CONFIG_AV1_HIGHBITDEPTH
// High bit-depth counterpart of calc_ab_internal_lbd(): runs
// calc_ab_internal_common() over the buffers in tiles of 4 rows by 8 columns.
//
// Before the helper is called, the 32-bit values read from A are
// rounding-shifted right by 2 * (bit_depth - 8) and a copy of the 16-bit sums
// read from B16 is rounding-shifted right by (bit_depth - 8); the unshifted
// 16-bit sums are passed alongside. The shifts use vrshl with a negative
// shift count.
// A is overwritten with the table indices, A16 receives the looked-up values
// and B receives the 32-bit results. All buffers share buf_stride.
// Tile rows start every 4 buffer rows while the remaining-height counter is
// reduced by ht_inc * 4 per pass.
static inline void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
                                        uint16_t *B16, int32_t *B,
                                        const int buf_stride, const int width,
                                        const int height, const int bit_depth,
                                        const int r, const int s,
                                        const int ht_inc) {
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  const uint32x4_t n_vec = vdupq_n_u32(n);
  const uint32x4_t strength_vec = vdupq_n_u32(s);
  const uint32x4_t max_index = vdupq_n_u32(255);
  const uint16x4_t recip_n = vdup_n_u16(av1_one_by_x[n - 1]);
  const uint16x8_t sgr_one = vdupq_n_u16(SGRPROJ_SGR);
  // Negative counts turn the rounding left shifts below into right shifts.
  const int16x8_t sum_shift = vdupq_n_s16(-(bit_depth - 8));
  const int32x4_t sq_shift = vdupq_n_s32(-((bit_depth - 8) << 1));
  const int rows_per_pass = ht_inc * 4;

  int tile_row = 0;
  int rows_left = height;
  do {
    const int row_offset = (tile_row << 2) * buf_stride;
    int col = 0;
    do {
      int32_t *const a_tile = A + row_offset + col;
      uint16_t *const a16_tile = A16 + row_offset + col;
      uint16_t *const b16_tile = B16 + row_offset + col;
      int32_t *const b_tile = B + row_offset + col;
      int32x4_t sq_raw[8];
      uint32x4_t sq[8];
      uint16x8_t sums_raw[4];
      uint16x8_t sums[4];

      // Columns 0-3 go to sq_raw[0..3], columns 4-7 to sq_raw[4..7].
      load_s32_4x4(a_tile, buf_stride, &sq_raw[0], &sq_raw[1], &sq_raw[2],
                   &sq_raw[3]);
      load_s32_4x4(a_tile + 4, buf_stride, &sq_raw[4], &sq_raw[5], &sq_raw[6],
                   &sq_raw[7]);
      load_u16_8x4(b16_tile, buf_stride, &sums_raw[0], &sums_raw[1],
                   &sums_raw[2], &sums_raw[3]);

      for (int k = 0; k < 8; k++) {
        sq[k] = vrshlq_u32(vreinterpretq_u32_s32(sq_raw[k]), sq_shift);
      }
      for (int k = 0; k < 4; k++) {
        sums[k] = vrshlq_u16(sums_raw[k], sum_shift);
      }

      calc_ab_internal_common(sq[0], sq[1], sq[2], sq[3], sq[4], sq[5], sq[6],
                              sq[7], sums_raw[0], sums_raw[1], sums_raw[2],
                              sums_raw[3], sums[0], sums[1], sums[2], sums[3],
                              n_vec, strength_vec, max_index, recip_n, sgr_one,
                              a_tile, a16_tile, b_tile, buf_stride);

      col += 8;
    } while (col < width);
    tile_row++;
    rows_left -= rows_per_pass;
  } while (rows_left > 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

// Runs calc_ab_fast_internal_common() over the buffers in 4x4 tiles (low
// bit-depth variant: the values loaded from A and B are passed on unscaled).
//
// A is read (32-bit values) and overwritten with the table indices, A16
// receives the looked-up values, and B supplies the 32-bit sums and is
// overwritten with the results. All buffers share buf_stride.
// n = (2 * r + 1)^2; s is broadcast as the multiplier used by the helper.
// Tile rows start every 4 buffer rows while the remaining-height counter is
// reduced by ht_inc * 4 per pass.
static inline void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
                                             int32_t *B, const int buf_stride,
                                             const int width, const int height,
                                             const int r, const int s,
                                             const int ht_inc) {
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  const uint32x4_t n_vec = vdupq_n_u32(n);
  const uint32x4_t strength_vec = vdupq_n_u32(s);
  const uint32x4_t max_index = vdupq_n_u32(255);
  const uint32x4_t recip_n = vdupq_n_u32(av1_one_by_x[n - 1]);
  const uint16x4_t sgr_one = vdup_n_u16(SGRPROJ_SGR);
  const int rows_per_pass = ht_inc * 4;

  int tile_row = 0;
  int rows_left = height;
  do {
    const int row_offset = (tile_row << 2) * buf_stride;
    int col = 0;
    do {
      int32_t *const a_tile = A + row_offset + col;
      int32_t *const b_tile = B + row_offset + col;
      uint16_t *const a16_tile = A16 + row_offset + col;
      int32x4_t a_rows[4];
      int32x4_t b_rows[4];

      load_s32_4x4(a_tile, buf_stride, &a_rows[0], &a_rows[1], &a_rows[2],
                   &a_rows[3]);
      load_s32_4x4(b_tile, buf_stride, &b_rows[0], &b_rows[1], &b_rows[2],
                   &b_rows[3]);

      // The B rows are passed twice: reinterpreted as unsigned for the
      // squaring step and as loaded for the final multiplication.
      calc_ab_fast_internal_common(
          vreinterpretq_u32_s32(a_rows[0]), vreinterpretq_u32_s32(a_rows[1]),
          vreinterpretq_u32_s32(a_rows[2]), vreinterpretq_u32_s32(a_rows[3]),
          vreinterpretq_u32_s32(b_rows[0]), vreinterpretq_u32_s32(b_rows[1]),
          vreinterpretq_u32_s32(b_rows[2]), vreinterpretq_u32_s32(b_rows[3]),
          b_rows[0], b_rows[1], b_rows[2], b_rows[3], n_vec, strength_vec,
          max_index, recip_n, sgr_one, a_tile, a16_tile, b_tile, buf_stride);

      col += 4;
    } while (col < width);
    tile_row++;
    rows_left -= rows_per_pass;
  } while (rows_left > 0);
}

#if CONFIG_AV1_HIGHBITDEPTH
// High bit-depth counterpart of calc_ab_fast_internal_lbd(): runs
// calc_ab_fast_internal_common() over the buffers in 4x4 tiles.
//
// Before the helper is called, the values read from A are rounding-shifted
// right by 2 * (bit_depth - 8) and a copy of the values read from B by
// (bit_depth - 8); the unshifted B values are passed alongside. The shifts
// use vrshl with a negative shift count.
// A is overwritten with the table indices, A16 receives the looked-up values
// and B is overwritten with the results. All buffers share buf_stride.
// Tile rows start every 4 buffer rows while the remaining-height counter is
// reduced by ht_inc * 4 per pass.
static inline void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
                                             int32_t *B, const int buf_stride,
                                             const int width, const int height,
                                             const int bit_depth, const int r,
                                             const int s, const int ht_inc) {
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  const uint32x4_t n_vec = vdupq_n_u32(n);
  const uint32x4_t strength_vec = vdupq_n_u32(s);
  const uint32x4_t max_index = vdupq_n_u32(255);
  const uint32x4_t recip_n = vdupq_n_u32(av1_one_by_x[n - 1]);
  const uint16x4_t sgr_one = vdup_n_u16(SGRPROJ_SGR);
  // Negative counts turn the rounding left shifts below into right shifts.
  const int32x4_t sum_shift = vdupq_n_s32(-(bit_depth - 8));
  const int32x4_t sq_shift = vdupq_n_s32(-((bit_depth - 8) << 1));
  const int rows_per_pass = ht_inc * 4;

  int tile_row = 0;
  int rows_left = height;
  do {
    const int row_offset = (tile_row << 2) * buf_stride;
    int col = 0;
    do {
      int32_t *const a_tile = A + row_offset + col;
      int32_t *const b_tile = B + row_offset + col;
      uint16_t *const a16_tile = A16 + row_offset + col;
      int32x4_t a_rows[4];
      int32x4_t b_rows[4];
      uint32x4_t sq[4];
      uint32x4_t sums[4];

      load_s32_4x4(a_tile, buf_stride, &a_rows[0], &a_rows[1], &a_rows[2],
                   &a_rows[3]);
      load_s32_4x4(b_tile, buf_stride, &b_rows[0], &b_rows[1], &b_rows[2],
                   &b_rows[3]);

      for (int k = 0; k < 4; k++) {
        sq[k] = vrshlq_u32(vreinterpretq_u32_s32(a_rows[k]), sq_shift);
      }
      for (int k = 0; k < 4; k++) {
        sums[k] = vrshlq_u32(vreinterpretq_u32_s32(b_rows[k]), sum_shift);
      }

      calc_ab_fast_internal_common(sq[0], sq[1], sq[2], sq[3], sums[0],
                                   sums[1], sums[2], sums[3], b_rows[0],
                                   b_rows[1], b_rows[2], b_rows[3], n_vec,
                                   strength_vec, max_index, recip_n, sgr_one,
                                   a_tile, a16_tile, b_tile, buf_stride);

      col += 4;
    } while (col < width);
    tile_row++;
    rows_left -= rows_per_pass;
  } while (rows_left > 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

// Two-pass box sum with a window of 3 rows by 3 columns.
//
// Pass 1 (vertical): for each 8-column strip of src, sums of 3 consecutive
// rows are written to dst1 (16-bit) and sums of the 3 squared rows to dst2
// (32-bit), one output row per input row, beginning at row 2 of the
// destination buffers.
// Pass 2 (horizontal): sums of 3 consecutive columns of the pass-1 results,
// computed on transposed 4x4 tiles and written back in place to dst1 and
// dst2, starting at column 2.
//
// src/src_stride: input samples and row stride.
// dst1: buffer for the box sums of the samples (accessed as int16_t in the
// loads and stores below).
// dst2: buffer for the box sums of the squared samples.
// dst_stride: row stride (in elements) shared by dst1 and dst2.
// width, height: size of the processed area; the loops advance in steps of
// 8 columns / 4 rows, so the buffers are presumably padded accordingly -
// TODO confirm at the call sites.
static inline void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
                           int32_t *dst2, const int dst_stride, const int width,
                           const int height) {
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
  assert(height > 2 * SGRPROJ_BORDER_VERT);

  int16_t *src_ptr;
  int32_t *dst2_ptr;
  uint16_t *dst1_ptr;
  int h, w, count = 0;

  w = width;
  // Pass 1: vertical sums, one 8-column strip per outer iteration.
  {
    int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
    int16x8_t q23, q34, q56, q234, q345, q456, q567;
    int32x4_t r23, r56, r345, r456, r567, r78, r678;
    int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
    int32x4_t r2, r3, r5, r6, r7, r8;
    int16x8_t q678, q78;

    do {
      // count indexes the current strip; each strip is 8 columns wide.
      dst1_ptr = dst1 + (count << 3);
      dst2_ptr = dst2 + (count << 3);
      src_ptr = src + (count << 3);
      h = height;

      // Prime the running sums with the first 4 rows. s1 is loaded but does
      // not contribute to any sum.
      load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
      src_ptr += 4 * src_stride;

      q23 = vaddq_s16(s2, s3);
      q234 = vaddq_s16(q23, s4);
      q34 = vaddq_s16(s3, s4);
      // Output starts at row 2 of dst1.
      dst1_ptr += (dst_stride << 1);

      // Squares of rows 2..4, columns 0-3 of the strip.
      r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
      r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
      r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
      r23 = vaddq_s32(r2, r3);
      r234_low = vaddq_s32(r23, r4_low);
      r34_low = vaddq_s32(r3, r4_low);

      // Squares of rows 2..4, columns 4-7 of the strip.
      r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
      r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
      r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
      r23 = vaddq_s32(r2, r3);
      r234_high = vaddq_s32(r23, r4_high);
      r34_high = vaddq_s32(r3, r4_high);

      // Output starts at row 2 of dst2.
      dst2_ptr += (dst_stride << 1);

      do {
        // Each iteration consumes 4 new rows (s5..s8) and emits 4 output
        // rows: q234 (carried over), q345, q456 and q567.
        load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
        src_ptr += 4 * src_stride;

        q345 = vaddq_s16(s5, q34);
        q56 = vaddq_s16(s5, s6);
        q456 = vaddq_s16(s4, q56);
        q567 = vaddq_s16(s7, q56);
        q78 = vaddq_s16(s7, s8);
        q678 = vaddq_s16(s6, q78);

        store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
        dst1_ptr += (dst_stride << 2);

        // Carry the last row and the trailing partial sums into the next
        // iteration, where they play the roles of row 4, q34 and q234.
        s4 = s8;
        q34 = q78;
        q234 = q678;

        // Same scheme for the squared samples, columns 0-3 of the strip.
        r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
        r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
        r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
        r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));

        r345 = vaddq_s32(r5, r34_low);
        r56 = vaddq_s32(r5, r6);
        r456 = vaddq_s32(r4_low, r56);
        r567 = vaddq_s32(r7, r56);
        r78 = vaddq_s32(r7, r8);
        r678 = vaddq_s32(r6, r78);
        store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);

        r4_low = r8;
        r34_low = r78;
        r234_low = r678;

        // Squared samples, columns 4-7 of the strip.
        r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
        r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
        r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
        r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));

        r345 = vaddq_s32(r5, r34_high);
        r56 = vaddq_s32(r5, r6);
        r456 = vaddq_s32(r4_high, r56);
        r567 = vaddq_s32(r7, r56);
        r78 = vaddq_s32(r7, r8);
        r678 = vaddq_s32(r6, r78);
        store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
        dst2_ptr += (dst_stride << 2);

        r4_high = r8;
        r34_high = r78;
        r234_high = r678;

        h -= 4;
      } while (h > 0);
      w -= 8;
      count++;
    } while (w > 0);

    // memset needed for row pixels as 2nd stage of boxsum filter uses
    // first 2 rows of dst1, dst2 buffer which is not filled in first stage.
    for (int x = 0; x < 2; x++) {
      memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
      memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
    }

    // memset needed for extra columns as 2nd stage of boxsum filter uses
    // last 2 columns of dst1, dst2 buffer which is not filled in first stage.
    for (int x = 2; x < height + 2; x++) {
      int dst_offset = x * dst_stride + width + 2;
      memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
      memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
    }
  }

  // Pass 2: horizontal sums, one group of 4 rows per outer iteration.
  {
    int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
    int16x4_t q23, q34, q56, q234, q345, q456, q567;
    int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
    int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
    int16x4_t q678, q78;

    int32_t *src2_ptr;
    uint16_t *src1_ptr;
    count = 0;
    h = height;
    w = width;
    do {
      // Source and destination pointers refer to the same buffers: the
      // horizontal pass works in place on dst1 and dst2.
      dst1_ptr = dst1 + (count << 2) * dst_stride;
      dst2_ptr = dst2 + (count << 2) * dst_stride;
      src1_ptr = dst1 + (count << 2) * dst_stride;
      src2_ptr = dst2 + (count << 2) * dst_stride;
      w = width;

      // Prime the running sums with the first 4 columns. After the transpose
      // d1..d4 / r1..r4 each hold one column of the 4 processed rows; d1 and
      // r1 do not contribute to any sum.
      load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
      transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4);
      load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
      transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4);
      src1_ptr += 4;
      src2_ptr += 4;

      q23 = vadd_s16(d2, d3);
      q234 = vadd_s16(q23, d4);
      q34 = vadd_s16(d3, d4);
      // Output is written starting at column 2.
      dst1_ptr += 2;
      r23 = vaddq_s32(r2, r3);
      r234 = vaddq_s32(r23, r4);
      r34 = vaddq_s32(r3, r4);
      dst2_ptr += 2;

      do {
        // Each iteration consumes 4 new columns (d5..d8 / r5..r8) and emits
        // 4 output columns.
        load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
        transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8);
        load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
        transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8);
        src1_ptr += 4;
        src2_ptr += 4;

        q345 = vadd_s16(d5, q34);
        q56 = vadd_s16(d5, d6);
        q456 = vadd_s16(d4, q56);
        q567 = vadd_s16(d7, q56);
        q78 = vadd_s16(d7, d8);
        q678 = vadd_s16(d6, q78);
        // Transpose back to row order before storing. The columns overwritten
        // here have already been loaded into registers above.
        transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567);
        store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
        dst1_ptr += 4;

        // Carry the last column and the trailing partial sums into the next
        // iteration.
        d4 = d8;
        q34 = q78;
        q234 = q678;

        // Same scheme for the 32-bit sums of squares.
        r345 = vaddq_s32(r5, r34);
        r56 = vaddq_s32(r5, r6);
        r456 = vaddq_s32(r4, r56);
        r567 = vaddq_s32(r7, r56);
        r78 = vaddq_s32(r7, r8);
        r678 = vaddq_s32(r6, r78);
        transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567);
        store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
        dst2_ptr += 4;

        r4 = r8;
        r34 = r78;
        r234 = r678;
        w -= 4;
      } while (w > 0);
      h -= 4;
      count++;
    } while (h > 0);
  }
}

// Weighted 3x3 neighbourhood sum over a 32-bit buffer, evaluated for four
// horizontally adjacent positions starting at buf. The centre element and its
// left/right/top/bottom neighbours are weighted by 4, the four diagonal
// neighbours by 3. Rows are buf_stride elements apart; elements buf[-1] to
// buf[4] of the row above, the current row and the row below are read.
static inline int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
  int32_t *const above = buf - buf_stride;
  int32_t *const below = buf + buf_stride;

  // Diagonal neighbours.
  const int32x4_t upper_corners =
      vaddq_s32(vld1q_s32(above - 1), vld1q_s32(above + 1));
  const int32x4_t lower_corners =
      vaddq_s32(vld1q_s32(below - 1), vld1q_s32(below + 1));
  const int32x4_t corners = vaddq_s32(upper_corners, lower_corners);

  // Centre plus the four edge-adjacent neighbours.
  const int32x4_t vertical = vaddq_s32(vld1q_s32(above), vld1q_s32(below));
  const int32x4_t horizontal =
      vaddq_s32(vld1q_s32(buf - 1), vld1q_s32(buf + 1));
  const int32x4_t cross =
      vaddq_s32(vld1q_s32(buf), vaddq_s32(vertical, horizontal));

  // 4 * (cross + corners) - corners == 4 * cross + 3 * corners.
  return vsubq_s32(vshlq_n_s32(vaddq_s32(cross, corners), 2), corners);
}

// Weighted 3x3 neighbourhood sum over a 16-bit buffer, evaluated for eight
// horizontally adjacent positions starting at buf. The centre element and its
// left/right/top/bottom neighbours are weighted by 4, the four diagonal
// neighbours by 3. The weighted partial sums are formed in 16-bit lanes and
// only then widened to 32 bits. *a0 receives positions 0..3 and *a1 positions
// 4..7. Rows are buf_stride elements apart.
static inline void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
                                     int32x4_t *a0, int32x4_t *a1) {
  uint16_t *const above = buf - buf_stride;
  uint16_t *const below = buf + buf_stride;

  // Centre plus the four edge-adjacent neighbours, scaled by 4.
  uint16x8_t cross = vaddq_u16(vld1q_u16(below), vld1q_u16(buf));
  cross = vaddq_u16(cross, vld1q_u16(buf - 1));
  cross = vaddq_u16(cross, vaddq_u16(vld1q_u16(above), vld1q_u16(buf + 1)));
  const uint16x8_t cross_x4 = vshlq_n_u16(cross, 2);

  // Diagonal neighbours, scaled by 3 (4 * corners - corners).
  const uint16x8_t lower_corners =
      vaddq_u16(vld1q_u16(below - 1), vld1q_u16(below + 1));
  const uint16x8_t upper_corners =
      vaddq_u16(vld1q_u16(above - 1), vld1q_u16(above + 1));
  const uint16x8_t corners = vaddq_u16(upper_corners, lower_corners);
  const uint16x8_t corners_x3 = vsubq_u16(vshlq_n_u16(corners, 2), corners);

  // Widen each half to 32 bits and combine.
  const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(cross_x4)),
                                      vmovl_u16(vget_low_u16(corners_x3)));
  const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(cross_x4)),
                                      vmovl_u16(vget_high_u16(corners_x3)));
  *a0 = vreinterpretq_s32_u32(sum_lo);
  *a1 = vreinterpretq_s32_u32(sum_hi);
}

// Weighted sum used on even rows of the fast filter, evaluated for four
// horizontally adjacent positions starting at buf in a 32-bit buffer. Only
// the rows directly above and below are read: the elements straight above and
// below are weighted by 6, the four diagonal neighbours by 5.
static inline int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
  int32_t *const above = buf - buf_stride;
  int32_t *const below = buf + buf_stride;

  // Weight-5 taps: the four diagonals.
  const int32x4_t upper_diag =
      vaddq_s32(vld1q_s32(above - 1), vld1q_s32(above + 1));
  const int32x4_t lower_diag =
      vaddq_s32(vld1q_s32(below - 1), vld1q_s32(below + 1));
  const int32x4_t diag = vaddq_s32(upper_diag, lower_diag);

  // Weight-6 taps: straight above and straight below.
  const int32x4_t straight = vaddq_s32(vld1q_s32(above), vld1q_s32(below));

  // 5 * (diag + straight) + straight == 5 * diag + 6 * straight.
  const int32x4_t all = vaddq_s32(diag, straight);
  const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all, 2), all);
  return vaddq_s32(all_x5, straight);
}

// 16-bit-input variant of the even-row weighted sum, evaluated for eight
// horizontally adjacent positions starting at buf. Elements straight above and
// below are weighted by 6, the four diagonal neighbours by 5. The weighted
// partial sums are formed in 16-bit lanes and then widened to 32 bits. *a0
// receives positions 0..3 and *a1 positions 4..7.
static inline void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
                                                 int32x4_t *a0, int32x4_t *a1) {
  uint16_t *const above = buf - buf_stride;
  uint16_t *const below = buf + buf_stride;

  // Diagonals, scaled by 5 (4 * d + d).
  const uint16x8_t lower_diag =
      vaddq_u16(vld1q_u16(below + 1), vld1q_u16(below - 1));
  const uint16x8_t upper_diag =
      vaddq_u16(vld1q_u16(above + 1), vld1q_u16(above - 1));
  const uint16x8_t diag = vaddq_u16(lower_diag, upper_diag);
  const uint16x8_t diag_x5 = vaddq_u16(vshlq_n_u16(diag, 2), diag);

  // Straight above/below, scaled by 6 (4 * s + 2 * s).
  const uint16x8_t straight = vaddq_u16(vld1q_u16(below), vld1q_u16(above));
  const uint16x8_t straight_x6 =
      vaddq_u16(vshlq_n_u16(straight, 2), vshlq_n_u16(straight, 1));

  // Widen each half to 32 bits and combine.
  const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(diag_x5)),
                                      vmovl_u16(vget_low_u16(straight_x6)));
  const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(diag_x5)),
                                      vmovl_u16(vget_high_u16(straight_x6)));
  *a0 = vreinterpretq_s32_u32(sum_lo);
  *a1 = vreinterpretq_s32_u32(sum_hi);
}

// Weighted sum used on odd rows of the fast filter, evaluated for four
// horizontally adjacent positions starting at buf in a 32-bit buffer. Only the
// current row is read: the centre element is weighted by 6 and its left and
// right neighbours by 5.
static inline int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
  const int32x4_t centre = vld1q_s32(buf);
  const int32x4_t sides = vaddq_s32(vld1q_s32(buf - 1), vld1q_s32(buf + 1));

  // 5 * (sides + centre) + centre == 5 * sides + 6 * centre.
  const int32x4_t all = vaddq_s32(sides, centre);
  const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all, 2), all);
  return vaddq_s32(all_x5, centre);
}

// 16-bit-input variant of the odd-row weighted sum, evaluated for eight
// horizontally adjacent positions starting at buf. The centre element is
// weighted by 6 and its left and right neighbours by 5. The weighted partial
// sums are formed in 16-bit lanes and then widened to 32 bits. *a0 receives
// positions 0..3 and *a1 positions 4..7.
static inline void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
                                                int32x4_t *a1) {
  // Left + right neighbours, scaled by 5 (s + 4 * s).
  const uint16x8_t sides = vaddq_u16(vld1q_u16(buf - 1), vld1q_u16(buf + 1));
  const uint16x8_t sides_x5 = vaddq_u16(sides, vshlq_n_u16(sides, 2));

  // Centre, scaled by 6 (4 * c + 2 * c).
  const uint16x8_t centre = vld1q_u16(buf);
  const uint16x8_t centre_x6 =
      vaddq_u16(vshlq_n_u16(centre, 2), vshlq_n_u16(centre, 1));

  // Widen each half to 32 bits and combine.
  const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(sides_x5)),
                                      vmovl_u16(vget_low_u16(centre_x6)));
  const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(sides_x5)),
                                      vmovl_u16(vget_high_u16(centre_x6)));
  *a0 = vreinterpretq_s32_u32(sum_lo);
  *a1 = vreinterpretq_s32_u32(sum_hi);
}

// Final stage of the fast filter. For every output pixel it computes
//   dst = round_shift(weighted_sum(A) * src + weighted_sum(B), shift)
// eight pixels per step. Even rows (row index 0, 2, ...) take their weighted
// sums from the rows above and below and shift by
// SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS; odd rows take them from the
// current row only and shift by SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS.
//
// A and B share buf_stride; src and dst have their own strides. At least one
// row and one 8-pixel group are always processed, and every row is written in
// whole groups of 8, so a width that is not a multiple of 8 is rounded up.
static void final_filter_fast_internal(uint16_t *A, int32_t *B,
                                       const int buf_stride, int16_t *src,
                                       const int src_stride, int32_t *dst,
                                       const int dst_stride, const int width,
                                       const int height) {
  assert(SGRPROJ_SGR_BITS == 8);
  assert(SGRPROJ_RST_BITS == 4);

  int row = 0;
  do {
    uint16_t *const a_row = A + row * buf_stride;
    int32_t *const b_row = B + row * buf_stride;
    int16_t *const src_row = src + row * src_stride;
    int32_t *const dst_row = dst + row * dst_stride;
    int x = 0;

    if ((row & 1) == 0) {
      // Even row: taps come from the rows above and below.
      do {
        const int16x8_t pix = vld1q_s16(src_row + x);
        int32x4_t lo, hi;
        cross_sum_fast_even_row_inp16(a_row + x, buf_stride, &lo, &hi);
        lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), lo);
        hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), hi);

        lo = vaddq_s32(lo, cross_sum_fast_even_row(b_row + x, buf_stride));
        hi = vaddq_s32(hi, cross_sum_fast_even_row(b_row + x + 4, buf_stride));

        lo = vrshrq_n_s32(lo, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
        hi = vrshrq_n_s32(hi, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);

        vst1q_s32(dst_row + x, lo);
        vst1q_s32(dst_row + x + 4, hi);
        x += 8;
      } while (x < width);
    } else {
      // Odd row: taps come from the current row only.
      do {
        const int16x8_t pix = vld1q_s16(src_row + x);
        int32x4_t lo, hi;
        cross_sum_fast_odd_row_inp16(a_row + x, &lo, &hi);
        lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), lo);
        hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), hi);

        lo = vaddq_s32(lo, cross_sum_fast_odd_row(b_row + x));
        hi = vaddq_s32(hi, cross_sum_fast_odd_row(b_row + x + 4));

        lo = vrshrq_n_s32(lo, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
        hi = vrshrq_n_s32(hi, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);

        vst1q_s32(dst_row + x, lo);
        vst1q_s32(dst_row + x + 4, hi);
        x += 8;
      } while (x < width);
    }
    row++;
  } while (row < height);
}

// Final stage of the non-fast filter. For every output pixel it computes
//   dst = round_shift(weighted_sum(A) * src + weighted_sum(B),
//                     SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS)
// eight pixels per step, where the weighted sums are the 3x3 sums produced by
// cross_sum_inp_u16 (on A) and cross_sum_inp_s32 (on B).
//
// A and B share buf_stride; src and dst have their own strides. At least one
// row and one 8-pixel group are always processed, and every row is written in
// whole groups of 8, so a width that is not a multiple of 8 is rounded up.
static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
                                  int16_t *src, const int src_stride,
                                  int32_t *dst, const int dst_stride,
                                  const int width, const int height) {
  assert(SGRPROJ_SGR_BITS == 8);
  assert(SGRPROJ_RST_BITS == 4);

  int row = 0;
  do {
    uint16_t *const a_row = A + row * buf_stride;
    int32_t *const b_row = B + row * buf_stride;
    int16_t *const src_row = src + row * src_stride;
    int32_t *const dst_row = dst + row * dst_stride;
    int x = 0;

    do {
      const int16x8_t pix = vld1q_s16(src_row + x);
      int32x4_t lo, hi;
      cross_sum_inp_u16(a_row + x, buf_stride, &lo, &hi);
      lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), lo);
      hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), hi);

      lo = vaddq_s32(lo, cross_sum_inp_s32(b_row + x, buf_stride));
      hi = vaddq_s32(hi, cross_sum_inp_s32(b_row + x + 4, buf_stride));

      lo = vrshrq_n_s32(lo, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
      hi = vrshrq_n_s32(hi, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);

      vst1q_s32(dst_row + x, lo);
      vst1q_s32(dst_row + x + 4, hi);
      x += 8;
    } while (x < width);
    row++;
  } while (row < height);
}

// Runs the fast self-guided filter pass (radius_idx must be 0 and the selected
// radius must be 2; both are asserted) on the bordered 16-bit image dgd16 and
// writes the 32-bit filter output to dst.
//
// dgd16 points at the first interior pixel; SGRPROJ_BORDER_VERT rows above it
// and SGRPROJ_BORDER_HORZ columns to its left are read as border. A scratch
// buffer of 3 * RESTORATION_PROC_UNIT_PELS int32 entries is allocated for the
// duration of the call and split into square_sum_buf (int32), sum_buf (int32)
// and tmp16_buf (uint16).
//
// Returns 0 on success and -1 if the scratch buffer cannot be allocated.
static inline int restoration_fast_internal(uint16_t *dgd16, int width,
                                            int height, int dgd_stride,
                                            int32_t *dst, int dst_stride,
                                            int bit_depth, int sgr_params_idx,
                                            int radius_idx) {
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  const int buf_stride = ((width_ext + 3) & ~3) + 16;

  const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS;
  int32_t *buf = aom_memalign(8, buf_size);
  if (!buf) return -1;

  int32_t *square_sum_buf = buf;
  int32_t *sum_buf = square_sum_buf + RESTORATION_PROC_UNIT_PELS;
  uint16_t *tmp16_buf = (uint16_t *)(sum_buf + RESTORATION_PROC_UNIT_PELS);
  // tmp16_buf is the last sub-buffer, so its end (not its start) is what has
  // to lie inside the allocation.
  assert((char *)(tmp16_buf + RESTORATION_PROC_UNIT_PELS) <=
             (char *)buf + buf_size &&
         "Allocated buffer is too small. Resize the buffer.");

  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
         "Need SGRPROJ_BORDER_* >= r+1");

  assert(radius_idx == 0);
  assert(r == 2);

  // input(dgd16) is 16bit.
  // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is
  // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit
  // buffer(square_sum_buf).
  boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
                      SGRPROJ_BORDER_HORZ),
          dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride,
          width_ext, height_ext);

  // Move all three buffer pointers from the top-left of the bordered area to
  // the first interior element.
  square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;

  // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
  // [1, 256] for all bit depths. b output is kept in 32bit buffer.
  // The computation starts one row above and one column left of the interior
  // and is handed a stride of buf_stride * 2 (presumably so that only every
  // second row is evaluated -- confirm against calc_ab_fast_internal_*).

#if CONFIG_AV1_HIGHBITDEPTH
  if (bit_depth > 8) {
    calc_ab_fast_internal_hbd(
        (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
        (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
        bit_depth, r, params->s[radius_idx], 2);
  } else {
    calc_ab_fast_internal_lbd(
        (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
        (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
        params->s[radius_idx], 2);
  }
#else
  (void)bit_depth;
  calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1),
                            (tmp16_buf - buf_stride - 1),
                            (sum_buf - buf_stride - 1), buf_stride * 2,
                            width + 2, height + 2, r, params->s[radius_idx], 2);
#endif
  final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
                             dgd_stride, dst, dst_stride, width, height);
  aom_free(buf);
  return 0;
}

// Runs the non-fast self-guided filter pass (radius_idx must be 1 and the
// selected radius must be 1; both are asserted) on the bordered 16-bit image
// dgd16 and writes the 32-bit filter output to dst.
//
// dgd16 points at the first interior pixel; SGRPROJ_BORDER_VERT rows above it
// and SGRPROJ_BORDER_HORZ columns to its left are read as border. A scratch
// buffer of 3 * RESTORATION_PROC_UNIT_PELS int32 entries is allocated for the
// duration of the call and split into the sum-of-squares buffer (int32), the
// b buffer (int32), the a buffer (uint16) and the pixel-sum buffer (uint16).
//
// Returns 0 on success and -1 if the scratch buffer cannot be allocated.
static inline int restoration_internal(uint16_t *dgd16, int width, int height,
                                       int dgd_stride, int32_t *dst,
                                       int dst_stride, int bit_depth,
                                       int sgr_params_idx, int radius_idx) {
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int width_ext = 2 * SGRPROJ_BORDER_HORZ + width;
  const int height_ext = 2 * SGRPROJ_BORDER_VERT + height;
  const int buf_stride = 16 + ((width_ext + 3) & ~3);
  // Offset from the top-left of a bordered scratch plane to its first
  // interior element.
  const int interior = SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;

  const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS;
  int32_t *buf = aom_memalign(8, buf_size);
  if (!buf) return -1;

  // Carve the allocation into its four planes.
  int32_t *square_sum_buf = buf;
  int32_t *b_buf = buf + RESTORATION_PROC_UNIT_PELS;
  uint16_t *a_buf = (uint16_t *)(b_buf + RESTORATION_PROC_UNIT_PELS);
  uint16_t *sum_buf = a_buf + RESTORATION_PROC_UNIT_PELS;

  assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <=
             (char *)buf + buf_size &&
         "Allocated buffer is too small. Resize the buffer.");

  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
         "Need SGRPROJ_BORDER_* >= r+1");

  assert(radius_idx == 1);
  assert(r == 1);

  // The input (dgd16) is 16-bit. Pixel sums go to the 16-bit sum_buf, sums of
  // squares to the 32-bit square_sum_buf. The box sums cover the whole
  // bordered area, starting at its top-left corner.
  int16_t *const src_top_left = (int16_t *)(dgd16 -
                                            dgd_stride * SGRPROJ_BORDER_VERT -
                                            SGRPROJ_BORDER_HORZ);
  boxsum1(src_top_left, dgd_stride, sum_buf, square_sum_buf, buf_stride,
          width_ext, height_ext);

  // Step every plane to its first interior element.
  square_sum_buf += interior;
  b_buf += interior;
  a_buf += interior;
  sum_buf += interior;

  // a and b are computed starting one row above and one column left of the
  // interior, over a (width + 2) x (height + 2) area.
  int32_t *const sq_start = square_sum_buf - buf_stride - 1;
  uint16_t *const a_start = a_buf - buf_stride - 1;
  uint16_t *const sum_start = sum_buf - buf_stride - 1;
  int32_t *const b_start = b_buf - buf_stride - 1;

#if CONFIG_AV1_HIGHBITDEPTH
  // The a output is 16-bit and in the range [1, 256] for all bit depths; the
  // b output is kept in a 32-bit buffer.
  if (bit_depth > 8) {
    calc_ab_internal_hbd(sq_start, a_start, sum_start, b_start, buf_stride,
                         width + 2, height + 2, bit_depth, r,
                         params->s[radius_idx], 1);
  } else {
    calc_ab_internal_lbd(sq_start, a_start, sum_start, b_start, buf_stride,
                         width + 2, height + 2, r, params->s[radius_idx], 1);
  }
#else
  (void)bit_depth;
  calc_ab_internal_lbd(sq_start, a_start, sum_start, b_start, buf_stride,
                       width + 2, height + 2, r, params->s[radius_idx], 1);
#endif
  final_filter_internal(a_buf, b_buf, buf_stride, (int16_t *)dgd16, dgd_stride,
                        dst, dst_stride, width, height);
  aom_free(buf);
  return 0;
}

// Copies a width x height block of 8-bit pixels into a 16-bit buffer, widening
// each pixel, and then zeroes the first (width + 2) entries of the five rows
// that follow the block in dst.
//
// Rows are handled in bands of four (at least one band is always processed).
// Within a band, columns are converted eight at a time with NEON while more
// than seven columns remain (a band that is exactly seven columns wide still
// takes one 8-column vector step); leftover columns and leftover rows are
// copied with scalar code.
static inline void src_convert_u8_to_u16(const uint8_t *src,
                                         const int src_stride, uint16_t *dst,
                                         const int dst_stride, const int width,
                                         const int height) {
  const uint8_t *src_band = src;
  uint16_t *dst_band = dst;
  int rows_left = height;

  do {
    const uint8_t *s = src_band;
    uint16_t *d = dst_band;
    int cols_left = width;

    if (cols_left >= 7) {
      do {
        uint8x8_t b0, b1, b2, b3;
        load_u8_8x4(s, src_stride, &b0, &b1, &b2, &b3);
        const uint16x8_t w0 = vmovl_u8(b0);
        const uint16x8_t w1 = vmovl_u8(b1);
        const uint16x8_t w2 = vmovl_u8(b2);
        const uint16x8_t w3 = vmovl_u8(b3);
        store_u16_8x4(d, dst_stride, w0, w1, w2, w3);

        s += 8;
        d += 8;
        cols_left -= 8;
      } while (cols_left > 7);
    }

    // Scalar tail for the remaining columns of this 4-row band.
    for (int col = 0; col < cols_left; col++) {
      for (int k = 0; k < 4; k++) {
        d[col + k * dst_stride] = s[col + k * src_stride];
      }
    }

    src_band += 4 * src_stride;
    dst_band += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 3);

  // Scalar copy of the rows that did not fill a whole band.
  for (int row = 0; row < rows_left; row++) {
    for (int col = 0; col < width; col++) {
      dst_band[col + row * dst_stride] = src_band[col + row * src_stride];
    }
  }

  // Zero the rows just below the block: they would otherwise be
  // uninitialized, and the boxsum filter calculation needs them.
  for (int row = height; row < height + 5; row++) {
    memset(dst + row * dst_stride, 0, (width + 2) * sizeof(*dst));
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
// Copies a width x height block of 16-bit pixels from src to dst and then
// zeroes the first (width + 2) entries of the five rows that follow the block
// in dst.
//
// Rows are handled in bands of four (at least one band is always processed).
// Within a band, columns are copied eight at a time with NEON; the vector step
// always runs at least once and repeats while more than seven columns remain.
// Leftover columns are copied with scalar code and leftover rows with memcpy.
static inline void src_convert_hbd_copy(const uint16_t *src, int src_stride,
                                        uint16_t *dst, const int dst_stride,
                                        int width, int height) {
  const uint16_t *src_band = src;
  uint16_t *dst_band = dst;
  int rows_left = height;

  do {
    const uint16_t *s = src_band;
    uint16_t *d = dst_band;
    int cols_left = width;

    do {
      uint16x8_t v0, v1, v2, v3;
      load_u16_8x4(s, src_stride, &v0, &v1, &v2, &v3);
      store_u16_8x4(d, dst_stride, v0, v1, v2, v3);
      s += 8;
      d += 8;
      cols_left -= 8;
    } while (cols_left > 7);

    // Scalar tail for the remaining columns of this 4-row band.
    for (int col = 0; col < cols_left; col++) {
      for (int k = 0; k < 4; k++) {
        d[col + k * dst_stride] = s[col + k * src_stride];
      }
    }

    src_band += 4 * src_stride;
    dst_band += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 3);

  // Rows that did not fill a whole band are copied one at a time.
  for (int row = 0; row < rows_left; row++) {
    memcpy(dst_band + row * dst_stride, src_band + row * src_stride,
           sizeof(uint16_t) * width);
  }

  // Zero the rows just below the block: they would otherwise be
  // uninitialized, and the boxsum filter calculation needs them.
  for (int row = height; row < height + 5; row++) {
    memset(dst + row * dst_stride, 0, (width + 2) * sizeof(*dst));
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

// Computes the self-guided filter outputs for a width x height block.
//
// The input block, together with a border of SGRPROJ_BORDER_VERT rows and
// SGRPROJ_BORDER_HORZ columns on every side, is first copied into a 16-bit
// buffer on the stack (widening from 8 bits unless highbd is set and
// high-bitdepth support is compiled in). The fast pass then writes flt0 if
// params->r[0] > 0, and the non-fast pass writes flt1 if params->r[1] > 0.
// Both outputs use flt_stride.
//
// Returns 0 on success, or the non-zero value reported by a pass that failed.
int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
                                    int stride, int32_t *flt0, int32_t *flt1,
                                    int flt_stride, int sgr_params_idx,
                                    int bit_depth, int highbd) {
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  assert(!(params->r[0] == 0 && params->r[1] == 0));

  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  const int dgd16_stride = width_ext;
  // Distance from the top-left border pixel of the source to its first
  // interior pixel.
  const int src_border = SGRPROJ_BORDER_VERT * stride + SGRPROJ_BORDER_HORZ;

  // dgd16_ holds the bordered copy; dgd16 points at its first interior pixel.
  uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
  uint16_t *const dgd16 =
      dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;

#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(dat8);
    src_convert_hbd_copy(src16 - src_border, stride, dgd16_, dgd16_stride,
                         width_ext, height_ext);
  } else {
    src_convert_u8_to_u16(dat8 - src_border, stride, dgd16_, dgd16_stride,
                          width_ext, height_ext);
  }
#else
  (void)highbd;
  src_convert_u8_to_u16(dat8 - src_border, stride, dgd16_, dgd16_stride,
                        width_ext, height_ext);
#endif

  int ret = 0;
  if (params->r[0] > 0) {
    ret = restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
                                    flt_stride, bit_depth, sgr_params_idx, 0);
  }
  if (ret == 0 && params->r[1] > 0) {
    ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1,
                               flt_stride, bit_depth, sgr_params_idx, 1);
  }
  return ret;
}

// Applies the self-guided restoration filter to a width x height block and
// writes the restored pixels to dst8.
//
// Steps:
//   1. Copy the input block plus its border into a 16-bit stack buffer.
//   2. Run the fast pass into flt0 (if params->r[0] > 0) and the non-fast pass
//      into flt1 (if params->r[1] > 0); both use a row stride of `width`.
//   3. Obtain the two projection weights xq[] from xqd via av1_decode_xq().
//   4. Blend source and filter outputs, round, clamp and store.
//
// tmpbuf provides the storage for flt0 and flt1; flt1 starts
// RESTORATION_UNITPELS_MAX entries after flt0. eps selects the entry of
// av1_sgr_params. When high-bitdepth support is compiled in and highbd is set,
// dat8 and dst8 are converted with CONVERT_TO_SHORTPTR and treated as 16-bit
// buffers.
//
// Returns 0 on success, or the non-zero value reported by a pass that failed.
int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
                                          int height, int stride, int eps,
                                          const int *xqd, uint8_t *dst8,
                                          int dst_stride, int32_t *tmpbuf,
                                          int bit_depth, int highbd) {
  int32_t *flt0 = tmpbuf;
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  assert(width * height <= RESTORATION_UNITPELS_MAX);
  // dgd16_ holds the bordered 16-bit copy of the input; dgd16 points at its
  // first interior pixel.
  uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
  const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
  uint16_t *dgd16 =
      dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  const int dgd_stride = stride;
  const sgr_params_type *const params = &av1_sgr_params[eps];
  int xq[2];

  assert(!(params->r[0] == 0 && params->r[1] == 0));

  // Both source and destination pointers are moved back to the top-left
  // corner of the bordered area before the copy.
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
    src_convert_hbd_copy(
        dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
        dgd_stride,
        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
        dgd16_stride, width_ext, height_ext);
  } else {
    src_convert_u8_to_u16(
        dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
        dgd_stride,
        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
        dgd16_stride, width_ext, height_ext);
  }
#else
  (void)highbd;
  src_convert_u8_to_u16(
      dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
      dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
      dgd16_stride, width_ext, height_ext);
#endif
  // Filter passes; a pass is skipped when its radius is 0. The filter outputs
  // are stored with a row stride of `width`.
  if (params->r[0] > 0) {
    int ret = restoration_fast_internal(dgd16, width, height, dgd16_stride,
                                        flt0, width, bit_depth, eps, 0);
    if (ret != 0) return ret;
  }
  if (params->r[1] > 0) {
    int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1,
                                   width, bit_depth, eps, 1);
    if (ret != 0) return ret;
  }

  av1_decode_xq(xqd, xq, params);

  {
    int16_t *src_ptr;
    uint8_t *dst_ptr;
#if CONFIG_AV1_HIGHBITDEPTH
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    uint16_t *dst16_ptr;
#endif
    int16x4_t d0, d4;
    int16x8_t r0, s0;
    uint16x8_t r4;
    int32x4_t u0, u4, v0, v4, f00, f10;
    uint8x8_t t0;
    int count = 0, w = width, h = height, rc = 0;

    const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
    const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
    const int16x8_t zero = vdupq_n_s16(0);
    // Largest pixel value for the given bit depth; only used on the
    // high-bitdepth store path.
    const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
    src_ptr = (int16_t *)dgd16;
    // One iteration per output row (rc is the row index).
    do {
      w = width;
      count = 0;
      dst_ptr = dst8 + rc * dst_stride;
#if CONFIG_AV1_HIGHBITDEPTH
      dst16_ptr = dst16 + rc * dst_stride;
#endif
      // Eight pixels per iteration. The loop runs while w > 0, so a width
      // that is not a multiple of 8 is processed (and stored) up to the next
      // multiple of 8.
      do {
        s0 = vld1q_s16(src_ptr + count);

        // u = src << SGRPROJ_RST_BITS, widened to 32 bits.
        u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
        u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);

        // v starts as u << SGRPROJ_PRJ_BITS and accumulates the weighted
        // differences between each filter output and u.
        v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
        v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);

        if (params->r[0] > 0) {
          f00 = vld1q_s32(flt0 + count);
          f10 = vld1q_s32(flt0 + count + 4);

          f00 = vsubq_s32(f00, u0);
          f10 = vsubq_s32(f10, u4);

          // v += xq[0] * (flt0 - u)
          v0 = vmlaq_s32(v0, xq0_vec, f00);
          v4 = vmlaq_s32(v4, xq0_vec, f10);
        }

        if (params->r[1] > 0) {
          f00 = vld1q_s32(flt1 + count);
          f10 = vld1q_s32(flt1 + count + 4);

          f00 = vsubq_s32(f00, u0);
          f10 = vsubq_s32(f10, u4);

          // v += xq[1] * (flt1 - u)
          v0 = vmlaq_s32(v0, xq1_vec, f00);
          v4 = vmlaq_s32(v4, xq1_vec, f10);
        }

        // Rounding right shift back to pixel precision, saturating to 16 bits.
        d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
        d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);

        r0 = vcombine_s16(d0, d4);

        // Clamp negative results to 0 before treating the lanes as unsigned.
        r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));

#if CONFIG_AV1_HIGHBITDEPTH
        if (highbd) {
          // 16-bit output: clamp to (1 << bit_depth) - 1.
          r4 = vminq_u16(r4, max);
          vst1q_u16(dst16_ptr, r4);
          dst16_ptr += 8;
        } else {
          // 8-bit output: saturating narrow.
          t0 = vqmovn_u16(r4);
          vst1_u8(dst_ptr, t0);
          dst_ptr += 8;
        }
#else
        (void)max;
        // 8-bit output: saturating narrow.
        t0 = vqmovn_u16(r4);
        vst1_u8(dst_ptr, t0);
        dst_ptr += 8;
#endif
        w -= 8;
        count += 8;
      } while (w > 0);

      // Advance to the next row: the source copy uses dgd16_stride, the filter
      // outputs use `width`.
      src_ptr += dgd16_stride;
      flt1 += width;
      flt0 += width;
      rc++;
      h--;
    } while (h > 0);
  }
  return 0;
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.20 Sekunden (vorverarbeitet am 2026-04-28) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.