Quelle row_sve.cc

Sprache: C

/*
*  Copyright 2024 The LibYuv Project Authors. All rights reserved.
*
*  Use of this source code is governed by a BSD-style license
*  that can be found in the LICENSE file in the root of the source
*  tree. An additional intellectual property rights grant can be found
*  in the file PATENTS. All contributing project authors may
*  be found in the AUTHORS file in the root of the source tree.
*/

#include "libyuv/row_sve.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)

#define RGBTOARGB8_SVE_2X                                 \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
  "uqshrnb     z16.b, z16.h, #6     \n" /* B0 */          \
  "uqshrnb     z17.b, z17.h, #6     \n" /* G0 */          \
  "uqshrnb     z18.b, z18.h, #6     \n" /* R0 */          \
  "uqshrnt     z16.b, z20.h, #6     \n" /* B1 */          \
  "uqshrnt     z17.b, z21.h, #6     \n" /* G1 */          \
  "uqshrnt     z18.b, z22.h, #6     \n" /* R1 */

#define RGBTOARGB8_SVE_TOP_2X                        \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h */       \
  "uqshl     z16.h, p0/m, z16.h, #2     \n" /* B0 */ \
  "uqshl     z17.h, p0/m, z17.h, #2     \n" /* G0 */ \
  "uqshl     z18.h, p0/m, z18.h, #2     \n" /* R0 */ \
  "uqshl     z20.h, p0/m, z20.h, #2     \n" /* B1 */ \
  "uqshl     z21.h, p0/m, z21.h, #2     \n" /* G1 */ \
  "uqshl     z22.h, p0/m, z22.h, #2     \n" /* R1 */

void I444ToARGBRow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  uint64_t vl;
  asm volatile(
      "cnth     %[vl]                                   \n"
      "ptrue    p0.b                                    \n"  //
      YUVTORGB_SVE_SETUP
      "dup      z19.b, #255                             \n"  // Alpha
      "subs     %w[width], %w[width], %w[vl]            \n"
      "b.lt     2f                                      \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "1:                                               \n"  //
      READYUV444_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge     1b                                      \n"

      "2:                                               \n"
      "adds     %w[width], %w[width], %w[vl]            \n"
      "b.eq     99f                                     \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt  p1.h, wzr, %w[width]                    \n"  //
      READYUV444_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"

      "99:                                              \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_SVE_REGS);
}

void I400ToARGBRow_SVE2(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I400ToARGBRow_SVE_SC(src_y, dst_argb, yuvconstants, width);
}

void I422ToARGBRow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I422ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}

void I422ToRGB24Row_SVE2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  I422ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}

void I422ToRGB565Row_SVE2(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  I422ToRGB565Row_SVE_SC(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
}

void I422ToARGB1555Row_SVE2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  I422ToARGB1555Row_SVE_SC(src_y, src_u, src_v, dst_argb1555, yuvconstants,
                           width);
}

void I422ToARGB4444Row_SVE2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  I422ToARGB4444Row_SVE_SC(src_y, src_u, src_v, dst_argb4444, yuvconstants,
                           width);
}

void I422ToRGBARow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}

void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  I422AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
                            width);
}

void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  I444AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
                            width);
}

void NV12ToARGBRow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  NV12ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}

void NV21ToARGBRow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  NV21ToARGBRow_SVE_SC(src_y, src_vu, dst_argb, yuvconstants, width);
}

void NV12ToRGB24Row_SVE2(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  NV12ToRGB24Row_SVE_SC(src_y, src_uv, dst_rgb24, yuvconstants, width);
}

void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}

// Dot-product constants are stored as four-tuples with the two innermost
// elements flipped to account for the interleaving nature of the widening
// addition instructions.

static const int16_t kARGBToUVCoefficients[] = {
    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
    56, -19, -37, 0, -9, 56, -47, 0,
};

static const int16_t kRGBAToUVCoefficients[] = {
    // 0, -UG, UB, -UR, 0, -VG, -VB, VR
    0, -37, 56, -19, 0, -47, -9, 56,
};

static const int16_t kBGRAToUVCoefficients[] = {
    // 0, -UG, -UR, UB, 0, -VG, VR, -VB
    0, -37, -19, 56, 0, -47, 56, -9,
};

static const int16_t kABGRToUVCoefficients[] = {
    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
    -19, 56, -37, 0, 56, -9, -47, 0,
};

static const int16_t kARGBToUVJCoefficients[] = {
    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
    63, -21, -42, 0, -10, 63, -53, 0,
};

static const int16_t kABGRToUVJCoefficients[] = {
    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
    -21, 63, -42, 0, 63, -10, -53, 0,
};

static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                                   int src_stride_argb,
                                   uint8_t* dst_u,
                                   uint8_t* dst_v,
                                   int width,
                                   const int16_t* uvconstants) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  uint64_t vl;
  asm volatile(
      "ptrue    p0.b                                \n"
      "ld1rd    {z24.d}, p0/z, [%[uvconstants]]     \n"
      "ld1rd    {z25.d}, p0/z, [%[uvconstants], #8] \n"
      "mov      z26.b, #0x80                        \n"

      "cntb     %[vl]                               \n"
      "subs     %w[width], %w[width], %w[vl]        \n"
      "b.lt     2f                                  \n"

      // Process 4x vectors from each input row per iteration.
      // Cannot use predication here due to unrolling.
      "1:                                           \n"  // e.g.
      "ld1b     {z0.b}, p0/z, [%[src0], #0, mul vl] \n"  // bgrabgra
      "ld1b     {z4.b}, p0/z, [%[src1], #0, mul vl] \n"  // bgrabgra
      "ld1b     {z1.b}, p0/z, [%[src0], #1, mul vl] \n"  // bgrabgra
      "ld1b     {z5.b}, p0/z, [%[src1], #1, mul vl] \n"  // bgrabgra
      "ld1b     {z2.b}, p0/z, [%[src0], #2, mul vl] \n"  // bgrabgra
      "ld1b     {z6.b}, p0/z, [%[src1], #2, mul vl] \n"  // bgrabgra
      "ld1b     {z3.b}, p0/z, [%[src0], #3, mul vl] \n"  // bgrabgra
      "ld1b     {z7.b}, p0/z, [%[src1], #3, mul vl] \n"  // bgrabgra
      "incb     %[src0], all, mul #4                \n"
      "incb     %[src1], all, mul #4                \n"

      "uaddlb   z16.h, z0.b, z4.b                   \n"  // brbrbrbr
      "uaddlt   z17.h, z0.b, z4.b                   \n"  // gagagaga
      "uaddlb   z18.h, z1.b, z5.b                   \n"  // brbrbrbr
      "uaddlt   z19.h, z1.b, z5.b                   \n"  // gagagaga
      "uaddlb   z20.h, z2.b, z6.b                   \n"  // brbrbrbr
      "uaddlt   z21.h, z2.b, z6.b                   \n"  // gagagaga
      "uaddlb   z22.h, z3.b, z7.b                   \n"  // brbrbrbr
      "uaddlt   z23.h, z3.b, z7.b                   \n"  // gagagaga

      "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
      "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra
      "trn1     z2.s, z18.s, z19.s                  \n"  // brgabgra
      "trn2     z3.s, z18.s, z19.s                  \n"  // brgabgra
      "trn1     z4.s, z20.s, z21.s                  \n"  // brgabgra
      "trn2     z5.s, z20.s, z21.s                  \n"  // brgabgra
      "trn1     z6.s, z22.s, z23.s                  \n"  // brgabgra
      "trn2     z7.s, z22.s, z23.s                  \n"  // brgabgra

      "subs     %w[width], %w[width], %w[vl]        \n"  // 4*VL per loop

      "urhadd   z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
      "urhadd   z2.h, p0/m, z2.h, z3.h              \n"  // brgabrga
      "urhadd   z4.h, p0/m, z4.h, z5.h              \n"  // brgabrga
      "urhadd   z6.h, p0/m, z6.h, z7.h              \n"  // brgabrga

      "movi     v16.8h, #0                          \n"
      "movi     v17.8h, #0                          \n"
      "movi     v18.8h, #0                          \n"
      "movi     v19.8h, #0                          \n"

      "movi     v20.8h, #0                          \n"
      "movi     v21.8h, #0                          \n"
      "movi     v22.8h, #0                          \n"
      "movi     v23.8h, #0                          \n"

      "sdot     z16.d, z0.h, z24.h                  \n"  // UUxxxxxx
      "sdot     z17.d, z2.h, z24.h                  \n"  // UUxxxxxx
      "sdot     z18.d, z4.h, z24.h                  \n"  // UUxxxxxx
      "sdot     z19.d, z6.h, z24.h                  \n"  // UUxxxxxx

      "sdot     z20.d, z0.h, z25.h                  \n"  // VVxxxxxx
      "sdot     z21.d, z2.h, z25.h                  \n"  // VVxxxxxx
      "sdot     z22.d, z4.h, z25.h                  \n"  // VVxxxxxx
      "sdot     z23.d, z6.h, z25.h                  \n"  // VVxxxxxx

      "uzp1     z16.s, z16.s, z17.s                 \n"  // UUxx
      "uzp1     z18.s, z18.s, z19.s                 \n"  // UUxx
      "uzp1     z20.s, z20.s, z21.s                 \n"  // VVxx
      "uzp1     z22.s, z22.s, z23.s                 \n"  // VVxx

      "uzp1     z16.h, z16.h, z18.h                 \n"  // UU
      "uzp1     z20.h, z20.h, z22.h                 \n"  // VV

      "addhnb   z16.b, z16.h, z26.h                 \n"  // U
      "addhnb   z20.b, z20.h, z26.h                 \n"  // V

      "st1b     {z16.h}, p0, [%[dst_u]]             \n"  // U
      "st1b     {z20.h}, p0, [%[dst_v]]             \n"  // V
      "inch     %[dst_u]                            \n"
      "inch     %[dst_v]                            \n"

      "b.ge     1b                                  \n"

      "2:                                           \n"
      "adds     %w[width], %w[width], %w[vl]        \n"  // VL per loop
      "b.le     99f                                 \n"

      // Process remaining pixels from each input row.
      // Use predication to do one vector from each input array, so may loop up
      // to three iterations.
      "cntw     %x[vl]                              \n"

      "3:                                           \n"
      "whilelt  p1.s, wzr, %w[width]                \n"
      "ld1d     {z0.d}, p1/z, [%[src0]]             \n"  // bgrabgra
      "ld1d     {z4.d}, p1/z, [%[src1]]             \n"  // bgrabgra
      "incb     %[src0]                             \n"
      "incb     %[src1]                             \n"

      "uaddlb   z16.h, z0.b, z4.b                   \n"  // brbrbrbr
      "uaddlt   z17.h, z0.b, z4.b                   \n"  // gagagaga

      "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
      "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra

      "urhadd   z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga

      "subs     %w[width], %w[width], %w[vl]        \n"  // VL per loop

      "movi     v16.8h, #0                          \n"
      "movi     v20.8h, #0                          \n"

      "sdot     z16.d, z0.h, z24.h                  \n"
      "sdot     z20.d, z0.h, z25.h                  \n"

      "addhnb   z16.b, z16.h, z26.h                 \n"  // U
      "addhnb   z20.b, z20.h, z26.h                 \n"  // V

      "st1b     {z16.d}, p0, [%[dst_u]]             \n"  // U
      "st1b     {z20.d}, p0, [%[dst_v]]             \n"  // V
      "incd     %[dst_u]                            \n"
      "incd     %[dst_v]                            \n"
      "b.gt     3b                                  \n"

      "99:                                          \n"
      : [src0] "+r"(src_argb),    // %[src0]
        [src1] "+r"(src_argb_1),  // %[src1]
        [dst_u] "+r"(dst_u),      // %[dst_u]
        [dst_v] "+r"(dst_v),      // %[dst_v]
        [width] "+r"(width),      // %[width]
        [vl] "=&r"(vl)            // %[vl]
      : [uvconstants] "r"(uvconstants)
      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
        "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
        "p0");
}

void ARGBToUVRow_SVE2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
                         kARGBToUVCoefficients);
}

void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
                         kARGBToUVJCoefficients);
}

void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_uj,
                       uint8_t* dst_vj,
                       int width) {
  ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
                         kABGRToUVJCoefficients);
}

void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width,
                         kBGRAToUVCoefficients);
}

void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                         kABGRToUVCoefficients);
}

void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width,
                         kRGBAToUVCoefficients);
}

#define ARGBTORGB565_SVE                    \
  /* Inputs:                                \
   * z0: rrrrrxxxbbbbbxxx                   \
   * z1: xxxxxxxxggggggxx                   \
   * z3: 0000000000000011 (3, 0, 3, 0, ...) \
   * z4: 0000011111100000                   \
   */
  "lsr     z0.b, p0/m, z0.b, z3.b       \n" \
  "lsl     z1.h, z1.h, #3               \n" \
  "bsl     z1.d, z1.d, z0.d, z4.d       \n"

void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
                          uint8_t* dst_rgb,
                          int width) {
  unsigned bsl_mask = 0x7e0;
  uint64_t vl;
  width *= 2;
  asm volatile(
      "mov     z3.h, #3                     \n"
      "dup     z4.h, %w[bsl_mask]           \n"

      "cntb    %[vl]                        \n"
      "subs    %w[width], %w[width], %w[vl] \n"
      "b.lt    2f                           \n"

      "ptrue   p0.b                         \n"
      "1:                                   \n"
      "ld2b    {z0.b, z1.b}, p0/z, [%[src]] \n"  // BR, GA
      "incb    %[src], all, mul #2          \n"
      "subs    %w[width], %w[width], %w[vl] \n"  //
      ARGBTORGB565_SVE
      "st1b    {z1.b}, p0, [%[dst]]         \n"
      "incb    %[dst]                       \n"
      "b.ge    1b                           \n"

      "2:                                   \n"
      "adds    %w[width], %w[width], %w[vl] \n"
      "b.eq    99f                          \n"

      "whilelt p0.b, wzr, %w[width]         \n"
      "ld2b    {z0.b, z1.b}, p0/z, [%[src]] \n"  // BR, GA
      ARGBTORGB565_SVE
      "st1b    {z1.b}, p0, [%[dst]]         \n"

      "99:                                  \n"
      : [src] "+r"(src_argb),     // %[src]
        [dst] "+r"(dst_rgb),      // %[dst]
        [width] "+r"(width),      // %[width]
        [vl] "=&r"(vl)            // %[vl]
      : [bsl_mask] "r"(bsl_mask)  // %[bsl_mask]
      : "cc", "memory", "z0", "z1", "z3", "z4", "p0");
}

void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                uint32_t dither4,
                                int width) {
  unsigned bsl_mask = 0x7e0;
  uint64_t vl;
  width *= 2;
  asm volatile(
      "mov     z3.h, #3                     \n"
      "dup     z4.h, %w[bsl_mask]           \n"
      "dup     z2.s, %w[dither4]            \n"
      "zip1    z2.b, z2.b, z2.b             \n"

      "cntb    %[vl]                        \n"
      "subs    %w[width], %w[width], %w[vl] \n"
      "b.lt    2f                           \n"

      "ptrue   p0.b                         \n"
      "1:                                   \n"
      "ld2b    {z0.b, z1.b}, p0/z, [%[src]] \n"  // BR, GA
      "incb    %[src], all, mul #2          \n"
      "uqadd   z0.b, z0.b, z2.b             \n"
      "uqadd   z1.b, z1.b, z2.b             \n"
      "subs    %w[width], %w[width], %w[vl] \n"  //
      ARGBTORGB565_SVE
      "st1b    {z1.b}, p0, [%[dst]]         \n"
      "incb    %[dst]                       \n"
      "b.ge    1b                           \n"

      "2:                                   \n"
      "adds    %w[width], %w[width], %w[vl] \n"
      "b.eq    99f                          \n"

      "whilelt p0.b, wzr, %w[width]         \n"
      "ld2b    {z0.b, z1.b}, p0/z, [%[src]] \n"  // BR, GA
      "uqadd   z0.b, z0.b, z2.b             \n"
      "uqadd   z1.b, z1.b, z2.b             \n"  //
      ARGBTORGB565_SVE
      "st1b    {z1.b}, p0, [%[dst]]         \n"

      "99:                                  \n"
      : [src] "+r"(src_argb),      // %[src]
        [dst] "+r"(dst_rgb),       // %[dst]
        [width] "+r"(width),       // %[width]
        [vl] "=&r"(vl)             // %[vl]
      : [bsl_mask] "r"(bsl_mask),  // %[bsl_mask]
        [dither4] "r"(dither4)     // %[dither4]
      : "cc", "memory", "z0", "z1", "z3", "z4", "p0");
}

#define ARGB1555TOARGB                                        \
  /* Input: z1/z3.h = arrrrrgggggbbbbb */                     \
  "lsl     z0.h, z1.h, #3          \n" /* rrrgggggbbbbb000 */ \
  "lsl     z2.h, z3.h, #3          \n" /* rrrgggggbbbbb000 */ \
  "asr     z1.h, z1.h, #7          \n" /* aaaaaaaarrrrrggg */ \
  "asr     z3.h, z3.h, #7          \n" /* aaaaaaaarrrrrggg */ \
  "lsl     z0.b, p0/m, z0.b, z4.b  \n" /* ggggg000bbbbb000 */ \
  "lsl     z2.b, p0/m, z2.b, z4.b  \n" /* ggggg000bbbbb000 */ \
  "sri     z1.b, z1.b, #5          \n" /* aaaaaaaarrrrrrrr */ \
  "sri     z3.b, z3.b, #5          \n" /* aaaaaaaarrrrrrrr */ \
  "sri     z0.b, z0.b, #5          \n" /* ggggggggbbbbbbbb */ \
  "sri     z2.b, z2.b, #5          \n" /* ggggggggbbbbbbbb */

void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  uint64_t vl;
  asm volatile(
      "mov     z4.h, #0x0300                           \n"
      "ptrue   p0.b                                    \n"

      "cnth    %x[vl]                                  \n"
      "subs    %w[width], %w[width], %w[vl], lsl #1    \n"
      "b.lt    2f                                      \n"

      "1:                                              \n"
      "ld1h    {z1.h}, p0/z, [%[src]]                  \n"
      "ld1h    {z3.h}, p0/z, [%[src], #1, mul vl]      \n"
      "incb    %[src], all, mul #2                     \n"  //
      ARGB1555TOARGB
      "subs    %w[width], %w[width], %w[vl], lsl #1    \n"
      "st2h    {z0.h, z1.h}, p0, [%[dst]]              \n"
      "st2h    {z2.h, z3.h}, p0, [%[dst], #2, mul vl]  \n"
      "incb    %[dst], all, mul #4                     \n"
      "b.ge    1b                                      \n"

      "2:                                              \n"
      "adds    %w[width], %w[width], %w[vl], lsl #1    \n"
      "b.eq    99f                                     \n"

      "whilelt p1.h, wzr, %w[width]                    \n"
      "whilelt p2.h, %w[vl], %w[width]                 \n"
      "ld1h    {z1.h}, p1/z, [%[src]]                  \n"
      "ld1h    {z3.h}, p2/z, [%[src], #1, mul vl]      \n"  //
      ARGB1555TOARGB
      "st2h    {z0.h, z1.h}, p1, [%[dst]]              \n"
      "st2h    {z2.h, z3.h}, p2, [%[dst], #2, mul vl]  \n"

      "99:                                             \n"
      : [src] "+r"(src_argb1555),  // %[src]
        [dst] "+r"(dst_argb),      // %[dst]
        [width] "+r"(width),       // %[width]
        [vl] "=&r"(vl)             // %[vl]
      :
      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2");
}

// clang-format off
#define AYUVTOUV_SVE(zU0, zV0, zU1, zV1)                   /* e.g. */          \
  "ld2h     {z0.h, z1.h}, p0/z, [%[src0]]              \n" /* VUVU.. YAYA.. */ \
  "ld2h     {z1.h, z2.h}, p1/z, [%[src0], #2, mul vl]  \n" /* VUVU.. YAYA.. */ \
  "ld2h     {z2.h, z3.h}, p0/z, [%[src1]]              \n" /* VUVU.. YAYA.. */ \
  "ld2h     {z3.h, z4.h}, p1/z, [%[src1], #2, mul vl]  \n" /* VUVU.. YAYA.. */ \
  "incb     %[src0], all, mul #4                       \n"                     \
  "incb     %[src1], all, mul #4                       \n"                     \
  "uaddlb   z4.h, z0.b, z2.b                           \n" /* V */             \
  "uaddlt   z5.h, z0.b, z2.b                           \n" /* U */             \
  "uaddlb   z6.h, z1.b, z3.b                           \n" /* V */             \
  "uaddlt   z7.h, z1.b, z3.b                           \n" /* U */             \
  "addp   " #zU0 ".h, p0/m, " #zU0 ".h, " #zV0 ".h     \n" /* UV */            \
  "addp   " #zU1 ".h, p1/m, " #zU1 ".h, " #zV1 ".h     \n" /* UV */            \
  "subs     %w[width], %w[width], %w[vl]               \n"                     \
  "urshr  " #zU0 ".h, p0/m, " #zU0 ".h, #2             \n" /* U0V0 */          \
  "urshr  " #zU1 ".h, p1/m, " #zU1 ".h, #2             \n" /* U0V0 */          \
  "st1b     {" #zU0 ".h}, p0, [%[dst]]                 \n"                     \
  "st1b     {" #zU1 ".h}, p1, [%[dst], #1, mul vl]     \n"                     \
  "incb     %[dst]                                     \n"
// clang-format on

// Filter 2 rows of AYUV UV's (444) into UV (420).
// AYUV is VUYA in memory.  UV for NV12 is UV order in memory.
void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_uv,
                      int width) {
  // Output a row of UV values, filtering 2x2 rows of AYUV.
  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
  int vl;
  asm volatile (
      "cntb    %x[vl]                            \n"
      "subs    %w[width], %w[width], %w[vl]      \n"
      "b.lt    2f                                \n"

      "ptrue   p0.h                              \n"
      "ptrue   p1.h                              \n"
      "1:                                        \n"
      AYUVTOUV_SVE(z5, z4, z7, z6)
      "b.ge    1b                                \n"

      "2:                                        \n"
      "adds    %w[width], %w[width], %w[vl]      \n"
      "b.eq    99f                               \n"

      "cnth    %x[vl]                            \n"
      "whilelt p0.h, wzr, %w[width]              \n" // first row
      "whilelt p1.h, %w[vl], %w[width]           \n" // second row
      AYUVTOUV_SVE(z5, z4, z7, z6)

      "99:                                       \n"
      : [src0]"+r"(src_ayuv),   // %[src0]
        [src1]"+r"(src_ayuv1),  // %[src1]
        [dst]"+r"(dst_uv),      // %[dst]
        [width]"+r"(width),     // %[width]
        [vl]"=&r"(vl)           // %[vl]
      :
      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
        "p1");
}

// Filter 2 rows of AYUV UV's (444) into VU (420).
void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_vu,
                      int width) {
  // Output a row of VU values, filtering 2x2 rows of AYUV.
  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
  int vl;
  asm volatile (
      "cntb    %x[vl]                            \n"
      "cmp     %w[width], %w[vl]                 \n"
      "subs    %w[width], %w[width], %w[vl]      \n"
      "b.lt    2f                                \n"

      "ptrue   p0.h                              \n"
      "ptrue   p1.h                              \n"
      "1:                                        \n"
      AYUVTOUV_SVE(z4, z5, z6, z7)
      "b.ge    1b                                \n"

      "2:                                        \n"
      "adds    %w[width], %w[width], %w[vl]      \n"
      "b.eq    99f                               \n"

      "cnth    %x[vl]                            \n"
      "whilelt p0.h, wzr, %w[width]              \n" // first row
      "whilelt p1.h, %w[vl], %w[width]           \n" // second row
      AYUVTOUV_SVE(z4, z5, z6, z7)

      "99:                                       \n"
      : [src0]"+r"(src_ayuv),   // %[src0]
        [src1]"+r"(src_ayuv1),  // %[src1]
        [dst]"+r"(dst_vu),      // %[dst]
        [width]"+r"(width),     // %[width]
        [vl]"=&r"(vl)           // %[vl]
      :
      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
        "p1");
}

void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  YUY2ToARGBRow_SVE_SC(src_yuy2, dst_argb, yuvconstants, width);
}

void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  UYVYToARGBRow_SVE_SC(src_uyvy, dst_argb, yuvconstants, width);
}

static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
                                     uint8_t* dst_wxyz,
                                     int width,
                                     uint32_t idx_start,
                                     uint32_t idx_step,
                                     uint32_t alpha) {
  uint32_t vl;
  asm("cntw %x0" : "=r"(vl));
  uint32_t vl_mul3 = vl * 3;
  uint32_t rem_mul3;
  asm volatile(
      "index   z31.s, %w[idx_start], %w[idx_step]        \n"
      "dup     z30.s, %w[alpha]                          \n"
      "subs    %w[width], %w[width], %w[vl], lsl #1      \n"
      "b.lt    2f                                        \n"

      // Run bulk of computation with the same predicates to avoid predicate
      // generation overhead. We set up p1 to only load 3/4 of a vector.
      "ptrue   p0.s                                      \n"
      "whilelt p1.b, wzr, %w[vl_mul3]                    \n"
      "1:                                                \n"
      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
      "add     %[src], %[src], %x[vl_mul3]               \n"
      "ld1b    {z1.b}, p1/z, [%[src]]                    \n"
      "add     %[src], %[src], %x[vl_mul3]               \n"
      "tbl     z0.b, {z0.b}, z31.b                       \n"
      "tbl     z1.b, {z1.b}, z31.b                       \n"
      "subs    %w[width], %w[width], %w[vl], lsl #1      \n"
      "orr     z0.d, z0.d, z30.d                         \n"
      "orr     z1.d, z1.d, z30.d                         \n"
      "st1w    {z0.s}, p0, [%[dst]]                      \n"
      "st1w    {z1.s}, p0, [%[dst], #1, mul vl]          \n"
      "incb    %[dst], all, mul #2                       \n"
      "b.ge    1b                                        \n"

      "2:                                                \n"
      "adds     %w[width], %w[width], %w[vl], lsl #1     \n"
      "b.eq     99f                                      \n"

      // Calculate a pair of predicates for the final iteration to deal with
      // the tail.
      "3:                                                \n"
      "add     %w[rem_mul3], %w[width], %w[width], lsl #1 \n"
      "whilelt p0.s, wzr, %w[width]                      \n"
      "whilelt p1.b, wzr, %w[rem_mul3]                    \n"
      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
      "add     %[src], %[src], %x[vl_mul3]               \n"
      "tbl     z0.b, {z0.b}, z31.b                       \n"
      "subs    %w[width], %w[width], %w[vl]              \n"
      "orr     z0.d, z0.d, z30.d                         \n"
      "st1w    {z0.s}, p0, [%[dst]]                      \n"
      "incb    %[dst]                                    \n"
      "b.gt    3b                                        \n"

      "99:                                               \n"
      : [src] "+r"(src_raw),         // %[src]
        [dst] "+r"(dst_wxyz),        // %[dst]
        [width] "+r"(width),         // %[width]
        [vl_mul3] "+r"(vl_mul3),     // %[vl_mul3]
        [rem_mul3] "=&r"(rem_mul3)   // %[rem_mul3]
      : [idx_start] "r"(idx_start),  // %[idx_start]
        [idx_step] "r"(idx_step),    // %[idx_step]
        [alpha] "r"(alpha),          // %[alpha]
        [vl] "r"(vl)                 // %[vl]
      : "cc", "memory", "z0", "z1", "z30", "z31", "p0", "p1");
}

void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  RAWToWXYZRow_SVE2(src_raw, dst_argb, width, 0xff000102U, 0x00030303U,
                    0xff000000U);
}

void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  RAWToWXYZRow_SVE2(src_raw, dst_rgba, width, 0x000102ffU, 0x03030300U,
                    0x000000ffU);
}

void RGB24ToARGBRow_SVE2(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  RAWToWXYZRow_SVE2(src_rgb24, dst_argb, width, 0xff020100U, 0x00030303U,
                    0xff000000U);
}

static const uint8_t kRAWToRGB24Indices[] = {
    2,   1,   0,   5,   4,   3,   8,   7,   6,   11,  10,  9,   14,  13,  12,
    17,  16,  15,  20,  19,  18,  23,  22,  21,  26,  25,  24,  29,  28,  27,
    32,  31,  30,  35,  34,  33,  38,  37,  36,  41,  40,  39,  44,  43,  42,
    47,  46,  45,  50,  49,  48,  53,  52,  51,  56,  55,  54,  59,  58,  57,
    62,  61,  60,  65,  64,  63,  68,  67,  66,  71,  70,  69,  74,  73,  72,
    77,  76,  75,  80,  79,  78,  83,  82,  81,  86,  85,  84,  89,  88,  87,
    92,  91,  90,  95,  94,  93,  98,  97,  96,  101, 100, 99,  104, 103, 102,
    107, 106, 105, 110, 109, 108, 113, 112, 111, 116, 115, 114, 119, 118, 117,
    122, 121, 120, 125, 124, 123, 128, 127, 126, 131, 130, 129, 134, 133, 132,
    137, 136, 135, 140, 139, 138, 143, 142, 141, 146, 145, 144, 149, 148, 147,
    152, 151, 150, 155, 154, 153, 158, 157, 156, 161, 160, 159, 164, 163, 162,
    167, 166, 165, 170, 169, 168, 173, 172, 171, 176, 175, 174, 179, 178, 177,
    182, 181, 180, 185, 184, 183, 188, 187, 186, 191, 190, 189, 194, 193, 192,
    197, 196, 195, 200, 199, 198, 203, 202, 201, 206, 205, 204, 209, 208, 207,
    212, 211, 210, 215, 214, 213, 218, 217, 216, 221, 220, 219, 224, 223, 222,
    227, 226, 225, 230, 229, 228, 233, 232, 231, 236, 235, 234, 239, 238, 237,
    242, 241, 240, 245, 244, 243, 248, 247, 246, 251, 250, 249, 254, 253, 252};

void RAWToRGB24Row_SVE2(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  // width is in elements, convert to bytes.
  width *= 3;
  // we use the mul3 predicate pattern throughout to use the largest multiple
  // of three number of lanes, for instance with a vector length of 16 bytes
  // only the first 15 bytes will be used for load/store instructions.
  uint32_t vl;
  asm volatile(
      "cntb    %x[vl], mul3                              \n"
      "ptrue   p0.b, mul3                                \n"
      "ld1b    {z31.b}, p0/z, [%[kIndices]]              \n"
      "subs    %w[width], %w[width], %w[vl]              \n"
      "b.lt    2f                                        \n"

      // Run bulk of computation with the same predicate to avoid predicate
      // generation overhead.
      "1:                                                \n"
      "ld1b    {z0.b}, p0/z, [%[src]]                    \n"
      "add     %[src], %[src], %x[vl]                    \n"
      "tbl     z0.b, {z0.b}, z31.b                       \n"
      "subs    %w[width], %w[width], %w[vl]              \n"
      "st1b    {z0.b}, p0, [%[dst]]                      \n"
      "add     %[dst], %[dst], %x[vl]                    \n"
      "b.ge    1b                                        \n"

      "2:                                                \n"
      "adds    %w[width], %w[width], %w[vl]              \n"
      "b.eq    99f                                       \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p0.b, wzr, %w[width]                      \n"
      "ld1b    {z0.b}, p0/z, [%[src]]                    \n"
      "tbl     z0.b, {z0.b}, z31.b                       \n"
      "st1b    {z0.b}, p0, [%[dst]]                      \n"

      "99:                                               \n"
      : [src] "+r"(src_raw),                // %[src]
        [dst] "+r"(dst_rgb24),              // %[dst]
        [width] "+r"(width),                // %[width]
        [vl] "=&r"(vl)                      // %[vl]
      : [kIndices] "r"(kRAWToRGB24Indices)  // %[kIndices]
      : "cc", "memory", "z0", "z31", "p0");
}

static inline void ARGBToXYZRow_SVE2(const uint8_t* src_argb,
                                     uint8_t* dst_xyz,
                                     int width,
                                     const uint8_t* indices) {
  uint32_t vl;
  asm("cntw %x0" : "=r"(vl));
  uint32_t vl_mul3 = vl * 3;
  uint32_t rem_mul3;
  asm volatile(
      "whilelt p1.b, wzr, %w[vl_mul3]                     \n"
      "ld1b    {z31.b}, p1/z, [%[indices]]                \n"
      "subs    %w[width], %w[width], %w[vl], lsl #1       \n"
      "b.lt    2f                                         \n"

      // Run bulk of computation with the same predicates to avoid predicate
      // generation overhead. We set up p1 to only store 3/4 of a vector.
      "ptrue   p0.s                                       \n"
      "1:                                                 \n"
      "ld1w    {z0.s}, p0/z, [%[src]]                     \n"
      "ld1w    {z1.s}, p0/z, [%[src], #1, mul vl]         \n"
      "incb    %[src], all, mul #2                        \n"
      "tbl     z0.b, {z0.b}, z31.b                        \n"
      "tbl     z1.b, {z1.b}, z31.b                        \n"
      "subs    %w[width], %w[width], %w[vl], lsl #1       \n"
      "st1b    {z0.b}, p1, [%[dst]]                       \n"
      "add     %[dst], %[dst], %x[vl_mul3]                \n"
      "st1b    {z1.b}, p1, [%[dst]]                       \n"
      "add     %[dst], %[dst], %x[vl_mul3]                \n"
      "b.ge    1b                                         \n"

      "2:                                                 \n"
      "adds    %w[width], %w[width], %w[vl], lsl #1       \n"
      "b.eq    99f                                        \n"

      // Calculate predicates for the final iteration to deal with the tail.
      "add     %w[rem_mul3], %w[width], %w[width], lsl #1 \n"
      "whilelt p0.s, wzr, %w[width]                       \n"
      "whilelt p1.b, wzr, %w[rem_mul3]                    \n"
      "whilelt p2.s, %w[vl], %w[width]                    \n"
      "whilelt p3.b, %w[vl_mul3], %w[rem_mul3]            \n"
      "ld1w    {z0.s}, p0/z, [%[src]]                     \n"
      "ld1w    {z1.s}, p2/z, [%[src], #1, mul vl]         \n"
      "tbl     z0.b, {z0.b}, z31.b                        \n"
      "tbl     z1.b, {z1.b}, z31.b                        \n"
      "st1b    {z0.b}, p1, [%[dst]]                       \n"
      "add     %[dst], %[dst], %x[vl_mul3]                \n"
      "st1b    {z1.b}, p3, [%[dst]]                       \n"

      "99:                                                \n"
      : [src] "+r"(src_argb),       // %[src]
        [dst] "+r"(dst_xyz),        // %[dst]
        [width] "+r"(width),        // %[width]
        [rem_mul3] "=&r"(rem_mul3)  // %[rem_mul3]
      : [indices] "r"(indices),     // %[indices]
        [vl_mul3] "r"(vl_mul3),     // %[vl_mul3]
        [vl] "r"(vl)                // %[vl]
      : "cc", "memory", "z0", "z1", "z31", "p0", "p1", "p2", "p3");
}

static const uint8_t kARGBToRGB24RowIndices[] = {
    0,   1,   2,   4,   5,   6,   8,   9,   10,  12,  13,  14,  16,  17,  18,
    20,  21,  22,  24,  25,  26,  28,  29,  30,  32,  33,  34,  36,  37,  38,
    40,  41,  42,  44,  45,  46,  48,  49,  50,  52,  53,  54,  56,  57,  58,
    60,  61,  62,  64,  65,  66,  68,  69,  70,  72,  73,  74,  76,  77,  78,
    80,  81,  82,  84,  85,  86,  88,  89,  90,  92,  93,  94,  96,  97,  98,
    100, 101, 102, 104, 105, 106, 108, 109, 110, 112, 113, 114, 116, 117, 118,
    120, 121, 122, 124, 125, 126, 128, 129, 130, 132, 133, 134, 136, 137, 138,
    140, 141, 142, 144, 145, 146, 148, 149, 150, 152, 153, 154, 156, 157, 158,
    160, 161, 162, 164, 165, 166, 168, 169, 170, 172, 173, 174, 176, 177, 178,
    180, 181, 182, 184, 185, 186, 188, 189, 190, 192, 193, 194, 196, 197, 198,
    200, 201, 202, 204, 205, 206, 208, 209, 210, 212, 213, 214, 216, 217, 218,
    220, 221, 222, 224, 225, 226, 228, 229, 230, 232, 233, 234, 236, 237, 238,
    240, 241, 242, 244, 245, 246, 248, 249, 250, 252, 253, 254,
};

static const uint8_t kARGBToRAWRowIndices[] = {
    2,   1,   0,   6,   5,   4,   10,  9,   8,   14,  13,  12,  18,  17,  16,
    22,  21,  20,  26,  25,  24,  30,  29,  28,  34,  33,  32,  38,  37,  36,
    42,  41,  40,  46,  45,  44,  50,  49,  48,  54,  53,  52,  58,  57,  56,
    62,  61,  60,  66,  65,  64,  70,  69,  68,  74,  73,  72,  78,  77,  76,
    82,  81,  80,  86,  85,  84,  90,  89,  88,  94,  93,  92,  98,  97,  96,
    102, 101, 100, 106, 105, 104, 110, 109, 108, 114, 113, 112, 118, 117, 116,
    122, 121, 120, 126, 125, 124, 130, 129, 128, 134, 133, 132, 138, 137, 136,
    142, 141, 140, 146, 145, 144, 150, 149, 148, 154, 153, 152, 158, 157, 156,
    162, 161, 160, 166, 165, 164, 170, 169, 168, 174, 173, 172, 178, 177, 176,
    182, 181, 180, 186, 185, 184, 190, 189, 188, 194, 193, 192, 198, 197, 196,
    202, 201, 200, 206, 205, 204, 210, 209, 208, 214, 213, 212, 218, 217, 216,
    222, 221, 220, 226, 225, 224, 230, 229, 228, 234, 233, 232, 238, 237, 236,
    242, 241, 240, 246, 245, 244, 250, 249, 248, 254, 253, 252,
};

void ARGBToRGB24Row_SVE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRGB24RowIndices);
}

void ARGBToRAWRow_SVE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRAWRowIndices);
}

void DivideRow_16_SVE2(const uint16_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  uint64_t vl;
  asm volatile(
      "cnth   %x[vl]                                     \n"
      "dup    z0.h, %w[scale]                            \n"
      "subs   %w[width], %w[width], %w[vl], lsl #1       \n"
      "b.le    2f                                        \n"

      // Run bulk of computation with the same predicates to avoid predicate
      // generation overhead.
      "ptrue   p0.h                                      \n"
      "1:                                                \n"
      "ld1h   {z1.h}, p0/z, [%[src]]                     \n"
      "ld1h   {z2.h}, p0/z, [%[src], #1, mul vl]         \n"
      "incb   %[src], all, mul #2                        \n"
      "umulh  z1.h, z1.h, z0.h                           \n"
      "umulh  z2.h, z2.h, z0.h                           \n"
      "subs   %w[width], %w[width], %w[vl], lsl #1       \n"
      "st1h   {z1.h}, p0, [%[dst]]                       \n"
      "st1h   {z2.h}, p0, [%[dst], #1, mul vl]           \n"
      "incb   %[dst], all, mul #2                        \n"
      "b.gt    1b                                        \n"

      "2:                                                \n"
      "adds     %w[width], %w[width], %w[vl], lsl #1     \n"
      "b.eq     99f                                      \n"

      // Calculate a pair of predicates for the final iteration to deal with
      // the tail.
      "whilelt p0.h, wzr, %w[width]                      \n"
      "whilelt p1.h, %w[vl], %w[width]                   \n"
      "ld1h   {z1.h}, p0/z, [%[src]]                     \n"
      "ld1h   {z2.h}, p1/z, [%[src], #1, mul vl]         \n"
      "umulh  z1.h, z1.h, z0.h                           \n"
      "umulh  z2.h, z2.h, z0.h                           \n"
      "st1h   {z1.h}, p0, [%[dst]]                       \n"
      "st1h   {z2.h}, p1, [%[dst], #1, mul vl]           \n"

      "99:                                               \n"
      : [src] "+r"(src_y),    // %[src]
        [dst] "+r"(dst_y),    // %[dst]
        [width] "+r"(width),  // %[width]
        [vl] "=&r"(vl)        // %[vl]
      : [scale] "r"(scale)    // %[scale]
      : "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}

#define HALFFLOAT_SVE                                    \
  "scvtf       z0.s, p0/m, z0.s                      \n" \
  "scvtf       z1.s, p0/m, z1.s                      \n" \
  "scvtf       z2.s, p0/m, z2.s                      \n" \
  "scvtf       z3.s, p0/m, z3.s                      \n" \
  "fmul        z0.s, z0.s, z4.s                      \n" \
  "fmul        z1.s, z1.s, z4.s                      \n" \
  "fmul        z2.s, z2.s, z4.s                      \n" \
  "fmul        z3.s, z3.s, z4.s                      \n" \
  "uqshrnb     z0.h, z0.s, #13                       \n" \
  "uqshrnb     z1.h, z1.s, #13                       \n" \
  "uqshrnb     z2.h, z2.s, #13                       \n" \
  "uqshrnb     z3.h, z3.s, #13                       \n"

void HalfFloatRow_SVE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  uint64_t vl;
  asm("cntw %x0" : "=r"(vl));
  asm volatile(
      "mov         z4.s, %s[scale]                       \n"
      "subs        %w[width], %w[width], %w[vl], lsl #2  \n"
      "b.lt        2f                                    \n"

      // Run bulk of computation with all-true predicates to avoid predicate
      // generation overhead.
      "ptrue       p0.s                                  \n"
      "1:                                                \n"
      "ld1h        {z0.s}, p0/z, [%[src]]                \n"
      "ld1h        {z1.s}, p0/z, [%[src], #1, mul vl]    \n"
      "ld1h        {z2.s}, p0/z, [%[src], #2, mul vl]    \n"
      "ld1h        {z3.s}, p0/z, [%[src], #3, mul vl]    \n"
      "incb        %[src], all, mul #2                   \n"  //
      HALFFLOAT_SVE
      "subs        %w[width], %w[width], %w[vl], lsl #2  \n"
      "st1h        {z0.s}, p0, [%[dst]]                  \n"
      "st1h        {z1.s}, p0, [%[dst], #1, mul vl]      \n"
      "st1h        {z2.s}, p0, [%[dst], #2, mul vl]      \n"
      "st1h        {z3.s}, p0, [%[dst], #3, mul vl]      \n"
      "incb        %[dst], all, mul #2                   \n"
      "b.ge        1b                                    \n"

      "2:                                                \n"
      "adds     %w[width], %w[width], %w[vl], lsl #2     \n"
      "b.eq     99f                                      \n"

      // Calculate predicates for the final iteration to deal with the tail.
      "whilelt     p0.s, wzr, %w[width]                  \n"
      "whilelt     p1.s, %w[vl], %w[width]               \n"
      "whilelt     p2.s, %w[vl2], %w[width]              \n"
      "whilelt     p3.s, %w[vl3], %w[width]              \n"
      "ld1h        {z0.s}, p0/z, [%[src]]                \n"
      "ld1h        {z1.s}, p1/z, [%[src], #1, mul vl]    \n"
      "ld1h        {z2.s}, p2/z, [%[src], #2, mul vl]    \n"
      "ld1h        {z3.s}, p3/z, [%[src], #3, mul vl]    \n"  //
      HALFFLOAT_SVE
      "st1h        {z0.s}, p0, [%[dst]]                  \n"
      "st1h        {z1.s}, p1, [%[dst], #1, mul vl]      \n"
      "st1h        {z2.s}, p2, [%[dst], #2, mul vl]      \n"
      "st1h        {z3.s}, p3, [%[dst], #3, mul vl]      \n"

      "99:                                               \n"
      : [src] "+r"(src),                        // %[src]
        [dst] "+r"(dst),                        // %[dst]
        [width] "+r"(width)                     // %[width]
      : [vl] "r"(vl),                           // %[vl]
        [vl2] "r"(vl * 2),                      // %[vl2]
        [vl3] "r"(vl * 3),                      // %[vl3]
        [scale] "w"(scale * 1.9259299444e-34f)  // %[scale]
      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2", "p3");
}

void HalfFloat1Row_SVE2(const uint16_t* src,
                        uint16_t* dst,
                        float scale,
                        int width) {
  uint64_t vl;
  asm volatile(
      "cnth        %x[vl]                                \n"
      "subs        %w[width], %w[width], %w[vl], lsl #1  \n"
      "b.lt        2f                                    \n"

      // Run bulk of computation with all-true predicates to avoid predicate
      // generation overhead.
      "ptrue       p0.h                                  \n"
      "1:                                                \n"
      "ld1h        {z0.h}, p0/z, [%[src]]                \n"
      "ld1h        {z1.h}, p0/z, [%[src], #1, mul vl]    \n"
      "incb        %[src], all, mul #2                   \n"
      "ucvtf       z0.h, p0/m, z0.h                      \n"
      "ucvtf       z1.h, p0/m, z1.h                      \n"
      "subs        %w[width], %w[width], %w[vl], lsl #1  \n"
      "st1h        {z0.h}, p0, [%[dst]]                  \n"
      "st1h        {z1.h}, p0, [%[dst], #1, mul vl]      \n"
      "incb        %[dst], all, mul #2                   \n"
      "b.ge        1b                                    \n"

      "2:                                                \n"
      "adds     %w[width], %w[width], %w[vl], lsl #1     \n"
      "b.eq     99f                                      \n"

      // Calculate predicates for the final iteration to deal with the tail.
      "whilelt     p0.h, wzr, %w[width]                  \n"
      "whilelt     p1.h, %w[vl], %w[width]               \n"
      "ld1h        {z0.h}, p0/z, [%[src]]                \n"
      "ld1h        {z1.h}, p1/z, [%[src], #1, mul vl]    \n"
      "ucvtf       z0.h, p0/m, z0.h                      \n"
      "ucvtf       z1.h, p0/m, z1.h                      \n"
      "st1h        {z0.h}, p0, [%[dst]]                  \n"
      "st1h        {z1.h}, p1, [%[dst], #1, mul vl]      \n"

      "99:                                               \n"
      : [src] "+r"(src),      // %[src]
        [dst] "+r"(dst),      // %[dst]
        [width] "+r"(width),  // %[width]
        [vl] "=&r"(vl)        // %[vl]
      :
      : "cc", "memory", "z0", "z1", "p0", "p1");
}

void I210ToARGBRow_SVE2(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I210ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}

void I210AlphaToARGBRow_SVE2(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  I210AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
                            width);
}

void I210ToAR30Row_SVE2(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I210ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}

void P210ToARGBRow_SVE2(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  P210ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}

void P210ToAR30Row_SVE2(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  P210ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}

void I410ToARGBRow_SVE2(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I410ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}

void I410AlphaToARGBRow_SVE2(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  I410AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
                            width);
}

void I410ToAR30Row_SVE2(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I410ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}

void P410ToARGBRow_SVE2(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  P410ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}

void P410ToAR30Row_SVE2(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  P410ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}

void I212ToAR30Row_SVE2(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I212ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}

void I212ToARGBRow_SVE2(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}

#endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.30 Sekunden (vorverarbeitet am 2026-04-26) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.