// Source code library · Statistics · Home page · products/Sources/formale Sprachen/C/Firefox/third_party/aom/third_party/libyuv/source/   (Browser from the Mozilla Foundation, version 136.0.1©)  File dated 10.2.2025, size 155 kB, image not shown
//
// Source: row_neon64.cc   Language: C

 
/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 bytes from [%0] into v0, 4 bytes from [%1] into v1.s[0] and 4 bytes
// from [%2] into v1.s[1]; each pointer is post-incremented by the amount read.
#define READYUV422                               \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v1.s}[0], [%1], #4 \n" \
  "ld1 {v1.s}[1], [%2], #4 \n"

// Read 8 Y, 8 U and 8 V from 444.
// Loads 8 bytes from [%0] into v0, 8 bytes from [%1] into v1.d[0] and 8 bytes
// from [%2] into v1.d[1] (all post-incremented by 8). The adjacent byte pairs
// of v1 are then summed (uaddlp) and narrowed with a rounding shift right by 1
// (rshrn), leaving 4 averaged U followed by 4 averaged V in the low 8 bytes
// of v1.
#define READYUV444                               \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v1.d}[0], [%1], #8 \n" \
  "ld1 {v1.d}[1], [%2], #8 \n" \
  "uaddlp v1.8h, v1.16b \n" \
  "rshrn v1.8b, v1.8h, #1 \n"

// Read 8 Y, and set 4 U and 4 V to 128.
// Loads 8 bytes from [%0] into v0 (post-incremented by 8) and fills all 8
// bytes of v1 with the constant 128; no chroma pointer is read.
#define READYUV400                               \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "movi v1.8b , #128 \n"

// Read 8 Y and 4 UV from NV12.
// Loads 8 bytes from [%0] into v0 and 8 interleaved chroma bytes from [%1]
// into v2 (both post-incremented by 8). uzp1 gathers the even bytes into v1,
// uzp2 gathers the odd bytes into v3, and the low word of v3 is inserted into
// v1.s[1], so v1 ends up as 4 even-position bytes followed by 4 odd-position
// bytes. Uses v2 and v3 as scratch.
#define READNV12                                 \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v2.8b}, [%1], #8 \n" \
  "uzp1 v1.8b, v2.8b, v2.8b \n" \
  "uzp2 v3.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Read 8 Y and 4 VU from NV21.
// Same as READNV12 except that the roles of the even and odd chroma bytes are
// swapped: the odd bytes (uzp2) go to the low word of v1 and the even bytes
// (uzp1, via v3) go to v1.s[1]. Uses v2 and v3 as scratch.
#define READNV21                                 \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v2.8b}, [%1], #8 \n" \
  "uzp1 v3.8b, v2.8b, v2.8b \n" \
  "uzp2 v1.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Read 8 YUY2 pixels (16 bytes) from [%0], post-incremented by 16.
// ld2 de-interleaves the even bytes into v0 and the odd bytes into v1. The odd
// bytes are split again: their even positions stay in the low word of v1 and
// their odd positions (via v3) are inserted into v1.s[1]. Uses v3 as scratch.
#define READYUY2                                 \
  "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
  "uzp2 v3.8b, v1.8b, v1.8b \n" \
  "uzp1 v1.8b, v1.8b, v1.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Read 8 UYVY pixels (16 bytes) from [%0], post-incremented by 16.
// ld2 de-interleaves the even bytes into v2 and the odd bytes into v3. The odd
// bytes are copied to v0 (orr with itself); the even bytes are split so their
// even positions land in the low word of v1 and their odd positions (via v3)
// in v1.s[1]. Uses v2 and v3 as scratch.
#define READUYVY                                 \
  "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
  "orr v0.8b, v3.8b, v3.8b \n" \
  "uzp1 v1.8b, v2.8b, v2.8b \n" \
  "uzp2 v3.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Load the conversion constants used by YUVTORGB. Requires the named asm
// operands kUVBiasBGR, kYToRgb, kUVToRB and kUVToG.
//   v24, v25, v26: the first three 16-bit values at kUVBiasBGR, each
//                  replicated across all 8 lanes (ld3r).
//   v31:           the first 32-bit value at kYToRgb replicated to 4 lanes.
//   v27, v28:      de-interleaved even/odd 16-bit values from kUVToRB (ld2).
//   v29, v30:      de-interleaved even/odd 16-bit values from kUVToG (ld2).
// Any asm statement using this macro therefore modifies v24-v31.
#define YUVTORGB_SETUP                                      \
  "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
  "ld1r {v31.4s}, [%[kYToRgb]] \n" \
  "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
  "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"

// clang-format off

// Convert the 8 Y bytes in v0 and the 4 U + 4 V bytes in the low half of v1
// into 8 bytes each of R, G and B, written to the low halves of the registers
// named by vR, vG and vB.
//
// Steps:
//   - Y is widened to 32 bits, multiplied by v31, shifted right by 16 and
//     narrowed back to 16 bits with unsigned saturation.
//   - Each chroma byte is duplicated (shll #8 + uaddw gives byte * 257, i.e.
//     the byte repeated twice) so one chroma sample covers two pixels, then U
//     and V are widened to 16 bits in v1 and v2.
//   - B = v24 + Y + v27 * U, G = v25 + Y - (v29 * U + v30 * V),
//     R = v26 + Y + v28 * V, using saturating adds/subtracts.
//   - The results are shifted right by 6 and narrowed to bytes with unsigned
//     saturation.
//
// Expects the constants loaded by YUVTORGB_SETUP in v24-v31 and overwrites
// v0-v3 and v5-v7 as scratch.
#define YUVTORGB(vR, vG, vB)                                        \
  "uxtl v0.8h, v0.8b \n" /* Extract Y    */ \
  "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
  "ushll2 v3.4s, v0.8h, #0 \n" /* Y */            \
  "ushll v0.4s, v0.4h, #0 \n"                    \
  "mul v3.4s, v3.4s, v31.4s \n"                    \
  "mul v0.4s, v0.4s, v31.4s \n"                    \
  "sqshrun v0.4h, v0.4s, #16 \n"                    \
  "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */            \
  "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
  "mov v2.d[0], v1.d[1] \n" /* Extract V */    \
  "uxtl v2.8h, v2.8b \n"                    \
  "uxtl v1.8h, v1.8b \n" /* Extract U */    \
  "mul v3.8h, v27.8h, v1.8h \n"                    \
  "mul v5.8h, v29.8h, v1.8h \n"                    \
  "mul v6.8h, v30.8h, v2.8h \n"                    \
  "mul v7.8h, v28.8h, v2.8h \n"                    \
  "sqadd v6.8h, v6.8h, v5.8h \n"                    \
  "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */            \
  "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */            \
  "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */            \
  "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */            \
  "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */            \
  "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */            \
  "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */            \
  "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */            \
  "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */

// clang-format on

void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n" /* A */
      "1: \n"
    READYUV444
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n" /* A */

      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "ld1 {v23.8b}, [%3], #8 \n"
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "prfm pldl1keep, [%3, 448] \n"
      "subs %w5, %w5, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(src_a),     // %3
      "+r"(dst_argb),  // %4
      "+r"(width)      // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v20.8b, #255 \n" /* A */
      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v23, v22, v21)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgb24), // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

// Pack 8 pixels into 16-bit values in v0.8h.
// Input: v22.8b = R, v21.8b = G, v20.8b = B. Each channel is moved to the top
// byte of a 16-bit lane (shll #8); the shift-right-and-insert steps then leave
// the top 5 bits of R in bits 15..11, the top 6 bits of G in bits 10..5 and
// the top 5 bits of B in bits 4..0. Overwrites v20 and v21.
#define ARGBTORGB565                                                        \
  "shll v0.8h, v22.8b, #8 \n" /* R                    */ \
  "shll v21.8h, v21.8b, #8 \n" /* G                    */ \
  "shll v20.8h, v20.8b, #8 \n" /* B                    */ \
  "sri v0.8h, v21.8h, #5 \n" /* RG                   */ \
  "sri v0.8h, v20.8h, #11 \n" /* RGB                  */

// clang-format off

void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
    YUVTORGB_SETUP
      "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
    ARGBTORGB565
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_u),       // %1
        "+r"(src_v),       // %2
        "+r"(dst_rgb565),  // %3
        "+r"(width)        // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
        "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30");
}

// Pack 8 pixels into 16-bit values in v0.8h.
// Input: v23.8b = A, v22.8b = R, v21.8b = G, v20.8b = B. Each channel is moved
// to the top byte of a 16-bit lane (shll #8); the shift-right-and-insert steps
// then leave the top bit of A in bit 15 and the top 5 bits of R, G and B in
// bits 14..10, 9..5 and 4..0. Overwrites v20, v21 and v22.
#define ARGBTOARGB1555                                                      \
  "shll v0.8h, v23.8b, #8 \n" /* A                    */ \
  "shll v22.8h, v22.8b, #8 \n" /* R                    */ \
  "shll v21.8h, v21.8b, #8 \n" /* G                    */ \
  "shll v20.8h, v20.8b, #8 \n" /* B                    */ \
  "sri v0.8h, v22.8h, #1 \n" /* AR                   */ \
  "sri v0.8h, v21.8h, #6 \n" /* ARG                  */ \
  "sri v0.8h, v20.8h, #11 \n" /* ARGB                 */

void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
    ARGBTOARGB1555
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb1555),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
        "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30");
}
// clang-format on

// Pack 8 pixels into 16-bit values in the low 16 bytes of v0.
// B and R are shifted down to the low nibble, G and A keep only their high
// nibble (bic with the 0x0f mask in v4); the two resulting byte vectors
// (G|B and A|R) are interleaved with zip1 so each pixel is stored as the
// byte G|B followed by the byte A|R. Overwrites v1 and v20-v23.
#define ARGBTOARGB4444                                                       \
  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
  "ushr v20.8b, v20.8b, #4 \n" /* B                    */  \
  "bic v21.8b, v21.8b, v4.8b \n" /* G                    */  \
  "ushr v22.8b, v22.8b, #4 \n" /* R                    */  \
  "bic v23.8b, v23.8b, v4.8b \n" /* A                    */  \
  "orr v0.8b, v20.8b, v21.8b \n" /* BG                   */  \
  "orr v1.8b, v22.8b, v23.8b \n" /* RA                   */  \
  "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA                 */

void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v4.16b, #0x0f \n"  // bits to clear with vbic.
      "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
      "movi v23.8b, #255 \n"
    ARGBTOARGB4444
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB4444.
      "b.gt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)     // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READYUV400
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

// Expands one row of 8-bit gray values to 32-bit pixels: each source byte is
// copied unchanged into the first three bytes of the pixel and the fourth
// byte (alpha) is set to 255. No colour conversion is applied.
//
// src_y: source bytes; advances 8 bytes per iteration.
// dst_argb: destination; 32 bytes are written per iteration.
// width: pixel count. The loop subtracts 8 and repeats while the remainder is
//     positive, so at least one iteration always runs and a width that is not
//     a multiple of 8 is effectively rounded up.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v23.8b, #255 \n"  // constant alpha
      "1: \n"
      "ld1 {v20.8b}, [%0], #8 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "orr v21.8b, v20.8b, v20.8b \n"  // copy gray to second channel
      "orr v22.8b, v20.8b, v20.8b \n"  // copy gray to third channel
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      // Each clobber must be its own string literal.
      : "cc", "memory", "v20", "v21", "v22", "v23");
}

void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READNV12
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READNV21
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READNV12
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb24),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READNV21
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_rgb24),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP "1: \n" READNV12
                     "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
                         v22, v21, v20) ARGBTORGB565
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st1 {v0.8h}, [%2], 16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_uv),      // %1
        "+r"(dst_rgb565),  // %2
        "+r"(width)        // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
        "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30");
}

void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READYUY2
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READUYVY
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
      "b.gt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pairs of UV
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "st1 {v0.16b}, [%1], #16 \n"  // store U
      "st1 {v1.16b}, [%2], #16 \n"  // store V
      "b.gt 1b \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load U
      "ld1 {v1.16b}, [%1], #16 \n"  // load V
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // store 16 pairs of UV
      "b.gt 1b \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1: \n"
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "st1 {v0.16b}, [%1], #16 \n"  // store R
      "st1 {v1.16b}, [%2], #16 \n"  // store G
      "st1 {v2.16b}, [%3], #16 \n"  // store B
      "b.gt 1b \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc""memory""v0""v1""v2"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load R
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v2.16b}, [%2], #16 \n"  // load B
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
      "prfm pldl1keep, [%0, 448] \n"
      "b.gt 1b \n"
      : "+r"(src_r),                      // %0
        "+r"(src_g),                      // %1
        "+r"(src_b),                      // %2
        "+r"(dst_rgb),                    // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc""memory""v0""v1""v2"  // Clobber List
  );
}

// Copies bytes from src to dst in 32-byte steps.
//
// src, dst: source and destination; each advances 32 bytes per iteration.
// width: byte count. The loop subtracts 32 and repeats while the remainder is
//     positive, so at least one iteration always runs and a width that is not
//     a multiple of 32 is effectively rounded up.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #32 \n"  // 32 processed per loop
      "stp q0, q1, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // Each clobber must be its own string literal.
      : "cc", "memory", "v0", "v1");
}

// SetRow fills dst with the 8-bit value v8, 16 bytes per iteration.
//
// dst: destination; advances 16 bytes per iteration.
// v8: byte value replicated across the vector register.
// width: byte count. The loop subtracts 16 and repeats while the remainder is
//     positive, so at least one 16-byte store always happens and a width that
//     is not a multiple of 16 is effectively rounded up.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "dup v0.16b, %w2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %w1, %w1, #16 \n"  // 16 bytes per loop
      "st1 {v0.16b}, [%0], #16 \n"  // store
      "b.gt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      // Each clobber must be its own string literal.
      : "cc", "memory", "v0");
}

// Fills dst with the 32-bit value v32, 4 values (16 bytes) per iteration.
//
// dst: destination; advances 16 bytes per iteration.
// v32: 32-bit value replicated across the vector register.
// width: number of 32-bit values. The loop subtracts 4 and repeats while the
//     remainder is positive, so at least one 16-byte store always happens and
//     a width that is not a multiple of 4 is effectively rounded up.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "dup v0.4s, %w2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %w1, %w1, #4 \n"  // 4 ints per loop
      "st1 {v0.16b}, [%0], #16 \n"  // store
      "b.gt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      // Each clobber must be its own string literal.
      : "cc", "memory", "v0");
}

// Shuffle table for reversing the bytes: entry i holds 15 - i, so a table
// lookup (tbl) with this index vector emits the 16 bytes of a register in
// reverse order.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Writes the bytes of src to dst in reverse order, 32 bytes per iteration.
//
// src: source row. It is first moved to src + width - 32 and then walked
//     backwards 32 bytes at a time.
// dst: destination; advances 32 bytes per iteration.
// width: byte count. The loop subtracts 32 and repeats while the remainder is
//     positive; since reading starts 32 bytes before the end of the row, a
//     width below 32 reads before src.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v3.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q2, [%0, 16] \n"    // upper 16 bytes become the first output
      "ldr q1, [%0], -32 \n"   // src -= 32
      "subs %w2, %w2, #32 \n"  // 32 pixels per loop.
      "tbl v0.16b, {v2.16b}, v3.16b \n"
      "tbl v1.16b, {v1.16b}, v3.16b \n"
      "st1 {v0.16b, v1.16b}, [%1], #32 \n"  // store 32 pixels
      "b.gt 1b \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(&kShuffleMirror)  // %3
      // Each clobber must be its own string literal.
      : "cc", "memory", "v0", "v1", "v2", "v3");
}

// Shuffle table for reversing the UV: reverses the order of the eight 2-byte
// pairs in a 16-byte register while keeping the two bytes of each pair in
// their original order.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};

// Writes the 2-byte UV pairs of src_uv to dst_uv in reverse pair order (the
// bytes inside each pair keep their order), 16 pairs per iteration.
//
// src_uv: source row. It is first moved to src_uv + 2 * width - 32 and then
//     walked backwards 32 bytes at a time.
// dst_uv: destination; advances 32 bytes per iteration.
// width: number of UV pairs. The loop subtracts 16 and repeats while the
//     remainder is positive; since reading starts 32 bytes before the end of
//     the row, a width below 16 reads before src_uv.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v4.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw #1 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"    // upper 16 bytes become the first output
      "ldr q0, [%0], -32 \n"   // src -= 32
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
      "b.gt 1b \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_uv),           // %1
        "+r"(width)             // %2
      : "r"(&kShuffleMirrorUV)  // %3
      // Each clobber must be its own string literal.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}

void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v4.16b}, [%4] \n"  // shuffler
      "add %0, %0, %w3, sxtw #1 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n"  // src -= 32
      "subs %w3, %w3, #16 \n"  // 16 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "uzp1 v0.16b, v2.16b, v3.16b \n"  // U
      "uzp2 v1.16b, v2.16b, v3.16b \n"  // V
      "st1 {v0.16b}, [%1], #16 \n"  // dst += 16
      "st1 {v1.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_u),            // %1
        "+r"(dst_v),            // %2
        "+r"(width)             // %3
      : "r"(&kShuffleMirrorUV)  // %4
      : "cc""memory""v0""v1""v2""v3""v4");
}

// Shuffle table for reversing the ARGB.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
                                         4u,  5u,  6u,  7u,  0u, 1u, 2u,  3u};

void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v4.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw #2 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n"  // src -= 32
      "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
      "b.gt 1b \n"
      : "+r"(src_argb),           // %0
        "+r"(dst_argb),           // %1
        "+r"(width)               // %2
      : "r"(&kShuffleMirrorARGB)  // %3
      : "cc""memory""v0""v1""v2""v3""v4");
}

void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "ld1 {v3.16b}, [%4] \n"  // shuffler
      "add %0, %0, %w2, sxtw #1 \n"  // Start at end of row.
      "add %0, %0, %w2, sxtw \n"
      "sub %0, %0, #48 \n"

      "1: \n"
      "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n"  // src -= 48
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
      "tbl v0.16b, {v0.16b}, v3.16b \n"
      "tbl v1.16b, {v1.16b}, v3.16b \n"
      "tbl v2.16b, {v2.16b}, v3.16b \n"
      "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n"  // dst += 48
      "b.gt 1b \n"
      : "+r"(src_rgb24),      // %0
        "+r"(dst_rgb24),      // %1
        "+r"(width)           // %2
      : "r"((ptrdiff_t)-48),  // %3
        "r"(&kShuffleMirror)  // %4
      : "cc""memory""v0""v1""v2""v3");
}

void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "movi v4.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of
                                                       // RGB24.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc""memory""v1""v2""v3""v4"  // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v5.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "orr v3.8b, v1.8b, v1.8b \n"  // move g
      "orr v4.8b, v0.8b, v0.8b \n"  // move r
      "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
      "b.gt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5"  // Clobber List
  );
}

void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "movi v0.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n"  // read r g b
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "orr v2.8b, v4.8b, v4.8b \n"  // move g
      "orr v1.8b, v5.8b, v5.8b \n"  // move r
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store a b g r
      "b.gt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5"  // Clobber List
  );
}

void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"   // 8 processed per loop.
      "orr v3.8b, v1.8b, v1.8b \n"   // move g
      "orr v4.8b, v0.8b, v0.8b \n"   // move r
      "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
      "b.gt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4"  // Clobber List
  );
}

// Expand 8 RGB565 pixels held in v0.8h to 8-bit channels:
// b = v0.8b, g = v1.8b, r = v2.8b.
// Each 5- or 6-bit field is shifted to the top of its byte and its upper
// bits are OR-ed into the vacated low bits. Clobbers v4 and v6 as scratch.
#define RGB565TOARGB                                                        \
  "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG           */ \
  "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6   */ \
  "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2   */ \
  "orr v1.8b, v4.8b, v6.8b \n" /* G                    */ \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB           */ \
  "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR           */ \
  "xtn2 v2.16b,v0.8h \n" /* R in upper part      */ \
  "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
  "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
  "orr v0.16b, v0.16b, v2.16b \n" /* R,B                  */ \
  "dup v2.2D, v0.D[1] \n" /* R                    */

void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // Alpha
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v6"  // Clobber List
  );
}

// Expand 8 ARGB1555 pixels held in v0.8h to 8-bit channels:
// b = v0.8b, g = v1.8b, r = v2.8b, a = v3.8b.
// The 5-bit color fields are shifted to the top of their byte and their upper
// 3 bits are OR-ed into the low bits; the single alpha bit is spread over the
// whole byte with an arithmetic shift right by 15.
#define ARGB1555TOARGB                                                      \
  "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR           */ \
  "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5   */ \
  "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA    */ \
                                                                            \
  "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA           */ \
  "xtn2 v3.16b, v2.8h \n"                            \
                                                                            \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB           */ \
  "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
  "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr v0.16b, v0.16b, v2.16b \n" /* B,G                  */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R,A                  */ \
  "dup v1.2D, v0.D[1] \n"                            \
  "dup v3.2D, v2.D[1] \n"

// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input: 8 pixels in v0.8h. Output: b = v0.8b, g = v1.8b, r = v2.8b.
// v3 is used as scratch and does not hold a usable alpha value afterwards.
#define RGB555TOARGB                                                        \
  "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR           */ \
  "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5   */ \
  "xtn v3.8b, v2.8h \n" /* RRRRR000             */ \
                                                                            \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB           */ \
  "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr v1.16b, v3.16b, #5 \n" /* R   00000RRR lower 3 */ \
  "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr v0.16b, v0.16b, v2.16b \n" /* B,G                  */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R                    */ \
  "dup v1.2D, v0.D[1] \n" /* G */

void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // Alpha
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

// Convert 8 ARGB4444 pixels in v0.8h to
// b = v0.8b, g = v1.8b, r = v2.8b, a = v3.8b.
// Each 4-bit field is copied into both nibbles of its output byte.
#define ARGB4444TOARGB                                                      \
  "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR             */ \
  "xtn2 v1.16b, v0.8h \n" /* v1(h) GB             */ \
  "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000         */ \
  "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG         */ \
  "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB         */ \
  "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000         */ \
  "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB         */ \
  "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG         */ \
  "dup v0.2D, v2.D[1] \n"                            \
  "dup v1.2D, v3.D[1] \n"

void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4"  // Clobber List
  );
}

void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1: \n"
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"   // 8 processed per loop.
      "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
                                                       // RGB24
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc""memory""v1""v2""v3""v4"  // Clobber List
  );
}

void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1: \n"
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"   // 8 processed per loop.
      "orr v4.8b, v2.8b, v2.8b \n"   // mov g
      "orr v5.8b, v1.8b, v1.8b \n"   // mov b
      "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v1""v2""v3""v4""v5"  // Clobber List
  );
}

void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels of YUY2.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop.
      "st1 {v0.16b}, [%1], #16 \n"  // store 16 pixels of Y.
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1"  // Clobber List
  );
}

void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels of UYVY.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop.
      "st1 {v1.16b}, [%1], #16 \n"  // store 16 pixels of Y.
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1"  // Clobber List
  );
}

void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "st1 {v1.8b}, [%1], #8 \n"  // store 8 U.
      "st1 {v3.8b}, [%2], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 U.
      "st1 {v2.8b}, [%2], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 pixels = 8 UVs.
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd v1.8b, v1.8b, v5.8b \n"  // average rows of U
      "urhadd v3.8b, v3.8b, v7.8b \n"  // average rows of V
      "st1 {v1.8b}, [%2], #8 \n"  // store 8 U.
      "st1 {v3.8b}, [%3], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_yuy2),   // %0
        "+r"(src_yuy2b),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6",
        "v7"  // Clobber List
  );
}

void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 pixels = 8 UVs.
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd v0.8b, v0.8b, v4.8b \n"  // average rows of U
      "urhadd v2.8b, v2.8b, v6.8b \n"  // average rows of V
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 U.
      "st1 {v2.8b}, [%3], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_uyvy),   // %0
        "+r"(src_uyvyb),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6",
        "v7"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(
      "ld1 {v2.16b}, [%3] \n"  // shuffler
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 4 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #4 \n"  // 4 processed per loop
      "tbl v1.16b, {v0.16b}, v2.16b \n"  // look up 4 pixels
      "st1 {v1.16b}, [%1], #16 \n"  // store 4.
      "b.gt 1b \n"
      : "+r"(src_argb),                   // %0
        "+r"(dst_argb),                   // %1
        "+r"(width)                       // %2
      : "r"(shuffler)                     // %3
      : "cc""memory""v0""v1""v2"  // Clobber List
  );
}

void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.8b, v1.8b}, [%0], #16 \n"  // load 16 Ys
      "prfm pldl1keep, [%0, 448] \n"
      "orr v2.8b, v1.8b, v1.8b \n"
      "ld1 {v1.8b}, [%1], #8 \n"         // load 8 Us
      "ld1 {v3.8b}, [%2], #8 \n"         // load 8 Vs
      "subs %w4, %w4, #16 \n"         // 16 pixels
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc""memory""v0""v1""v2""v3");
}

void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "1: \n"
      "ld2 {v1.8b,v2.8b}, [%0], #16 \n"  // load 16 Ys
      "prfm pldl1keep, [%0, 448] \n"
      "orr v3.8b, v2.8b, v2.8b \n"
      "ld1 {v0.8b}, [%1], #8 \n"         // load 8 Us
      "ld1 {v2.8b}, [%2], #8 \n"         // load 8 Vs
      "subs %w4, %w4, #16 \n"         // 16 pixels
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc""memory""v0""v1""v2""v3");
}

void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
  asm volatile(
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGBTORGB565
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc""memory""v0""v20""v21""v22""v23");
}

void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "dup v1.4s, %w2 \n"  // dither4
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqadd v20.8b, v20.8b, v1.8b \n"
      "uqadd v21.8b, v21.8b, v1.8b \n"
      "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
      "st1 {v0.16b}, [%0], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(dst_rgb)   // %0
      : "r"(src_argb),  // %1
        "r"(dither4),   // %2
        "r"(width)      // %3
      : "cc""memory""v0""v1""v20""v21""v22""v23");
}

void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGBTOARGB1555
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v20""v21""v22""v23");
}

void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile(
      "movi v4.16b, #0x0f \n"  // bits to clear with
                                                      // vbic.
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGBTOARGB4444
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v1""v4""v20""v21""v22""v23");
}

void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v3.8h, v0.8b, v4.8b \n"  // B
      "umlal v3.8h, v1.8b, v5.8b \n"  // G
      "umlal v3.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7");
}

void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "st1 {v3.16b}, [%1], #16 \n"  // store 16 A's.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #29 \n"  // B * 0.1140 coefficient
      "movi v5.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v6.8b, #77 \n"  // R * 0.2990 coefficient
      "1: \n"
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=79 H=83 G=80

¤ Dauer der Verarbeitung: 0.23 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.