// Source code library · Statistics · Home page — products/Sources/formale Sprachen/C/Firefox/dom/tests/mochitest/keyhandling/ (browser by the Mozilla Foundation, version 136.0.1©) — file dated 2025-02-10, size 934 B, image not shown

// Source: row_neon64.cc   Language: unknown

 
/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 8 Y, 4 U and 4 V from 422.
//   v0.8b          <= 8 bytes from operand %0 (Y), pointer advanced by 8.
//   v1.s[0] (low)  <= 4 bytes from operand %1 (U), pointer advanced by 4.
//   v1.s[1] (high) <= 4 bytes from operand %2 (V), pointer advanced by 4.
// After the macro, the low 8 bytes of v1 hold U0..U3 followed by V0..V3,
// which is the layout YUVTORGB consumes.
#define READYUV422                               \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v1.s}[0], [%1], #4 \n" \
  "ld1 {v1.s}[1], [%2], #4 \n"

// Read 8 Y, 8 U and 8 V from 444.
//   v0.8b   <= 8 bytes from operand %0 (Y), pointer advanced by 8.
//   v1.d[0] <= 8 bytes from operand %1 (U), pointer advanced by 8.
//   v1.d[1] <= 8 bytes from operand %2 (V), pointer advanced by 8.
// uaddlp sums each pair of adjacent bytes into a 16-bit lane and rshrn
// halves the sums with rounding, so the low 8 bytes of v1 end up holding
// 4 pair-averaged U values followed by 4 pair-averaged V values (the same
// layout READYUV422 produces).
#define READYUV444                               \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v1.d}[0], [%1], #8 \n" \
  "ld1 {v1.d}[1], [%2], #8 \n" \
  "uaddlp v1.8h, v1.16b \n" \
  "rshrn v1.8b, v1.8h, #1 \n"

// Read 8 Y, and set 4 U and 4 V to 128.
//   v0.8b <= 8 bytes from operand %0 (Y), pointer advanced by 8.
//   v1.8b <= every byte set to 128, i.e. the 4 U and 4 V slots all carry the
//            same constant chroma value.
#define READYUV400                               \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "movi v1.8b , #128 \n"

// Read 8 Y and 4 UV from NV12.
//   v0.8b <= 8 bytes from operand %0 (Y), pointer advanced by 8.
//   v2.8b <= 8 interleaved bytes from operand %1, pointer advanced by 8.
// uzp1 gathers the even-indexed bytes of v2 (U for NV12) into v1 and uzp2
// gathers the odd-indexed bytes (V) into v3; the final ins copies the 4 V
// bytes into the upper 32 bits of v1's low half, leaving v1 = U0..U3,V0..V3.
// Uses v2 and v3 as scratch.
#define READNV12                                 \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v2.8b}, [%1], #8 \n" \
  "uzp1 v1.8b, v2.8b, v2.8b \n" \
  "uzp2 v3.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Read 8 Y and 4 VU from NV21.
//   v0.8b <= 8 bytes from operand %0 (Y), pointer advanced by 8.
//   v2.8b <= 8 interleaved bytes from operand %1, pointer advanced by 8.
// Same as READNV12 with the de-interleave destinations swapped: the
// even-indexed bytes (V for NV21) go to v3 and the odd-indexed bytes (U) go
// to v1, then ins places the 4 V bytes after the 4 U bytes so that
// v1 = U0..U3,V0..V3. Uses v2 and v3 as scratch.
#define READNV21                                 \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "ld1 {v2.8b}, [%1], #8 \n" \
  "uzp1 v3.8b, v2.8b, v2.8b \n" \
  "uzp2 v1.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Read 8 YUY2 pixels (16 bytes) from operand %0, pointer advanced by 16.
// ld2 de-interleaves the bytes: even-indexed bytes (Y) land in v0 and
// odd-indexed bytes (alternating U, V) land in v1. uzp2/uzp1 then split v1
// into its odd elements (V, in v3) and even elements (U, in v1), and ins
// places the 4 V bytes after the 4 U bytes so that v1 = U0..U3,V0..V3.
// Uses v3 as scratch.
#define READYUY2                                 \
  "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
  "uzp2 v3.8b, v1.8b, v1.8b \n" \
  "uzp1 v1.8b, v1.8b, v1.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Read 8 UYVY pixels (16 bytes) from operand %0, pointer advanced by 16.
// ld2 de-interleaves the bytes: even-indexed bytes (alternating U, V) land
// in v2 and odd-indexed bytes (Y) land in v3. orr copies Y into v0, uzp1
// gathers the U bytes into v1, uzp2 gathers the V bytes into v3, and ins
// places the 4 V bytes after the 4 U bytes so that v1 = U0..U3,V0..V3.
// Uses v2 and v3 as scratch.
#define READUYVY                                 \
  "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
  "orr v0.8b, v3.8b, v3.8b \n" \
  "uzp1 v1.8b, v2.8b, v2.8b \n" \
  "uzp2 v3.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"

// Loads the conversion constants consumed by YUVTORGB into v24-v31.
//   v24/v25/v26 : three consecutive 16-bit values at kUVBiasBGR, each
//                 broadcast to all 8 lanes (ld3r); YUVTORGB adds them to the
//                 B, G and R accumulators respectively.
//   v31         : the 32-bit value at kYToRgb broadcast to 4 lanes (ld1r);
//                 YUVTORGB multiplies the widened Y values by it.
//   v27/v28     : 16 halfwords at kUVToRB, de-interleaved (ld2) into
//                 even-indexed (v27, multiplied with U) and odd-indexed
//                 (v28, multiplied with V) elements.
//   v29/v30     : 16 halfwords at kUVToG, de-interleaved the same way
//                 (v29 multiplied with U, v30 multiplied with V).
// The enclosing asm statement must provide the named operands kUVBiasBGR,
// kYToRgb, kUVToRB and kUVToG, and must list v24-v31 as clobbered.
#define YUVTORGB_SETUP                                      \
  "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
  "ld1r {v31.4s}, [%[kYToRgb]] \n" \
  "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
  "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"

// clang-format off

// Converts 8 pixels of YUV to 8-bit R, G and B.
//
// Inputs:  v0.8b  = 8 Y values.
//          v1.8b  = U0..U3 followed by V0..V3 (as produced by the READ*
//                   macros).
//          v24-v31 = constants loaded by YUVTORGB_SETUP.
// Outputs: vR, vG, vB (register names supplied by the caller) receive 8
//          unsigned bytes each in their low 64 bits.
// Scratch: v0, v1, v2, v3, v5, v6, v7 are overwritten.
//
// Steps:
//   1. Y is widened to 32 bits, multiplied by v31 and narrowed back to 16
//      bits with a saturating right shift by 16.
//   2. shll + uaddw form (UV << 8) + UV, which duplicates every chroma byte
//      so that each U/V value lines up with two neighbouring Y values; the V
//      half is then moved to v2 and both halves are widened to 16 bits.
//   3. U and V are multiplied by the coefficient vectors v27-v30; the two
//      G products are summed into v6.
//   4. Each channel starts as bias + Y (v24/v25/v26); the U product is added
//      to B, the combined G product is subtracted from G, and the V product
//      is added to R, all with saturating arithmetic.
//   5. A saturating right shift by 6 narrows each channel to unsigned bytes.
#define YUVTORGB(vR, vG, vB)                                        \
  "uxtl v0.8h, v0.8b \n" /* Extract Y    */ \
  "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
  "ushll2 v3.4s, v0.8h, #0 \n" /* Y */            \
  "ushll v0.4s, v0.4h, #0 \n"                    \
  "mul v3.4s, v3.4s, v31.4s \n"                    \
  "mul v0.4s, v0.4s, v31.4s \n"                    \
  "sqshrun v0.4h, v0.4s, #16 \n"                    \
  "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */            \
  "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
  "mov v2.d[0], v1.d[1] \n" /* Extract V */    \
  "uxtl v2.8h, v2.8b \n"                    \
  "uxtl v1.8h, v1.8b \n" /* Extract U */    \
  "mul v3.8h, v27.8h, v1.8h \n"                    \
  "mul v5.8h, v29.8h, v1.8h \n"                    \
  "mul v6.8h, v30.8h, v2.8h \n"                    \
  "mul v7.8h, v28.8h, v2.8h \n"                    \
  "sqadd v6.8h, v6.8h, v5.8h \n"                    \
  "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */            \
  "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */            \
  "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */            \
  "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */            \
  "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */            \
  "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */            \
  "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */            \
  "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */            \
  "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */

// clang-format on

void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n" /* A */
      "1: \n"
    READYUV444
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n" /* A */

      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "ld1 {v23.8b}, [%3], #8 \n"
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "prfm pldl1keep, [%3, 448] \n"
      "subs %w5, %w5, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(src_a),     // %3
      "+r"(dst_argb),  // %4
      "+r"(width)      // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v20.8b, #255 \n" /* A */
      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v23, v22, v21)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READYUV422
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgb24), // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

// Packs 8 pixels of 8-bit B (v20), G (v21) and R (v22) into 8 16-bit values
// in v0.8h.
// Each channel is first moved into the top byte of a 16-bit lane (shll #8).
// sri then shifts G right by 5 and B right by 11 while inserting them below
// the bits already in v0, so each lane ends up as RRRRRGGG GGGBBBBB
// (5 bits R, 6 bits G, 5 bits B, most significant bit first).
// Overwrites v20 and v21 in addition to v0; v22 is only read.
#define ARGBTORGB565                                                        \
  "shll v0.8h, v22.8b, #8 \n" /* R                    */ \
  "shll v21.8h, v21.8b, #8 \n" /* G                    */ \
  "shll v20.8h, v20.8b, #8 \n" /* B                    */ \
  "sri v0.8h, v21.8h, #5 \n" /* RG                   */ \
  "sri v0.8h, v20.8h, #11 \n" /* RGB                  */

// clang-format off

void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
    YUVTORGB_SETUP
      "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
    ARGBTORGB565
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_u),       // %1
        "+r"(src_v),       // %2
        "+r"(dst_rgb565),  // %3
        "+r"(width)        // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
        "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30");
}

// Packs 8 pixels of 8-bit B (v20), G (v21), R (v22) and A (v23) into 8
// 16-bit values in v0.8h.
// Each channel is first moved into the top byte of a 16-bit lane (shll #8).
// sri then shifts R right by 1, G by 6 and B by 11 while inserting them
// below the bits already in v0, so each lane ends up as ARRRRRGG GGGBBBBB
// (1 bit A, 5 bits each of R, G and B, most significant bit first).
// Overwrites v20, v21 and v22 in addition to v0; v23 is only read.
#define ARGBTOARGB1555                                                      \
  "shll v0.8h, v23.8b, #8 \n" /* A                    */ \
  "shll v22.8h, v22.8b, #8 \n" /* R                    */ \
  "shll v21.8h, v21.8b, #8 \n" /* G                    */ \
  "shll v20.8h, v20.8b, #8 \n" /* B                    */ \
  "sri v0.8h, v22.8h, #1 \n" /* AR                   */ \
  "sri v0.8h, v21.8h, #6 \n" /* ARG                  */ \
  "sri v0.8h, v20.8h, #11 \n" /* ARGB                 */

void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
    ARGBTOARGB1555
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb1555),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
        "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30");
}
// clang-format on

// Packs 8 pixels of 8-bit B, G, R and A into 8 16-bit ARGB4444 values in v0.
// B and R are shifted down so their top nibble lands in the low nibble;
// G and A keep only their top nibble (bic with the 0x0f mask in v4).
// OR-ing the pairs gives one byte holding G:B and one holding A:R, and zip1
// interleaves those bytes pixel by pixel (GB byte first, then AR byte).
// Overwrites v20, v21, v22 and v23 in place, plus v0 and v1.
#define ARGBTOARGB4444                                                       \
  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
  "ushr v20.8b, v20.8b, #4 \n" /* B                    */  \
  "bic v21.8b, v21.8b, v4.8b \n" /* G                    */  \
  "ushr v22.8b, v22.8b, #4 \n" /* R                    */  \
  "bic v23.8b, v23.8b, v4.8b \n" /* A                    */  \
  "orr v0.8b, v20.8b, v21.8b \n" /* BG                   */  \
  "orr v1.8b, v22.8b, v23.8b \n" /* RA                   */  \
  "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA                 */

void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v4.16b, #0x0f \n"  // bits to clear with vbic.
      "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
      "movi v23.8b, #255 \n"
    ARGBTOARGB4444
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB4444.
      "b.gt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)     // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READYUV400
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v23.8b, #255 \n"
      "1: \n"
      "ld1 {v20.8b}, [%0], #8 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "orr v21.8b, v20.8b, v20.8b \n"
      "orr v22.8b, v20.8b, v20.8b \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v20""v21""v22""v23");
}

void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READNV12
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READNV21
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READNV12
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb24),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
      "1: \n"
    READNV21
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
      "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_rgb24),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP "1: \n" READNV12
                     "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
                         v22, v21, v20) ARGBTORGB565
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st1 {v0.8h}, [%2], 16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_uv),      // %1
        "+r"(dst_rgb565),  // %2
        "+r"(width)        // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
        "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30");
}

void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READYUY2
      "prfm pldl1keep, [%0, 448] \n"
    YUVTORGB(v22, v21, v20)
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
      "movi v23.8b, #255 \n"
      "1: \n"
    READUYVY
    YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
      "b.gt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7""v20",
      "v21""v22""v23""v24""v25""v26""v27""v28""v29""v30"
  );
}

// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pairs of UV
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "st1 {v0.16b}, [%1], #16 \n"  // store U
      "st1 {v1.16b}, [%2], #16 \n"  // store V
      "b.gt 1b \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load U
      "ld1 {v1.16b}, [%1], #16 \n"  // load V
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // store 16 pairs of UV
      "b.gt 1b \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1: \n"
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "st1 {v0.16b}, [%1], #16 \n"  // store R
      "st1 {v1.16b}, [%2], #16 \n"  // store G
      "st1 {v2.16b}, [%3], #16 \n"  // store B
      "b.gt 1b \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc""memory""v0""v1""v2"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load R
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v2.16b}, [%2], #16 \n"  // load B
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
      "prfm pldl1keep, [%0, 448] \n"
      "b.gt 1b \n"
      : "+r"(src_r),                      // %0
        "+r"(src_g),                      // %1
        "+r"(src_b),                      // %2
        "+r"(dst_rgb),                    // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc""memory""v0""v1""v2"  // Clobber List
  );
}

// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #32 \n"  // 32 processed per loop
      "stp q0, q1, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "dup v0.16b, %w2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %w1, %w1, #16 \n"  // 16 bytes per loop
      "st1 {v0.16b}, [%0], #16 \n"  // store
      "b.gt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc""memory""v0");
}

void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "dup v0.4s, %w2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %w1, %w1, #4 \n"  // 4 ints per loop
      "st1 {v0.16b}, [%0], #16 \n"  // store
      "b.gt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc""memory""v0");
}

// Shuffle table for reversing the bytes.
// Used as the index vector of a tbl instruction: output byte i is taken from
// input byte 15 - i, so a 16-byte vector comes out in reverse byte order.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v3.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q2, [%0, 16] \n"
      "ldr q1, [%0], -32 \n"  // src -= 32
      "subs %w2, %w2, #32 \n"  // 32 pixels per loop.
      "tbl v0.16b, {v2.16b}, v3.16b \n"
      "tbl v1.16b, {v1.16b}, v3.16b \n"
      "st1 {v0.16b, v1.16b}, [%1], #32 \n"  // store 32 pixels
      "b.gt 1b \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(&kShuffleMirror)  // %3
      : "cc""memory""v0""v1""v2""v3");
}

// Shuffle table for reversing the UV.
// Used as the index vector of a tbl instruction: the eight 2-byte pairs of a
// 16-byte vector are emitted in reverse order while the two bytes inside
// each pair keep their relative order.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};

void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v4.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw #1 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n"  // src -= 32
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
      "b.gt 1b \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_uv),           // %1
        "+r"(width)             // %2
      : "r"(&kShuffleMirrorUV)  // %3
      : "cc""memory""v0""v1""v2""v3""v4");
}

void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v4.16b}, [%4] \n"  // shuffler
      "add %0, %0, %w3, sxtw #1 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n"  // src -= 32
      "subs %w3, %w3, #16 \n"  // 16 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "uzp1 v0.16b, v2.16b, v3.16b \n"  // U
      "uzp2 v1.16b, v2.16b, v3.16b \n"  // V
      "st1 {v0.16b}, [%1], #16 \n"  // dst += 16
      "st1 {v1.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_u),            // %1
        "+r"(dst_v),            // %2
        "+r"(width)             // %3
      : "r"(&kShuffleMirrorUV)  // %4
      : "cc""memory""v0""v1""v2""v3""v4");
}

// Shuffle table for reversing the ARGB.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
                                         4u,  5u,  6u,  7u,  0u, 1u, 2u,  3u};

void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v4.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw #2 \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q1, [%0, 16] \n"
      "ldr q0, [%0], -32 \n"  // src -= 32
      "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
      "tbl v2.16b, {v1.16b}, v4.16b \n"
      "tbl v3.16b, {v0.16b}, v4.16b \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
      "b.gt 1b \n"
      : "+r"(src_argb),           // %0
        "+r"(dst_argb),           // %1
        "+r"(width)               // %2
      : "r"(&kShuffleMirrorARGB)  // %3
      : "cc""memory""v0""v1""v2""v3""v4");
}

void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "ld1 {v3.16b}, [%4] \n"  // shuffler
      "add %0, %0, %w2, sxtw #1 \n"  // Start at end of row.
      "add %0, %0, %w2, sxtw \n"
      "sub %0, %0, #48 \n"

      "1: \n"
      "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n"  // src -= 48
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
      "tbl v0.16b, {v0.16b}, v3.16b \n"
      "tbl v1.16b, {v1.16b}, v3.16b \n"
      "tbl v2.16b, {v2.16b}, v3.16b \n"
      "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n"  // dst += 48
      "b.gt 1b \n"
      : "+r"(src_rgb24),      // %0
        "+r"(dst_rgb24),      // %1
        "+r"(width)           // %2
      : "r"((ptrdiff_t)-48),  // %3
        "r"(&kShuffleMirror)  // %4
      : "cc""memory""v0""v1""v2""v3");
}

void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "movi v4.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of
                                                       // RGB24.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc""memory""v1""v2""v3""v4"  // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v5.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "orr v3.8b, v1.8b, v1.8b \n"  // move g
      "orr v4.8b, v0.8b, v0.8b \n"  // move r
      "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
      "b.gt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5"  // Clobber List
  );
}

void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "movi v0.8b, #255 \n"  // Alpha
      "1: \n"
      "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n"  // read r g b
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "orr v2.8b, v4.8b, v4.8b \n"  // move g
      "orr v1.8b, v5.8b, v5.8b \n"  // move r
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store a b g r
      "b.gt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5"  // Clobber List
  );
}

void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"   // 8 processed per loop.
      "orr v3.8b, v1.8b, v1.8b \n"   // move g
      "orr v4.8b, v0.8b, v0.8b \n"   // move r
      "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
      "b.gt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4"  // Clobber List
  );
}

// Asm fragment: expands 8 RGB565 pixels held in v0.8h into 8-bit channels.
// Each 5- or 6-bit field is shifted to the top of its byte and its upper bits
// are replicated into the vacated low bits.
// Input:    v0.8h (R in bits 15..11, G in bits 10..5, B in bits 4..0).
// Output:   v0.8b = B, v1.8b = G, v2.8b = R.
// Scratch:  v4, v6 (the upper halves of v0 and v2 are also overwritten).
#define RGB565TOARGB                                                        \
  "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG           */ \
  "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6   */ \
  "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2   */ \
  "orr v1.8b, v4.8b, v6.8b \n" /* G                    */ \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB           */ \
  "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR           */ \
  "xtn2 v2.16b,v0.8h \n" /* R in upper part      */ \
  "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
  "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
  "orr v0.16b, v0.16b, v2.16b \n" /* R,B                  */ \
  "dup v2.2D, v0.D[1] \n" /* R                    */

void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // Alpha
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v6"  // Clobber List
  );
}

// Asm fragment: expands 8 ARGB1555 pixels held in v0.8h into 8-bit channels.
// The 5-bit colour fields are shifted to the top of their byte with the upper
// 3 bits replicated into the low bits; the alpha bit is sign-extended so it
// becomes 0x00 or 0xFF.
// Input:    v0.8h (A in bit 15, R in bits 14..10, G in bits 9..5,
//           B in bits 4..0).
// Output:   v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
// Scratch:  none beyond v0-v3 (their upper halves are overwritten).
#define ARGB1555TOARGB                                                      \
  "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR           */ \
  "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5   */ \
  "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA    */ \
                                                                            \
  "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA           */ \
  "xtn2 v3.16b, v2.8h \n"                            \
                                                                            \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB           */ \
  "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
  "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr v0.16b, v0.16b, v2.16b \n" /* B,G                  */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R,A                  */ \
  "dup v1.2D, v0.D[1] \n"                            \
  "dup v3.2D, v2.D[1] \n"

// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input:    v0.8h (R in bits 14..10, G in bits 9..5, B in bits 4..0).
// Output:   v0.8b = B, v1.8b = G, v2.8b = R.
// Scratch:  v3 (used as a temporary for R; it does not hold alpha on exit).
#define RGB555TOARGB                                                        \
  "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR           */ \
  "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5   */ \
  "xtn v3.8b, v2.8h \n" /* RRRRR000             */ \
                                                                            \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB           */ \
  "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr v1.16b, v3.16b, #5 \n" /* R   00000RRR lower 3 */ \
  "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr v0.16b, v0.16b, v2.16b \n" /* B,G                  */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R                    */ \
  "dup v1.2D, v0.D[1] \n" /* G */

void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // Alpha
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

// Asm fragment: expands 8 ARGB4444 pixels held in v0.8h into 8-bit channels
// by duplicating each 4-bit field into both nibbles of its byte.
// Input:   v0.8h (A in bits 15..12, R in bits 11..8, G in bits 7..4,
//          B in bits 3..0).
// Output:  b = v0.8b g = v1.8b r = v2.8b a = v3.8b
// The upper halves of v0-v3 are overwritten; no other registers are used.
#define ARGB4444TOARGB                                                      \
  "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR             */ \
  "xtn2 v1.16b, v0.8h \n" /* v1(h) GB             */ \
  "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000         */ \
  "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG         */ \
  "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB         */ \
  "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000         */ \
  "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB         */ \
  "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG         */ \
  "dup v0.2D, v2.D[1] \n"                            \
  "dup v1.2D, v3.D[1] \n"

void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4"  // Clobber List
  );
}

void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1: \n"
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"   // 8 processed per loop.
      "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
                                                       // RGB24
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc""memory""v1""v2""v3""v4"  // Clobber List
  );
}

void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1: \n"
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"   // 8 processed per loop.
      "orr v4.8b, v2.8b, v2.8b \n"   // mov g
      "orr v5.8b, v1.8b, v1.8b \n"   // mov b
      "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v1""v2""v3""v4""v5"  // Clobber List
  );
}

void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels of YUY2.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop.
      "st1 {v0.16b}, [%1], #16 \n"  // store 16 pixels of Y.
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1"  // Clobber List
  );
}

void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels of UYVY.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop.
      "st1 {v1.16b}, [%1], #16 \n"  // store 16 pixels of Y.
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1"  // Clobber List
  );
}

void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "st1 {v1.8b}, [%1], #8 \n"  // store 8 U.
      "st1 {v3.8b}, [%2], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 U.
      "st1 {v2.8b}, [%2], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 pixels = 8 UVs.
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd v1.8b, v1.8b, v5.8b \n"  // average rows of U
      "urhadd v3.8b, v3.8b, v7.8b \n"  // average rows of V
      "st1 {v1.8b}, [%2], #8 \n"  // store 8 U.
      "st1 {v3.8b}, [%3], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_yuy2),   // %0
        "+r"(src_yuy2b),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6",
        "v7"  // Clobber List
  );
}

void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 pixels = 8 UVs.
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd v0.8b, v0.8b, v4.8b \n"  // average rows of U
      "urhadd v2.8b, v2.8b, v6.8b \n"  // average rows of V
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 U.
      "st1 {v2.8b}, [%3], #8 \n"  // store 8 V.
      "b.gt 1b \n"
      : "+r"(src_uyvy),   // %0
        "+r"(src_uyvyb),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6",
        "v7"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(
      "ld1 {v2.16b}, [%3] \n"  // shuffler
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 4 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #4 \n"  // 4 processed per loop
      "tbl v1.16b, {v0.16b}, v2.16b \n"  // look up 4 pixels
      "st1 {v1.16b}, [%1], #16 \n"  // store 4.
      "b.gt 1b \n"
      : "+r"(src_argb),                   // %0
        "+r"(dst_argb),                   // %1
        "+r"(width)                       // %2
      : "r"(shuffler)                     // %3
      : "cc""memory""v0""v1""v2"  // Clobber List
  );
}

void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.8b, v1.8b}, [%0], #16 \n"  // load 16 Ys
      "prfm pldl1keep, [%0, 448] \n"
      "orr v2.8b, v1.8b, v1.8b \n"
      "ld1 {v1.8b}, [%1], #8 \n"         // load 8 Us
      "ld1 {v3.8b}, [%2], #8 \n"         // load 8 Vs
      "subs %w4, %w4, #16 \n"         // 16 pixels
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc""memory""v0""v1""v2""v3");
}

void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "1: \n"
      "ld2 {v1.8b,v2.8b}, [%0], #16 \n"  // load 16 Ys
      "prfm pldl1keep, [%0, 448] \n"
      "orr v3.8b, v2.8b, v2.8b \n"
      "ld1 {v0.8b}, [%1], #8 \n"         // load 8 Us
      "ld1 {v2.8b}, [%2], #8 \n"         // load 8 Vs
      "subs %w4, %w4, #16 \n"         // 16 pixels
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc""memory""v0""v1""v2""v3");
}

void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
  asm volatile(
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGBTORGB565
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc""memory""v0""v20""v21""v22""v23");
}

void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "dup v1.4s, %w2 \n"  // dither4
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqadd v20.8b, v20.8b, v1.8b \n"
      "uqadd v21.8b, v21.8b, v1.8b \n"
      "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
      "st1 {v0.16b}, [%0], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(dst_rgb)   // %0
      : "r"(src_argb),  // %1
        "r"(dither4),   // %2
        "r"(width)      // %3
      : "cc""memory""v0""v1""v20""v21""v22""v23");
}

void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGBTOARGB1555
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v20""v21""v22""v23");
}

void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile(
      "movi v4.16b, #0x0f \n"  // bits to clear with
                                                      // vbic.
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
                                                                 // pixels
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGBTOARGB4444
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc""memory""v0""v1""v4""v20""v21""v22""v23");
}

void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v3.8h, v0.8b, v4.8b \n"  // B
      "umlal v3.8h, v1.8b, v5.8b \n"  // G
      "umlal v3.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3""v4""v5""v6""v7");
}

void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "st1 {v3.16b}, [%1], #16 \n"  // store 16 A's.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc""memory""v0""v1""v2""v3"  // Clobber List
  );
}

void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #29 \n"  // B * 0.1140 coefficient
      "movi v5.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v6.8b, #77 \n"  // R * 0.2990 coefficient
      "1: \n"
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=79 H=83 G=80

[ zur Elbe Produktseite wechseln0.29Quellennavigators  Analyse erneut starten  ]