Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  row_neon64.cc   Sprache: C

 
/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
// STn over ZIP1+ST1
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// v0.8h: Y
// v1.16b: 8U, 8V

// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 bytes from src_y and 4 bytes each from src_u and src_v, advancing
// every pointer by the number of bytes read (post-index addressing).
// Outputs:
//   v0.8h: 8 Y, each byte duplicated into both bytes of its 16-bit lane
//   v1.8b: 8 U, each of the 4 loaded samples repeated for 2 adjacent pixels
//   v2.8b: 8 V, each of the 4 loaded samples repeated for 2 adjacent pixels
// Prefetch hints are issued ahead of all three source pointers.
#define READYUV422                               \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr s1, [%[src_u]], #4 \n" \
  "ldr s2, [%[src_v]], #4 \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v1.8b, v1.8b, v1.8b \n" \
  "zip1 v2.8b, v2.8b, v2.8b \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 4 U and 4 V from 210.
// Loads 16 bytes (8 x 16-bit) from src_y and 8 bytes (4 x 16-bit) each from
// src_u and src_v, advancing every pointer by the number of bytes read.
// Y is widened to the full 16-bit range as (y << 6) + (y >> 4).
// Each U and V sample is repeated for 2 adjacent pixels, then narrowed to
// 8 bits with a saturating right shift by 2.
// Outputs:
//   v0.8h:  8 Y
//   v1.16b: 8 U in the low half, 8 V in the high half
// v2 and v3 are used as scratch registers.
#define READYUV210                               \
  "ldr q2, [%[src_y]], #16 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d3, [%[src_v]], #8 \n" \
  "shl v0.8h, v2.8h, #6 \n" \
  "usra v0.8h, v2.8h, #4 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v2.8h, v3.8h, v3.8h \n" \
  "zip1 v3.8h, v1.8h, v1.8h \n" \
  "uqshrn v1.8b, v3.8h, #2 \n" \
  "uqshrn2 v1.16b, v2.8h, #2 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 4 U and 4 V interleaved from 210.
// Loads 16 bytes (8 x 16-bit Y) from src_y into v0 and 16 bytes (4 x 16-bit
// UV pairs) from src_uv, advancing both pointers by 16.
// The UV bytes are then rearranged into v1 with a table lookup whose index
// vector must already be in v2; this macro does not load v2.
// Y is passed on unshifted - presumably the samples are stored in the most
// significant bits of each 16-bit word (TODO confirm).
// Only src_y gets a prefetch hint.
#define READYUVP210                              \
  "ldr q0, [%[src_y]], #16 \n" \
  "ldr q1, [%[src_uv]], #16 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v1.16b}, v2.16b \n"

// Read 8 Y, 4 U and 4 V from 212.
// Same structure as the 210 reader but with 12-bit shift amounts: loads
// 16 bytes (8 x 16-bit) from src_y and 8 bytes (4 x 16-bit) each from src_u
// and src_v, advancing every pointer by the number of bytes read.
// Y is widened to the full 16-bit range as (y << 4) + (y >> 8).
// Each U and V sample is repeated for 2 adjacent pixels, then narrowed to
// 8 bits with a saturating right shift by 4.
// Outputs:
//   v0.8h:  8 Y
//   v1.16b: 8 U in the low half, 8 V in the high half
// v2 and v3 are used as scratch registers.
#define READYUV212                               \
  "ldr q2, [%[src_y]], #16 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d3, [%[src_v]], #8 \n" \
  "shl v0.8h, v2.8h, #4 \n" \
  "usra v0.8h, v2.8h, #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v2.8h, v3.8h, v3.8h \n" \
  "zip1 v3.8h, v1.8h, v1.8h \n" \
  "uqshrn v1.8b, v3.8h, #4 \n" \
  "uqshrn2 v1.16b, v2.8h, #4 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V from 410.
// Loads 16 bytes (8 x 16-bit) from each of src_y, src_u and src_v, advancing
// every pointer by 16.
// Y is widened to the full 16-bit range as (y << 6) + (y >> 4).
// U and V are narrowed to 8 bits with a saturating right shift by 2.
// Outputs:
//   v0.8h:  8 Y
//   v1.16b: 8 U in the low half, 8 V in the high half
// v2 and v3 are used as scratch registers.
#define READYUV410                               \
  "ldr q1, [%[src_y]], #16 \n" \
  "ldr q2, [%[src_u]], #16 \n" \
  "ldr q3, [%[src_v]], #16 \n" \
  "shl v0.8h, v1.8h, #6 \n" \
  "usra v0.8h, v1.8h, #4 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "uqshrn v1.8b, v2.8h, #2 \n" \
  "uqshrn2 v1.16b, v3.8h, #2 \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V interleaved from 410.
// Loads 16 bytes (8 x 16-bit Y) from src_y into v0 and 32 bytes (8 x 16-bit
// UV pairs) from src_uv into v4/v5, advancing src_y by 16 and src_uv by 32.
// The UV bytes are then gathered into v1 with a two-register table lookup
// whose index vector must already be in v2; this macro does not load v2.
// v4 and v5 are scratch here and are overwritten by the RGB conversion macros.
// Only src_y gets a prefetch hint.
#define READYUVP410                                \
  "ldr q0, [%[src_y]], #16 \n" \
  "ldp q4, q5, [%[src_uv]], #32 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v4.16b, v5.16b}, v2.16b \n"

// Read 8 Y, 8 U and 8 V from 444.
// Loads 8 bytes from each of src_y, src_u and src_v, advancing every pointer
// by 8.
// Outputs:
//   v0.8h: 8 Y, each byte duplicated into both bytes of its 16-bit lane
//   v1.8b: 8 U
//   v2.8b: 8 V
// Prefetch hints are issued ahead of all three source pointers.
#define READYUV444                               \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr d1, [%[src_u]], #8 \n" \
  "ldr d2, [%[src_v]], #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 448] \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_v], 448] \n"

// Read 8 Y.
// Loads 8 bytes from src_y (advancing it by 8) and duplicates each byte into
// both bytes of its 16-bit lane, leaving 8 Y values in v0.8h.
// No chroma registers are written.
#define READYUV400                               \
  "ldr d0, [%[src_y]], #8 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n"

// Byte-index vectors for the `tbl` shuffles in the read macros. Each entry
// names the source byte copied to that output position. Every index appears
// twice in a row, so one chroma sample is shared by two adjacent pixels.
// The first 8 entries fill the low half of the result and the last 8 the
// high half.

// Even source bytes (0, 2, 4, 6) to the low half, odd bytes (1, 3, 5, 7) to
// the high half.
static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
                                 1, 1, 3, 3, 5, 5, 7, 7};
// Source bytes 0, 4, 8, 12 to the low half and 2, 6, 10, 14 to the high half
// of a 16-byte source.
static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8,  8,  12, 12,
                                            2, 2, 6, 6, 10, 10, 14, 14};
// Odd source bytes (1, 3, 5, 7) to the low half, even bytes (0, 2, 4, 6) to
// the high half; the two halves are swapped relative to kNV12Table.
static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
                                 0, 0, 2, 2, 4, 4, 6, 6};
// Source bytes 1, 5, 9, 13 to the low half and 3, 7, 11, 15 to the high half
// of a 16-byte source.
static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9,  9,  13, 13,
                                            3, 3, 7, 7, 11, 11, 15, 15};

// Read 8 Y and 4 UV from NV12 or NV21.
// Loads 8 bytes from src_y and 8 bytes (4 interleaved chroma pairs) from
// src_uv, advancing both pointers by 8.
// Y bytes are duplicated into both bytes of each 16-bit lane of v0.
// The chroma bytes are rearranged into v1 by a table lookup whose index
// vector must already be in v2; this macro does not load v2, so the caller
// chooses the chroma byte order through the table it loads.
#define READNV12                                 \
  "ldr d0, [%[src_y]], #8 \n" \
  "ldr d1, [%[src_uv]], #8 \n" \
  "zip1 v0.16b, v0.16b, v0.16b \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "tbl v1.16b, {v1.16b}, v2.16b \n" \
  "prfm pldl1keep, [%[src_uv], 448] \n"

// Read 8 YUY2 pixels.
// Loads 16 bytes from src_yuy2 into v3 and advances the pointer by 16.
// trn1 copies every even-numbered byte of v3 into both bytes of the matching
// 16-bit lane of v0 (the Y samples).
// The chroma bytes are gathered into v1 by a table lookup whose index vector
// must already be in v2; this macro does not load v2.
#define READYUY2                                 \
  "ld1 {v3.16b}, [%[src_yuy2]], #16 \n" \
  "trn1 v0.16b, v3.16b, v3.16b \n" \
  "prfm pldl1keep, [%[src_yuy2], 448] \n" \
  "tbl v1.16b, {v3.16b}, v2.16b \n"

// Read 8 UYVY pixels.
// Loads 16 bytes from src_uyvy into v3 and advances the pointer by 16.
// trn2 copies every odd-numbered byte of v3 into both bytes of the matching
// 16-bit lane of v0 (the Y samples).
// The chroma bytes are gathered into v1 by a table lookup whose index vector
// must already be in v2; this macro does not load v2.
#define READUYVY                                 \
  "ld1 {v3.16b}, [%[src_uyvy]], #16 \n" \
  "trn2 v0.16b, v3.16b, v3.16b \n" \
  "prfm pldl1keep, [%[src_uyvy], 448] \n" \
  "tbl v1.16b, {v3.16b}, v2.16b \n"

// Load the conversion constants and broadcast each one to every lane.
// Four consecutive bytes at kUVCoeff go to v28..v31:
//   v28: UB   v29: VR   v30: UG   v31: VG
// Four consecutive 16-bit values at kRGBCoeffBias go to v24..v27:
//   v24: YG   v25: BB   v26: BG   v27: BR
#define YUVTORGB_SETUP                                                \
  "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
  "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"

// v16.8h: B
// v17.8h: G
// v18.8h: R

// Convert from YUV (NV12 or NV21) to 2.14 fixed point RGB.
// Similar to I4XXTORGB but U/V components are in the low/high halves of v1.
// Inputs: v0.8h Y, v1.16b U (low half) and V (high half), plus the constants
// broadcast into v24..v31.
// Computation per pixel, with saturating subtracts:
//   Y  = high 16 bits of (Y * v24)
//   DG = U * v30 + V * v31,  DB = U * v28,  DR = V * v29
//   G  = Y + v26 - DG,  B = Y + DB - v25,  R = Y + DR - v27
// Outputs: v16.8h B, v17.8h G, v18.8h R. Clobbers v0, v3, v4, v5, v6.
#define NVTORGB                                           \
  "umull2 v3.4s, v0.8h, v24.8h \n"          \
  "umull v6.8h, v1.8b, v30.8b \n"          \
  "umull v0.4s, v0.4h, v24.4h \n"          \
  "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */  \
  "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
  "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */  \
  "add v16.8h, v0.8h, v4.8h \n" /* B */  \
  "add v18.8h, v0.8h, v5.8h \n" /* R */  \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */  \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */  \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from YUV (I444 or I420) to 2.14 fixed point RGB.
// Similar to NVTORGB but U/V components are in v1/v2.
// Inputs: v0.8h Y, v1.8b U, v2.8b V, plus the constants broadcast into
// v24..v31.
// Computation per pixel, with saturating subtracts:
//   Y  = high 16 bits of (Y * v24)
//   DG = U * v30 + V * v31,  DB = U * v28,  DR = V * v29
//   G  = Y + v26 - DG,  B = Y + DB - v25,  R = Y + DR - v27
// Outputs: v16.8h B, v17.8h G, v18.8h R. Clobbers v0, v3, v4, v5, v6.
#define I4XXTORGB                                         \
  "umull2 v3.4s, v0.8h, v24.8h \n"          \
  "umull v6.8h, v1.8b, v30.8b \n"          \
  "umull v0.4s, v0.4h, v24.4h \n"          \
  "umlal v6.8h, v2.8b, v31.8b \n" /* DG */ \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */  \
  "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
  "umull v5.8h, v2.8b, v29.8b \n" /* DR */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */  \
  "add v16.8h, v0.8h, v4.8h \n" /* B */  \
  "add v18.8h, v0.8h, v5.8h \n" /* R */  \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */  \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */  \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from YUV I400 to 2.14 fixed point RGB.
// Only Y is scaled here (high 16 bits of Y * v24). The chroma terms in
// v4 (DB), v5 (DR) and v6 (DG) are read but never written by this macro, so
// the caller must have set them before use - presumably once, outside the
// pixel loop (TODO confirm against the I400 row function).
// Outputs: v16.8h B, v17.8h G, v18.8h R. Clobbers v0 and v3.
#define I400TORGB                                        \
  "umull2 v3.4s, v0.8h, v24.8h \n"         \
  "umull v0.4s, v0.4h, v24.4h \n"         \
  "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
  "add v17.8h, v0.8h, v26.8h \n" /* G */ \
  "add v16.8h, v0.8h, v4.8h \n" /* B */ \
  "add v18.8h, v0.8h, v5.8h \n" /* R */ \
  "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
  "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
  "uqsub v18.8h, v18.8h, v27.8h \n" /* R */

// Convert from 2.14 fixed point RGB to 8 bit RGB.
// Each 16-bit lane of v16 (B), v17 (G) and v18 (R) is shifted right by 6 and
// narrowed to a byte with unsigned saturation; results land in the low
// 8 bytes of the same registers.
#define RGBTORGB8                                \
  "uqshrn v17.8b, v17.8h, #6 \n" \
  "uqshrn v16.8b, v16.8h, #6 \n" \
  "uqshrn v18.8b, v18.8h, #6 \n"

// Convert from 2.14 fixed point RGB to 8 bit RGB, placing the results in the
// top half of each lane.
// Each 16-bit lane of v16 (B), v17 (G) and v18 (R) is shifted left by 2 with
// unsigned saturation, so the lanes stay 16 bits wide and the 8 significant
// bits occupy the upper byte.
#define RGBTORGB8_TOP                            \
  "uqshl v17.8h, v17.8h, #2 \n" \
  "uqshl v16.8h, v16.8h, #2 \n" \
  "uqshl v18.8h, v18.8h, #2 \n"

// Store 2.14 fixed point RGB as AR30 elements.
// Packs 8 pixels into 8 little-endian 32-bit words laid out (from the most
// significant bit) as 2 bits alpha, 10 bits R, 10 bits G, 10 bits B, and
// writes 32 bytes to dst_ar30, advancing the pointer by 32.
// v22 (clamp limit) and v23 (alpha bits) must be set up by the caller.
// Clobbers v0, v1, v4, v5, v6.
// Every line up to the last must end in a backslash, including the line that
// closes the embedded comment; otherwise the macro body is cut short.
#define STOREAR30                                                         \
  /* Inputs:                                                              \
   *   v16.8h: xxbbbbbbbbbbxxxx                                           \
   *   v17.8h: xxggggggggggxxxx                                           \
   *   v18.8h: xxrrrrrrrrrrxxxx                                           \
   *   v22.8h: 0011111111110000 (umin limit)                              \
   *   v23.8h: 1100000000000000 (alpha)                                   \
   */                                                                     \
  "uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
  "uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \
  "umin v6.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \
  "shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \
  "orr v5.16b, v6.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \
  "sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \
  "sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \
  "st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n"

// Clobber-list fragment naming the vector registers written by the read and
// YUV-to-RGB macros: v0-v7 (loads and scratch), v16-v18 (B, G, R results)
// and v24-v31 (conversion constants).
// Each register must be its own comma-separated string literal; adjacent
// literals without a comma are concatenated into one invalid register name.
#define YUVTORGB_REGS                                                         \
  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
      "v25", "v26", "v27", "v28", "v29", "v30", "v31"

void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n" READYUV444 I4XXTORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I444ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV444 I4XXTORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

void I210ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  uint16_t limit = 0x3ff0;
  uint16_t alpha = 0xc000;
  asm volatile(
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "dup v23.8h, %w[alpha] \n"
      "1: \n" READYUV210 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),             // %[src_y]
        [src_u] "+r"(src_u),             // %[src_u]
        [src_v] "+r"(src_v),             // %[src_v]
        [dst_ar30] "+r"(dst_ar30),       // %[dst_ar30]
        [width] "+r"(width)              // %[width]
      : [kUVCoeff] "r"(uv_coeff),        // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),  // %[kRGBCoeffBias]
        [limit] "r"(limit),              // %[limit]
        [alpha] "r"(alpha)               // %[alpha]
      : "cc""memory", YUVTORGB_REGS, "v22""v23");
}

void I410ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  uint16_t limit = 0x3ff0;
  uint16_t alpha = 0xc000;
  asm volatile(
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "dup v23.8h, %w[alpha] \n"
      "1: \n" READYUV410 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),             // %[src_y]
        [src_u] "+r"(src_u),             // %[src_u]
        [src_v] "+r"(src_v),             // %[src_v]
        [dst_ar30] "+r"(dst_ar30),       // %[dst_ar30]
        [width] "+r"(width)              // %[width]
      : [kUVCoeff] "r"(uv_coeff),        // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),  // %[kRGBCoeffBias]
        [limit] "r"(limit),              // %[limit]
        [alpha] "r"(alpha)               // %[alpha]
      : "cc""memory", YUVTORGB_REGS, "v22""v23");
}

void I212ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;
  asm volatile(
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n"  // A
      "1: \n" READYUV212 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),             // %[src_y]
        [src_u] "+r"(src_u),             // %[src_u]
        [src_v] "+r"(src_v),             // %[src_v]
        [dst_ar30] "+r"(dst_ar30),       // %[dst_ar30]
        [width] "+r"(width)              // %[width]
      : [kUVCoeff] "r"(uv_coeff),        // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),  // %[kRGBCoeffBias]
        [limit] "r"(limit)               // %[limit]
      : "cc""memory", YUVTORGB_REGS, "v22""v23");
}

void I210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV210 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV410 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I212ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n" READYUV212 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),            // %[src_y]
        [src_u] "+r"(src_u),            // %[src_u]
        [src_v] "+r"(src_v),            // %[src_v]
        [dst_argb] "+r"(dst_argb),      // %[dst_argb]
        [width] "+r"(width)             // %[width]
      : [kUVCoeff] "r"(uv_coeff),       // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n" READYUV422 I4XXTORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

// tbl index vector for 16 bytes holding 4 interleaved 16-bit UV pairs.
// Bytes 1, 5, 9, 13 go to the low half and bytes 3, 7, 11, 15 to the high
// half, each selected twice so one chroma sample covers two adjacent pixels.
// On a little-endian layout these are the upper bytes of the 16-bit samples.
// NOTE(review): the visible code only reads this table and its contents match
// kNV21InterleavedTable; it could probably be static const - confirm nothing
// outside this file references the symbol before changing linkage.
uint8_t kP210LoadShuffleIndices[] = {1, 1, 5, 5, 9,  9,  13, 13,
                                     3, 3, 7, 7, 11, 11, 15, 15};

void P210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kIndices]] \n"
      "1: \n"  //
      READYUVP210 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                     // %[src_y]
        [src_uv] "+r"(src_uv),                   // %[src_uv]
        [dst_argb] "+r"(dst_argb),               // %[dst_argb]
        [width] "+r"(width)                      // %[width]
      : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
        [kIndices] "r"(kP210LoadShuffleIndices)  // %[kIndices]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

// tbl index vector for 32 bytes (two registers) holding 8 interleaved 16-bit
// UV pairs. Bytes 1, 5, ..., 29 go to the low half and bytes 3, 7, ..., 31 to
// the high half, each selected once (one chroma sample per pixel).
// On a little-endian layout these are the upper bytes of the 16-bit samples.
// NOTE(review): the visible code only reads this table; it could probably be
// static const - confirm nothing outside this file references the symbol
// before changing linkage.
uint8_t kP410LoadShuffleIndices[] = {1, 5, 9,  13, 17, 21, 25, 29,
                                     3, 7, 11, 15, 19, 23, 27, 31};

void P410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kIndices]] \n"
      "1: \n"  //
      READYUVP410 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                     // %[src_y]
        [src_uv] "+r"(src_uv),                   // %[src_uv]
        [dst_argb] "+r"(dst_argb),               // %[dst_argb]
        [width] "+r"(width)                      // %[width]
      : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
        [kIndices] "r"(kP410LoadShuffleIndices)  // %[kIndices]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void P210ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;
  asm volatile(YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n"  // A
      "ldr q2, [%[kIndices]] \n"
      "1: \n" READYUVP210 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
               : [src_y] "+r"(src_y),                     // %[src_y]
                 [src_uv] "+r"(src_uv),                   // %[src_uv]
                 [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
                 [width] "+r"(width)                      // %[width]
               : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
                 [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
                 [limit] "r"(limit),                      // %[limit]
                 [kIndices] "r"(kP210LoadShuffleIndices)  // %[kIndices]
               : "cc""memory", YUVTORGB_REGS, "v22""v23");
}

void P410ToAR30Row_NEON(const uint16_t* src_y,
                        const uint16_t* src_uv,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  uint16_t limit = 0x3ff0;
  asm volatile(YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n"  // A
      "ldr q2, [%[kIndices]] \n"
      "1: \n" READYUVP410 NVTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
               : [src_y] "+r"(src_y),                     // %[src_y]
                 [src_uv] "+r"(src_uv),                   // %[src_uv]
                 [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
                 [width] "+r"(width)                      // %[width]
               : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
                 [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
                 [limit] "r"(limit),                      // %[limit]
                 [kIndices] "r"(kP410LoadShuffleIndices)  // %[kIndices]
               : "cc""memory", YUVTORGB_REGS, "v22""v23");
}

void I422ToAR30Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_ar30,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
  const uint16_t limit = 0x3ff0;
  asm volatile(
      YUVTORGB_SETUP
      "dup v22.8h, %w[limit] \n"
      "movi v23.8h, #0xc0, lsl #8 \n"  // A
      "1: \n" READYUV422 I4XXTORGB
      "subs %w[width], %w[width], #8 \n" STOREAR30
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),             // %[src_y]
        [src_u] "+r"(src_u),             // %[src_u]
        [src_v] "+r"(src_v),             // %[src_v]
        [dst_ar30] "+r"(dst_ar30),       // %[dst_ar30]
        [width] "+r"(width)              // %[width]
      : [kUVCoeff] "r"(uv_coeff),        // %[kUVCoeff]
        [kRGBCoeffBias] "r"(rgb_coeff),  // %[kRGBCoeffBias]
        [limit] "r"(limit)               // %[limit]
      : "cc""memory", YUVTORGB_REGS, "v22""v23");
}

void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
      "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
      "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v15.8b, #255 \n" /* A */
      "1: \n" READYUV422 I4XXTORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgba] "+r"(dst_rgba),                         // %[dst_rgba]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v15");
}

void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

#define ARGBTORGB565                                                     \
  /* Pack 8 pixels of 8-bit B, G, R into RGB565 halfwords in v18.8h.     \
   * Inputs (one byte per pixel; only the top bits survive):             \
   * v16: bbbbbxxx                                                       \
   * v17: ggggggxx                                                       \
   * v18: rrrrrxxx                                                       \
   * v16 and v17 are overwritten with their widened copies. */           \
  "shll v18.8h, v18.8b, #8 \n" /* rrrrrxxx00000000 */                    \
  "shll v17.8h, v17.8b, #8 \n" /* ggggggxx00000000 */                    \
  "shll v16.8h, v16.8b, #8 \n" /* bbbbbxxx00000000 */                    \
  "sri v18.8h, v17.8h, #5 \n"  /* rrrrrggggggxx000 */                    \
  "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */

#define ARGBTORGB565_FROM_TOP                                            \
  /* Pack 8 pixels into RGB565 halfwords in v18.8h when each channel     \
   * already sits in the top bits of a 16-bit lane.                      \
   * Inputs:                                                             \
   * v16: bbbbbxxxxxxxxxxx                                               \
   * v17: ggggggxxxxxxxxxx                                               \
   * v18: rrrrrxxxxxxxxxxx */                                            \
  "sri v18.8h, v17.8h, #5 \n"  /* rrrrrggggggxxxxx */                    \
  "sri v18.8h, v16.8h, #11 \n" /* rrrrrggggggbbbbb */

void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
          RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

#define ARGBTOARGB1555                                                   \
  /* Pack 8 pixels into ARGB1555 halfwords in v17.8h.                    \
   * Inputs (16-bit lanes):                                              \
   * v16: gggggxxxbbbbbxxx  v17: axxxxxxxrrrrrxxx                        \
   * v1 and v2 are used as scratch registers. */                         \
  "shl v1.8h, v16.8h, #8 \n"   /* bbbbbxxx00000000 */                    \
  "shl v2.8h, v17.8h, #8 \n"   /* rrrrrxxx00000000 */                    \
  "sri v17.8h, v2.8h, #1 \n"   /* arrrrrxxx0000000 */                    \
  "sri v17.8h, v16.8h, #6 \n"  /* arrrrrgggggxxxbb */                    \
  "sri v17.8h, v1.8h, #11 \n"  /* arrrrrgggggbbbbb */

#define ARGBTOARGB1555_FROM_TOP                                          \
  /* Pack 8 pixels into ARGB1555 halfwords in v19.8h when each channel   \
   * already sits in the top bits of a 16-bit lane.                      \
   * Inputs:                                                             \
   * v16: bbbbbxxxxxxxxxxx  v17: gggggxxxxxxxxxxx                        \
   * v18: rrrrrxxxxxxxxxxx  v19: axxxxxxxxxxxxxxx                        \
   * Only bit 15 of v19 is preserved by the first insert. */             \
  "sri v19.8h, v18.8h, #1 \n"  /* arrrrrxxxxxxxxxx */                    \
  "sri v19.8h, v17.8h, #6 \n"  /* arrrrrgggggxxxxx */                    \
  "sri v19.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */

void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8h, #0x80, lsl #8 \n"
      "1: \n"  //
      READYUV422 I4XXTORGB RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n"  //
      ARGBTOARGB1555_FROM_TOP
      "st1 {v19.8h}, [%[dst_argb1555]], #16 \n"  // store 8 pixels RGB1555.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb1555] "+r"(dst_argb1555),                 // %[dst_argb1555]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

#define ARGBTOARGB4444                                                   \
  /* Pack 8 pixels to 4 bits per channel; the result lands in v0.8h.     \
   * Inputs: v16.8b = B, v17.8b = G, v18.8b = R, v19.8b = A.             \
   * v17 and v19 are modified in place. */                               \
  "sri v17.8b, v16.8b, #4 \n"        /* G high nibble, B low nibble */   \
  "sri v19.8b, v18.8b, #4 \n"        /* A high nibble, R low nibble */   \
  "zip1 v0.16b, v17.16b, v19.16b \n" /* interleave GB and AR bytes */

void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 I4XXTORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "movi v19.8b, #255 \n" ARGBTOARGB4444
      "st1 {v0.8h}, [%[dst_argb4444]], #16 \n"  // store 8
                                                        // pixels
                                                        // ARGB4444.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb4444] "+r"(dst_argb4444),                 // %[dst_argb4444]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v1.16b, #128 \n"
      "movi v19.8b, #255 \n"
      "umull v6.8h, v1.8b, v30.8b \n"
      "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */
      "umull v4.8h, v1.8b, v28.8b \n" /* DB */
      "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */
      "1: \n" READYUV400 I400TORGB
          RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "v19");
}

#if defined(LIBYUV_USE_ST4)
// Expand one row of 8-bit grey (J400) to ARGB using ST4: every Y byte is
// copied to the B, G and R channels and alpha is set to 255.
//
// Handles 8 pixels (8 bytes in, 32 bytes out) per iteration. The width test
// follows the store, so the loop always runs at least once and always works in
// whole groups of 8 pixels.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v23.8b, #255 \n"  // A
      "1: \n"
      "ld1 {v20.8b}, [%0], #8 \n"  // load 8 Y -> B
      "prfm pldl1keep, [%0, 448] \n"
      "mov v21.8b, v20.8b \n"  // G = Y
      "mov v22.8b, v20.8b \n"  // R = Y
      "subs %w2, %w2, #8 \n"   // 8 processed per loop
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v20", "v21", "v22", "v23");
}
#else
// Expand one row of 8-bit grey (J400) to ARGB without ST4: two rounds of ZIP
// build the Y,Y,Y,255 byte pattern, which is then written with one STP.
//
// Handles 8 pixels (8 bytes in, 32 bytes out) per iteration. The width test
// follows the store, so the loop always runs at least once and always works in
// whole groups of 8 pixels.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v20.8b, #255 \n"  // A
      "1: \n"
      "ldr d16, [%0], #8 \n"   // load 8 Y
      "subs %w2, %w2, #8 \n"   // 8 processed per loop
      "zip1 v18.16b, v16.16b, v16.16b \n"  // YY
      "zip1 v19.16b, v16.16b, v20.16b \n"  // YA
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v16.16b, v18.16b, v19.16b \n"  // YYYA, pixels 0..3
      "zip2 v17.16b, v18.16b, v19.16b \n"  // YYYA, pixels 4..7
      "stp q16, q17, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
}
#endif  // LIBYUV_USE_ST4

void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc""memory", YUVTORGB_REGS, "v2""v19");
}

void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_vu),                              // %[src_uv]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc""memory", YUVTORGB_REGS, "v2""v19");
}

void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc""memory", YUVTORGB_REGS, "v2");
}

void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_vu),                              // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc""memory", YUVTORGB_REGS, "v2");
}

void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n" READNV12 NVTORGB
          RGBTORGB8_TOP
      "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8
                                                       // pixels
                                                       // RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_rgb565] "+r"(dst_rgb565),                      // %[dst_rgb565]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc""memory", YUVTORGB_REGS, "v2");
}

void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV21InterleavedTable]] \n"
      "1: \n" READYUY2 NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_yuy2] "+r"(src_yuy2),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV21InterleavedTable] "r"(&kNV21InterleavedTable)
      : "cc""memory", YUVTORGB_REGS, "v2""v19");
}

void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12InterleavedTable]] \n"
      "1: \n" READUYVY NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_uyvy] "+r"(src_uyvy),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12InterleavedTable] "r"(&kNV12InterleavedTable)
      : "cc""memory", YUVTORGB_REGS, "v2""v19");
}

// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pairs of UV
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%1], #16 \n"  // store U
      "st1 {v1.16b}, [%2], #16 \n"  // store V
      "b.gt 1b \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// Reads 16 byte Y's from tile and writes out 16 Y's.
// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
// width measured in bytes so 8 UV = 16.
void DetileRow_NEON(const uint8_t* src,
                    ptrdiff_t src_tile_stride,
                    uint8_t* dst,
                    int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], %3 \n"  // load 16 bytes
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 1792] \n"  // 7 tiles of 256b ahead
      "st1 {v0.16b}, [%1], #16 \n"  // store 16 bytes
      "b.gt 1b \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(src_tile_stride)  // %3
      : "cc""memory""v0"  // Clobber List
  );
}

// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's.
void DetileRow_16_NEON(const uint16_t* src,
                       ptrdiff_t src_tile_stride,
                       uint16_t* dst,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.8h,v1.8h}, [%0], %3 \n"  // load 16 pixels
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 3584] \n"  // 7 tiles of 512b ahead
      "st1 {v0.8h,v1.8h}, [%1], #32 \n"  // store 16 pixels
      "b.gt 1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "r"(src_tile_stride * 2)    // %3
      : "cc""memory""v0""v1"  // Clobber List
  );
}

// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                           ptrdiff_t src_tile_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      "1: \n"
      "ld2 {v0.8b,v1.8b}, [%0], %4 \n"
      "subs %w3, %w3, #16 \n"
      "prfm pldl1keep, [%0, 1792] \n"
      "st1 {v0.8b}, [%1], #8 \n"
      "st1 {v1.8b}, [%2], #8 \n"
      "b.gt 1b \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3
      : "r"(src_tile_stride)        // %4
      : "cc""memory""v0""v1"  // Clobber List
  );
}

#if defined(LIBYUV_USE_ST2)
// Read 16 Y, 8 UV, and write 8 YUY2
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], %4 \n"  // load 16 Ys
      "prfm pldl1keep, [%0, 1792] \n"
      "ld1 {v1.16b}, [%1], %5 \n"  // load 8 UVs
      "prfm pldl1keep, [%1, 1792] \n"
      "subs %w3, %w3, #16 \n"  // store 8 YUY2
      "st2 {v0.16b,v1.16b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),                // %0
        "+r"(src_uv),               // %1
        "+r"(dst_yuy2),             // %2
        "+r"(width)                 // %3
      : "r"(src_y_tile_stride),     // %4
        "r"(src_uv_tile_stride)     // %5
      : "cc""memory""v0""v1"  // Clobber list
  );
}
#else
// Read 16 Y, 8 UV, and write 8 YUY2
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], %4 \n"  // load 16 Ys
      "ld1 {v1.16b}, [%1], %5 \n"  // load 8 UVs
      "subs %w3, %w3, #16 \n"
      "prfm pldl1keep, [%0, 1792] \n"
      "zip1 v2.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%1, 1792] \n"
      "zip2 v3.16b, v0.16b, v1.16b \n"
      "st1 {v2.16b,v3.16b}, [%2], #32 \n"  // store 8 YUY2
      "b.gt 1b \n"
      : "+r"(src_y),                            // %0
        "+r"(src_uv),                           // %1
        "+r"(dst_yuy2),                         // %2
        "+r"(width)                             // %3
      : "r"(src_y_tile_stride),                 // %4
        "r"(src_uv_tile_stride)                 // %5
      : "cc""memory""v0""v1""v2""v3"  // Clobber list
  );
}
#endif

// Unpack MT2T into tiled P010 64 pixels at a time. See
// tinyurl.com/mtk-10bit-video-format for format documentation.
//
// Each iteration consumes 80 source bytes: 16 bytes holding four 2-bit low
// parts each (v7), followed by 64 bytes of upper 8 bits (v0..v3). Shifting v7
// left by 6/4/2/0 moves the relevant 2-bit field into bits 7:6 of a byte. ZIP
// then pairs that byte (low) with the upper-8-bit byte (high), which leaves
// the 10-bit sample in the top of a 16-bit lane. The final SRI #10 copies the
// top 6 bits into the low 6 bits and overwrites whatever the byte shift left
// there.
//
// size counts source bytes. The size test follows the stores, so the loop
// always runs at least once and always works in whole 80-byte groups.
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
  asm volatile(
      "1: \n"
      "ld1 {v7.16b}, [%0], #16 \n"         // packed 2-bit low parts
      "ld1 {v0.16b-v3.16b}, [%0], #64 \n"  // upper 8 bits of 64 pixels
      "shl v4.16b, v7.16b, #6 \n"          // bits 1:0 -> 7:6
      "shl v5.16b, v7.16b, #4 \n"          // bits 3:2 -> 7:6
      "shl v6.16b, v7.16b, #2 \n"          // bits 5:4 -> 7:6
      "subs %2, %2, #80 \n"                // 80 source bytes per loop
      "zip1 v16.16b, v4.16b, v0.16b \n"
      "zip1 v18.16b, v5.16b, v1.16b \n"
      "zip1 v20.16b, v6.16b, v2.16b \n"
      "zip1 v22.16b, v7.16b, v3.16b \n"
      "zip2 v17.16b, v4.16b, v0.16b \n"
      "zip2 v19.16b, v5.16b, v1.16b \n"
      "zip2 v21.16b, v6.16b, v2.16b \n"
      "zip2 v23.16b, v7.16b, v3.16b \n"
      "sri v16.8h, v16.8h, #10 \n"
      "sri v17.8h, v17.8h, #10 \n"
      "sri v18.8h, v18.8h, #10 \n"
      "sri v19.8h, v19.8h, #10 \n"
      "st1 {v16.8h-v19.8h}, [%1], #64 \n"  // store pixels 0..31
      "sri v20.8h, v20.8h, #10 \n"
      "sri v21.8h, v21.8h, #10 \n"
      "sri v22.8h, v22.8h, #10 \n"
      "sri v23.8h, v23.8h, #10 \n"
      "st1 {v20.8h-v23.8h}, [%1], #64 \n"  // store pixels 32..63
      "b.gt 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(size)  // %2
      :
      : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}

#if defined(LIBYUV_USE_ST2)
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load U
      "ld1 {v1.16b}, [%1], #16 \n"  // load V
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // store 16 pairs of UV
      "b.gt 1b \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""v0""v1"  // Clobber List
  );
}

void MergeUVRow_16_NEON(const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint16_t* dst_uv,
                        int depth,
                        int width) {
  int shift = 16 - depth;
  asm volatile(
      "dup v2.8h, %w4 \n"
      "1: \n"
      "ld1 {v0.8h}, [%0], #16 \n"  // load 8 U
      "subs %w3, %w3, #8 \n"  // 8 src pixels per loop
      "ld1 {v1.8h}, [%1], #16 \n"  // load 8 V
      "ushl v0.8h, v0.8h, v2.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "ushl v1.8h, v1.8h, v2.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "st2 {v0.8h, v1.8h}, [%2], #32 \n"  // store 8 UV pixels
      "b.gt 1b \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      : "r"(shift)     // %4
      : "cc""memory""v0""v1""v2");
}
#else
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load U
      "ld1 {v1.16b}, [%1], #16 \n"  // load V
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "zip1 v2.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "zip2 v3.16b, v0.16b, v1.16b \n"
      "prfm pldl1keep, [%1, 448] \n"
      "st1 {v2.16b,v3.16b}, [%2], #32 \n"  // store 16 pairs of UV
      "b.gt 1b \n"
      : "+r"(src_u),                            // %0
        "+r"(src_v),                            // %1
        "+r"(dst_uv),                           // %2
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=88 H=91 G=89

¤ Dauer der Verarbeitung: 0.19 Sekunden  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.






                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge