Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  row_neon.cc   Sprache: C

 
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are
// reserved.

// q0: Y uint16x8_t
// d2: U uint8x8_t
// d3: V uint8x8_t

// Read 8 Y, 4 U and 4 V from 422
// Loads 8 Y bytes into d0 and 4 U / 4 V bytes into the two 32-bit lanes of
// d2, post-incrementing all three source pointers.
// On exit:
//   q0 - 8 x u16, each lane holding its Y byte in both the low and high byte
//        (d0 copied to d1, then byte-zipped with itself).
//   d2 - U bytes, each sample repeated twice (U0 U0 U1 U1 ...).
//   d3 - V bytes, each sample repeated twice (V0 V0 V1 V1 ...).
// The chroma duplication comes from widening to 16 bits and then inserting
// each lane into itself shifted left by 8.
#define READYUV422                               \
  "vld1.8 {d0}, [%[src_y]]! \n" \
  "vld1.32 {d2[0]}, [%[src_u]]! \n" \
  "vld1.32 {d2[1]}, [%[src_v]]! \n" \
  "vmov.u8 d1, d0 \n" \
  "vmovl.u8 q1, d2 \n" \
  "vzip.u8 d0, d1 \n" \
  "vsli.u16 q1, q1, #8 \n"

// Read 8 Y, 8 U and 8 V from 444
// Loads 8 bytes from each of src_y, src_u and src_v, post-incrementing the
// pointers.
// On exit:
//   q0 - 8 x u16, each lane holding its Y byte in both the low and high byte
//        (widened to 16 bits, then inserted into itself shifted left by 8).
//   d2 - 8 U bytes, one per pixel.
//   d3 - 8 V bytes, one per pixel.
#define READYUV444                               \
  "vld1.8 {d0}, [%[src_y]]! \n" \
  "vld1.8 {d2}, [%[src_u]]! \n" \
  "vmovl.u8 q0, d0 \n" \
  "vld1.8 {d3}, [%[src_v]]! \n" \
  "vsli.u16 q0, q0, #8 \n"

// Read 8 Y, and set 4 U and 4 V to 128
// Loads 8 Y bytes from src_y (post-incremented). There is no chroma source:
// every byte of q1 (d2 = U, d3 = V) is set to the constant 128.
// On exit q0 holds 8 x u16 lanes with the Y byte in both the low and high
// byte.
#define READYUV400                               \
  "vld1.8 {d0}, [%[src_y]]! \n" \
  "vmov.u8 q1, #128 \n" \
  "vmovl.u8 q0, d0 \n" \
  "vsli.u16 q0, q0, #8 \n"

// Read 8 Y and 4 UV from NV12
// Loads 8 Y bytes from src_y and 8 interleaved chroma bytes (U0 V0 U1 V1 ...)
// from src_uv, post-incrementing both pointers.
// On exit:
//   q0 - 8 x u16, each lane holding its Y byte in both bytes (d0 zipped with
//        a copy of itself).
//   d2 - U bytes, each sample repeated twice (low byte of every UV pair
//        copied over the high byte).
//   d3 - V bytes, each sample repeated twice (high byte of every UV pair
//        copied over the low byte).
#define READNV12                                                              \
  "vld1.8 {d0}, [%[src_y]]! \n"                              \
  "vld1.8 {d2}, [%[src_uv]]! \n"                              \
  "vmov.u8 d1, d0 \n"                              \
  "vmov.u8 d3, d2 \n"                              \
  "vzip.u8 d0, d1 \n"                              \
  "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \
  "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */

// Read 8 Y and 4 VU from NV21
// Same as the NV12 reader except that the chroma bytes arrive in V, U order
// (V0 U0 V1 U1 ...) from src_vu, so the two shift-insert directions are
// swapped.
// On exit:
//   q0 - 8 x u16, each lane holding its Y byte in both bytes.
//   d2 - U bytes, each sample repeated twice (high byte of every VU pair
//        copied over the low byte).
//   d3 - V bytes, each sample repeated twice (low byte of every VU pair
//        copied over the high byte).
#define READNV21                                                               \
  "vld1.8 {d0}, [%[src_y]]! \n"                               \
  "vld1.8 {d2}, [%[src_vu]]! \n"                               \
  "vmov.u8 d1, d0 \n"                               \
  "vmov.u8 d3, d2 \n"                               \
  "vzip.u8 d0, d1 \n"                               \
  "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \
  "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */

// Read 8 YUY2
// Loads 16 bytes from src_yuy2 (post-incremented) with a 2-way de-interleave:
// even bytes (the 8 Y samples) go to d0, odd bytes (U0 V0 U1 V1 ...) to d2.
// On exit:
//   q0 - 8 x u16, each lane holding its Y byte in both bytes.
//   d2 - U bytes, each sample repeated twice.
//   d3 - V bytes, each sample repeated twice.
#define READYUY2                                 \
  "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \
  "vmovl.u8 q0, d0 \n" \
  "vmov.u8 d3, d2 \n" \
  "vsli.u16 q0, q0, #8 \n" \
  "vsli.u16 d2, d2, #8 \n" \
  "vsri.u16 d3, d3, #8 \n"

// Read 8 UYVY
// Loads 16 bytes from src_uyvy (post-incremented) with a 2-way de-interleave:
// even bytes (U0 V0 U1 V1 ...) go to d2, odd bytes (the 8 Y samples) to d3.
// Y is widened into q0 before d3 is overwritten with the chroma copy.
// On exit:
//   q0 - 8 x u16, each lane holding its Y byte in both bytes.
//   d2 - U bytes, each sample repeated twice.
//   d3 - V bytes, each sample repeated twice.
#define READUYVY                                 \
  "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \
  "vmovl.u8 q0, d3 \n" \
  "vmov.u8 d3, d2 \n" \
  "vsli.u16 q0, q0, #8 \n" \
  "vsli.u16 d2, d2, #8 \n" \
  "vsri.u16 d3, d3, #8 \n"

// Loads the conversion constants once, before the pixel loop.
// Requires asm operands named kRGBCoeffBias and kUVCoeff.
//   d26..d29 - the first four bytes at kUVCoeff, each broadcast to all 8 byte
//              lanes of its own register. YUVTORGB multiplies U by d26 for
//              blue, V by d27 for red, and U by d28 / V by d29 for green.
//   d31      - 16-bit lane 0 at kRGBCoeffBias, broadcast; multiplied with Y.
//   q10      - lane 1, broadcast; subtracted from blue.
//   q11      - lane 2, broadcast; added on the green path.
//   q12      - lane 3, broadcast; subtracted from red.
// d31 is rewritten last because the three preceding vdup instructions still
// read its other lanes.
// TODO: Use single register for kUVCoeff and multiply by lane
#define YUVTORGB_SETUP                                        \
  "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \
  "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
  "vdup.u16 q10, d31[1] \n" \
  "vdup.u16 q11, d31[2] \n" \
  "vdup.u16 q12, d31[3] \n" \
  "vdup.u16 d31, d31[0] \n"

// q0: B uint16x8_t
// q1: G uint16x8_t
// q2: R uint16x8_t

// Convert from YUV to 2.14 fixed point RGB
// Inputs: q0 = 8 Y as u16, d2 = 8 U bytes, d3 = 8 V bytes (as left by the
// READ* macros), plus the constants loaded by YUVTORGB_SETUP.
// Steps:
//   1. Y is multiplied by d31 into 32-bit products and narrowed back to u16
//      with a saturating shift right by 16.
//   2. The chroma terms are 8x8 -> 16-bit products: q9 = U * d26 (blue),
//      q2 = V * d27 (red), q8 = V * d29 + U * d28 (green).
//   3. B = sat(Y + q9 - q10), G = sat(Y + q11 - q8), R = sat(Y + q2 - q12);
//      the subtractions are unsigned saturating, so results clamp at 0.
// Outputs: q0 = B, q1 = G, q2 = R as u16 lanes.
// Scratch: q4, q8 and q9. q4 overlaps d8/d9, which the note near the top of
// this file says must be preserved if used.
#define YUVTORGB                                           \
  "vmull.u16 q2, d1, d31 \n"           \
  "vmull.u8 q8, d3, d29 \n" /* DGV */ \
  "vmull.u16 q0, d0, d31 \n"           \
  "vmlal.u8 q8, d2, d28 \n" /* DG */  \
  "vqshrn.u32 d0, q0, #16 \n"           \
  "vqshrn.u32 d1, q2, #16 \n" /* Y */   \
  "vmull.u8 q9, d2, d26 \n" /* DB */  \
  "vmull.u8 q2, d3, d27 \n" /* DR */  \
  "vadd.u16 q4, q0, q11 \n" /* G */   \
  "vadd.u16 q2, q0, q2 \n" /* R */   \
  "vadd.u16 q0, q0, q9 \n" /* B */   \
  "vqsub.u16 q1, q4, q8 \n" /* G */   \
  "vqsub.u16 q0, q0, q10 \n" /* B */   \
  "vqsub.u16 q2, q2, q12 \n" /* R */

// Convert from 2.14 fixed point RGB To 8 bit RGB
// Narrows the u16 lanes of q2 (R), q1 (G) and q0 (B) to bytes with an
// unsigned saturating shift right by 6, so values too large for a byte clamp
// to 255.
// On exit d0 = 8 B bytes, d2 = 8 G bytes, d4 = 8 R bytes. R is narrowed first
// because writing d0 and d2 destroys the low halves of q0 and q1.
#define RGBTORGB8                                        \
  "vqshrn.u16 d4, q2, #6 \n" /* R */ \
  "vqshrn.u16 d2, q1, #6 \n" /* G */ \
  "vqshrn.u16 d0, q0, #6 \n" /* B */

// Clobber-list entries for every NEON register written by YUVTORGB_SETUP,
// the READ* macros, YUVTORGB and RGBTORGB8. Each register must be its own
// comma-separated string: adjacent string literals are concatenated by the
// compiler into one (invalid) register name.
#define YUVTORGB_REGS \
  "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"

// Stores 8 pixels to dst_rgba (post-incremented by 32 bytes) with the bytes
// of each pixel in A, B, G, R memory order.
// Expects d0 = B, d2 = G, d4 = R and d6 = A. The registers are shuffled into
// the consecutive group d0..d3 for the 4-way interleaving store: B moves to
// d1, R to d3 and A to d0, while G stays in d2.
#define STORERGBA                                \
  "vmov.u8 d1, d0 \n" \
  "vmov.u8 d3, d4 \n" \
  "vmov.u8 d0, d6 \n" \
  "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n"

void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV444 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void I444ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV444 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV444 YUVTORGB
          RGBTORGB8
      "vld1.8 {d6}, [%[src_a]]! \n"
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
          RGBTORGB8
      "vld1.8 {d6}, [%[src_a]]! \n"
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB
      RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgba] "+r"(dst_rgba),                         // %[dst_rgba]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

// Packs 8 pixels from 8-bit channels into 16-bit RGB565 values in q2.
// Expects d4 = R, d2 = G, d0 = B (bytes). Each channel is widened with a
// shift left by 8 so its value sits in the top byte of a u16 lane; G is then
// shift-inserted 5 bits down into R (keeping the top 5 bits of R), and B is
// shift-inserted 11 bits down (keeping the 5 R and 6 G bits above it).
// Result per lane: bits 15..11 = R, bits 10..5 = G, bits 4..0 = B.
// Overwrites q0, q1 and q2.
#define ARGBTORGB565                                                        \
  "vshll.u8 q2, d4, #8 \n" /* R                    */ \
  "vshll.u8 q1, d2, #8 \n" /* G                    */ \
  "vshll.u8 q0, d0, #8 \n" /* B                    */ \
  "vsri.16 q2, q1, #5 \n" /* RG                   */ \
  "vsri.16 q2, q0, #11 \n" /* RGB                  */

void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB
      RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565
      "vst1.8 {q2}, [%[dst_rgb565]]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

// Packs 8 pixels from 8-bit channels into 16-bit ARGB1555 values in q3.
// Expects d6 = A, d4 = R, d2 = G, d0 = B (bytes). Each channel is widened
// with a shift left by 8 so its value sits in the top byte of a u16 lane,
// then R, G and B are shift-inserted into the alpha lane 1, 6 and 11 bits
// down respectively; each insert keeps the bits above its insertion point.
// Result per lane: bit 15 = top bit of A, bits 14..10 = R, bits 9..5 = G,
// bits 4..0 = B.
// Overwrites q0, q1, q2 and q3.
#define ARGBTOARGB1555                                                      \
  "vshll.u8 q3, d6, #8 \n" /* A                    */ \
  "vshll.u8 q2, d4, #8 \n" /* R                    */ \
  "vshll.u8 q1, d2, #8 \n" /* G                    */ \
  "vshll.u8 q0, d0, #8 \n" /* B                    */ \
  "vsri.16 q3, q2, #1 \n" /* AR                   */ \
  "vsri.16 q3, q1, #6 \n" /* ARG                  */ \
  "vsri.16 q3, q0, #11 \n" /* ARGB                 */

void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vmov.u8 d6, #0xff \n" ARGBTOARGB1555
      "vst1.8 {q3}, [%[dst_argb1555]]! \n"  // store 8 pixels RGB1555.
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb1555] "+r"(dst_argb1555),                 // %[dst_argb1555]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "q3");
}

// Packs 8 pixels from 8-bit channels into 16-bit ARGB4444 values in q0.
// Expects d0 = B, d2 = G, d4 = R, d6 = A (bytes) and d7 = the bit mask to
// clear from G and A (the caller in this file loads 0x0f into every byte).
// B and R are shifted down into the low nibble, G and A keep only their high
// nibble, and the pairs are OR-ed into one byte each: d0 = G:B, d1 = A:R.
// The final byte zip interleaves them so every pixel is stored as the G:B
// byte followed by the A:R byte.
// Overwrites d0, d1, d2, d4 and d6.
#define ARGBTOARGB4444                                                      \
  "vshr.u8 d0, d0, #4 \n" /* B                    */ \
  "vbic.32 d2, d2, d7 \n" /* G                    */ \
  "vshr.u8 d4, d4, #4 \n" /* R                    */ \
  "vbic.32 d6, d6, d7 \n" /* A                    */ \
  "vorr d0, d0, d2 \n" /* BG                   */ \
  "vorr d1, d4, d6 \n" /* RA                   */ \
  "vzip.u8 d0, d1 \n" /* BGRA                 */

void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "vmov.u8 d7, #0x0f \n"  // vbic bits to clear
      "1: \n" READYUV422 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n" ARGBTOARGB4444
      "vst1.8 {q0}, [%[dst_argb4444]]! \n"  // store 8 pixels
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb4444] "+r"(dst_argb4444),                 // %[dst_argb4444]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "q3");
}

void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV400 YUVTORGB
          RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d23, #255 \n"
      "1: \n"
      "vld1.8 {d20}, [%0]! \n"
      "vmov d21, d20 \n"
      "vmov d22, d20 \n"
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""d20""d21""d22""d23");
}

void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV21 YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_vu] "+r"(src_vu),                             // %[src_vu]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV21 YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_vu] "+r"(src_vu),                             // %[src_vu]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n" ARGBTORGB565
      "vst1.8 {q2}, [%[dst_rgb565]]! \n"  // store 8 pixels RGB565.
      "bgt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS);
}

void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUY2 YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_yuy2] "+r"(src_yuy2),                         // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READUYVY YUVTORGB RGBTORGB8
      "subs %[width], %[width], #8 \n"
      "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt 1b \n"
      : [src_uyvy] "+r"(src_uyvy),                         // %[src_uyvy]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_REGS, "d6");
}

// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pairs of UV
      "subs %3, %3, #16 \n"  // 16 processed per loop
      "vst1.8 {q0}, [%1]! \n"  // store U
      "vst1.8 {q1}, [%2]! \n"  // store V
      "bgt 1b \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""q0""q1"  // Clobber List
  );
}

// Reads 16 byte Y's from tile and writes out 16 Y's.
// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
// width measured in bytes so 8 UV = 16.
void DetileRow_NEON(const uint8_t* src,
                    ptrdiff_t src_tile_stride,
                    uint8_t* dst,
                    int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0], %3 \n"  // load 16 bytes
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "pld [%0, #1792] \n"
      "vst1.8 {q0}, [%1]! \n"  // store 16 bytes
      "bgt 1b \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(src_tile_stride)  // %3
      : "cc""memory""q0"  // Clobber List
  );
}

// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's.
void DetileRow_16_NEON(const uint16_t* src,
                       ptrdiff_t src_tile_stride,
                       uint16_t* dst,
                       int width) {
  asm volatile(
      "1: \n"
      "vld1.16 {q0, q1}, [%0], %3 \n"  // load 16 pixels
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "pld [%0, #3584] \n"
      "vst1.16 {q0, q1}, [%1]! \n"  // store 16 pixels
      "bgt 1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "r"(src_tile_stride * 2)    // %3
      : "cc""memory""q0""q1"  // Clobber List
  );
}

// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                           ptrdiff_t src_tile_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {d0, d1}, [%0], %4 \n"
      "subs %3, %3, #16 \n"
      "pld [%0, #1792] \n"
      "vst1.8 {d0}, [%1]! \n"
      "vst1.8 {d1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3
      : "r"(src_tile_stride)        // %4
      : "cc""memory""d0""d1"  // Clobber List
  );
}

#if defined(LIBYUV_USE_ST2)
// Read 16 Y, 8 UV, and write 8 YUYV.
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
      "pld [%0, #1792] \n"
      "vld1.8 {q1}, [%1], %5 \n"  // Load 8 UV
      "pld [%1, #1792] \n"
      "subs %3, %3, #16 \n"
      "vst2.8 {q0, q1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),                            // %0
        "+r"(src_uv),                           // %1
        "+r"(dst_yuy2),                         // %2
        "+r"(width)                             // %3
      : "r"(src_y_tile_stride),                 // %4
        "r"(src_uv_tile_stride)                 // %5
      : "cc""memory""d0""d1""d2""d3"  // Clobber list
  );
}
#else
// Read 16 Y, 8 UV, and write 8 YUYV.
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
      "vld1.8 {q1}, [%1], %5 \n"  // Load 8 UV
      "subs %3, %3, #16 \n"
      "pld [%0, #1792] \n"
      "vzip.8 q0, q1 \n"
      "pld [%1, #1792] \n"
      "vst1.8 {q0, q1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),                            // %0
        "+r"(src_uv),                           // %1
        "+r"(dst_yuy2),                         // %2
        "+r"(width)                             // %3
      : "r"(src_y_tile_stride),                 // %4
        "r"(src_uv_tile_stride)                 // %5
      : "cc""memory""q0""q1""q2""q3"  // Clobber list
  );
}
#endif

void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
  asm volatile(
      "1: \n"
      "vld1.8 {q14}, [%0]! \n"  // Load lower bits.
      "vld1.8 {q9}, [%0]! \n"  // Load upper bits row
                                                      // by row.
      "vld1.8 {q11}, [%0]! \n"
      "vld1.8 {q13}, [%0]! \n"
      "vld1.8 {q15}, [%0]! \n"
      "vshl.u8 q8, q14, #6 \n"  // Shift lower bit data
                                                      // appropriately.
      "vshl.u8 q10, q14, #4 \n"
      "vshl.u8 q12, q14, #2 \n"
      "vzip.u8 q8, q9 \n"  // Interleave upper and
                                                      // lower bits.
      "vzip.u8 q10, q11 \n"
      "vzip.u8 q12, q13 \n"
      "vzip.u8 q14, q15 \n"
      "vsri.u16 q8, q8, #10 \n"  // Copy upper 6 bits
                                                      // into lower 6 bits for
                                                      // better accuracy in
                                                      // conversions.
      "vsri.u16 q9, q9, #10 \n"
      "vsri.u16 q10, q10, #10 \n"
      "vsri.u16 q11, q11, #10 \n"
      "vsri.u16 q12, q12, #10 \n"
      "vsri.u16 q13, q13, #10 \n"
      "vsri.u16 q14, q14, #10 \n"
      "vsri.u16 q15, q15, #10 \n"
      "vstmia %1!, {q8-q15} \n"  // Store pixel block (64
                                                      // pixels).
      "subs %2, %2, #80 \n"
      "bgt 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(size)  // %2
      :
      : "cc""memory""q8""q9""q10""q11""q12""q13""q14""q15");
}

// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load U
      "vld1.8 {q1}, [%1]! \n"  // load V
      "subs %3, %3, #16 \n"  // 16 processed per loop
      "vst2.8 {q0, q1}, [%2]! \n"  // store 16 pairs of UV
      "bgt 1b \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc""memory""q0""q1"  // Clobber List
  );
}

// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // next 8 RGB
      "subs %4, %4, #16 \n"  // 16 processed per loop
      "vst1.8 {q0}, [%1]! \n"  // store R
      "vst1.8 {q1}, [%2]! \n"  // store G
      "vst1.8 {q2}, [%3]! \n"  // store B
      "bgt 1b \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc""memory""q0""q1""q2"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G
      "vld1.8 {q2}, [%2]! \n"  // load B
      "subs %4, %4, #16 \n"  // 16 processed per loop
      "vst3.8 {d0, d2, d4}, [%3]! \n"  // store 8 RGB
      "vst3.8 {d1, d3, d5}, [%3]! \n"  // next 8 RGB
      "bgt 1b \n"
      : "+r"(src_r),                      // %0
        "+r"(src_g),                      // %1
        "+r"(src_b),                      // %2
        "+r"(dst_rgb),                    // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc""memory""q0""q1""q2"  // Clobber List
  );
}

// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
void SplitARGBRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // next 8 ARGB
      "subs %5, %5, #16 \n"  // 16 processed per loop
      "vst1.8 {q0}, [%3]! \n"  // store B
      "vst1.8 {q1}, [%2]! \n"  // store G
      "vst1.8 {q2}, [%1]! \n"  // store R
      "vst1.8 {q3}, [%4]! \n"  // store A
      "bgt 1b \n"
      : "+r"(src_argb),                         // %0
        "+r"(dst_r),                            // %1
        "+r"(dst_g),                            // %2
        "+r"(dst_b),                            // %3
        "+r"(dst_a),                            // %4
        "+r"(width)                             // %5
      :                                         // Input registers
      : "cc""memory""q0""q1""q2""q3"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G
      "vld1.8 {q0}, [%2]! \n"  // load B
      "vld1.8 {q3}, [%3]! \n"  // load A
      "subs %5, %5, #16 \n"  // 16 processed per loop
      "vst4.8 {d0, d2, d4, d6}, [%4]! \n"  // store 8 ARGB
      "vst4.8 {d1, d3, d5, d7}, [%4]! \n"  // next 8 ARGB
      "bgt 1b \n"
      : "+r"(src_r),                            // %0
        "+r"(src_g),                            // %1
        "+r"(src_b),                            // %2
        "+r"(src_a),                            // %3
        "+r"(dst_argb),                         // %4
        "+r"(width)                             // %5
      :                                         // Input registers
      : "cc""memory""q0""q1""q2""q3"  // Clobber List
  );
}

// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // next 8 ARGB
      "subs %4, %4, #16 \n"  // 16 processed per loop
      "vst1.8 {q0}, [%3]! \n"  // store B
      "vst1.8 {q1}, [%2]! \n"  // store G
      "vst1.8 {q2}, [%1]! \n"  // store R
      "bgt 1b \n"
      : "+r"(src_argb),                         // %0
        "+r"(dst_r),                            // %1
        "+r"(dst_g),                            // %2
        "+r"(dst_b),                            // %3
        "+r"(width)                             // %4
      :                                         // Input registers
      : "cc""memory""q0""q1""q2""q3"  // Clobber List
  );
}

// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeXRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "vmov.u8 q3, #255 \n"  // load A(255)
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G
      "vld1.8 {q0}, [%2]! \n"  // load B
      "subs %4, %4, #16 \n"  // 16 processed per loop
      "vst4.8 {d0, d2, d4, d6}, [%3]! \n"  // store 8 ARGB
      "vst4.8 {d1, d3, d5, d7}, [%3]! \n"  // next 8 ARGB
      "bgt 1b \n"
      : "+r"(src_r),                            // %0
        "+r"(src_g),                            // %1
        "+r"(src_b),                            // %2
        "+r"(dst_argb),                         // %3
        "+r"(width)                             // %4
      :                                         // Input registers
      : "cc""memory""q0""q1""q2""q3"  // Clobber List
  );
}

void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = 10 - depth;
  asm volatile(
      "vmov.u32 q14, #1023 \n"
      "vdup.32 q15, %5 \n"
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"  // B
      "vld1.16 {d2}, [%1]! \n"  // G
      "vld1.16 {d0}, [%0]! \n"  // R
      "vmovl.u16 q2, d4 \n"  // B
      "vmovl.u16 q1, d2 \n"  // G
      "vmovl.u16 q0, d0 \n"  // R
      "vshl.u32 q2, q2, q15 \n"  // 000B
      "vshl.u32 q1, q1, q15 \n"
      "vshl.u32 q0, q0, q15 \n"
      "vmin.u32 q2, q2, q14 \n"
      "vmin.u32 q1, q1, q14 \n"
      "vmin.u32 q0, q0, q14 \n"
      "vsli.u32 q2, q1, #10 \n"  // 00GB
      "vsli.u32 q2, q0, #20 \n"  // 0RGB
      "vorr.u32 q2, #0xc0000000 \n"  // ARGB (AR30)
      "subs %4, %4, #4 \n"
      "vst1.8 {q2}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory""cc""q0""q1""q2""q14""q15");
}

void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width) {
  asm volatile(
      "vmov.u32 q14, #1023 \n"
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"  // B
      "vld1.16 {d2}, [%1]! \n"  // G
      "vld1.16 {d0}, [%0]! \n"  // R
      "vmovl.u16 q2, d4 \n"  // 000B
      "vmovl.u16 q1, d2 \n"  // G
      "vmovl.u16 q0, d0 \n"  // R
      "vmin.u32 q2, q2, q14 \n"
      "vmin.u32 q1, q1, q14 \n"
      "vmin.u32 q0, q0, q14 \n"
      "vsli.u32 q2, q1, #10 \n"  // 00GB
      "vsli.u32 q2, q0, #20 \n"  // 0RGB
      "vorr.u32 q2, #0xc0000000 \n"  // ARGB (AR30)
      "subs %4, %4, #4 \n"
      "vst1.8 {q2}, [%3]! \n"
      "bgt 1b \n"
      "3: \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      :
      : "memory""cc""q0""q1""q2""q14");
}

void MergeAR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  asm volatile(

      "vdup.u16 q15, %6 \n"
      "vdup.u16 q14, %7 \n"
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vld1.16 {q3}, [%3]! \n"  // A
      "vmin.u16 q2, q2, q14 \n"
      "vmin.u16 q1, q1, q14 \n"
      "vmin.u16 q0, q0, q14 \n"
      "vmin.u16 q3, q3, q14 \n"
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vshl.u16 q3, q3, q15 \n"
      "subs %5, %5, #8 \n"
      "vst4.16 {d0, d2, d4, d6}, [%4]! \n"
      "vst4.16 {d1, d3, d5, d7}, [%4]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
        "+r"(width)      // %5
      : "r"(shift),      // %6
        "r"(mask)        // %7
      : "memory""cc""q0""q1""q2""q3""q15");
}

void MergeXR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  asm volatile(

      "vmov.u8 q3, #0xff \n"  // A (0xffff)
      "vdup.u16 q15, %5 \n"
      "vdup.u16 q14, %6 \n"
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vmin.u16 q2, q2, q14 \n"
      "vmin.u16 q1, q1, q14 \n"
      "vmin.u16 q0, q0, q14 \n"
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "subs %4, %4, #8 \n"
      "vst4.16 {d0, d2, d4, d6}, [%3]! \n"
      "vst4.16 {d1, d3, d5, d7}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar64),  // %3
        "+r"(width)      // %4
      : "r"(shift),      // %5
        "r"(mask)        // %6
      : "memory""cc""q0""q1""q2""q3""q15");
}

void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "vdup.16 q15, %6 \n"
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vld1.16 {q3}, [%3]! \n"  // A
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vshl.u16 q3, q3, q15 \n"
      "vqmovn.u16 d0, q0 \n"
      "vqmovn.u16 d1, q1 \n"
      "vqmovn.u16 d2, q2 \n"
      "vqmovn.u16 d3, q3 \n"
      "subs %5, %5, #8 \n"
      "vst4.8 {d0, d1, d2, d3}, [%4]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : "r"(shift)       // %6
      : "memory""cc""q0""q1""q2""q3""q15");
}

void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "vdup.16 q15, %5 \n"
      "vmov.u8 d6, #0xff \n"  // A (0xff)
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vqmovn.u16 d5, q2 \n"
      "vqmovn.u16 d4, q1 \n"
      "vqmovn.u16 d3, q0 \n"
      "subs %4, %4, #8 \n"
      "vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory""cc""q0""q1""q2""d6""q15");
}

// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 32
      "subs %2, %2, #32 \n"  // 32 processed per loop
      "vst1.8 {d0, d1, d2, d3}, [%1]! \n"  // store 32
      "bgt 1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2  // Output registers
      :                             // Input registers
      : "cc""memory""q0""q1"  // Clobber List
  );
}

// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "vdup.8 q0, %2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %1, %1, #16 \n"  // 16 bytes per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc""memory""q0");
}

// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "vdup.u32 q0, %2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %1, %1, #4 \n"  // 4 pixels per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc""memory""q0");
}

void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "add %0, %0, %2 \n"
      "sub %0, %0, #32 \n"  // 32 bytes per loop

      "1: \n"
      "vld1.8 {q1, q2}, [%0], %3 \n"  // src -= 32
      "subs %2, #32 \n"  // 32 pixels per loop.
      "vrev64.8 q0, q2 \n"
      "vrev64.8 q1, q1 \n"
      "vswp d0, d1 \n"
      "vswp d2, d3 \n"
      "vst1.8 {q0, q1}, [%1]! \n"  // dst += 32
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "r"(-32)     // %3
      : "cc""memory""q0""q1""q2");
}

void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %2, lsl #1 \n"
      "sub %0, #16 \n"

      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %2, #8 \n"  // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst2.8 {d0, d1}, [%1]! \n"  // dst += 16
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(width)    // %2
      :
      : "cc""memory""r12""q0");
}

void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %3, lsl #1 \n"
      "sub %0, #16 \n"

      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %3, #8 \n"  // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst1.8 {d0}, [%1]! \n"  // dst += 8
      "vst1.8 {d1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "cc""memory""r12""q0");
}

void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "add %0, %0, %2, lsl #2 \n"
      "sub %0, #32 \n"

      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n"  // src -= 32
      "subs %2, #8 \n"  // 8 pixels per loop.
      "vrev64.8 d0, d0 \n"
      "vrev64.8 d1, d1 \n"
      "vrev64.8 d2, d2 \n"
      "vrev64.8 d3, d3 \n"
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // dst += 32
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(-32)         // %3
      : "cc""memory""d0""d1""d2""d3");
}

void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  src_rgb24 += width * 3 - 24;
  asm volatile(
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0], %3 \n"  // src -= 24
      "subs %2, #8 \n"  // 8 pixels per loop.
      "vrev64.8 d0, d0 \n"
      "vrev64.8 d1, d1 \n"
      "vrev64.8 d2, d2 \n"
      "vst3.8 {d0, d1, d2}, [%1]! \n"  // dst += 24
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "r"(-24)          // %3
      : "cc""memory""d0""d1""d2");
}

void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGB24.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc""memory""d1""d2""d3""d4"  // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vswp.u8 d1, d3 \n"  // swap R, B
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""d1""d2""d3""d4"  // Clobber List
  );
}

void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "vmov.u8 d0, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vswp.u8 d1, d3 \n"  // swap R, B
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGBA.
      "bgt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc""memory""d0""d1""d2""d3"  // Clobber List
  );
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      "vswp.u8 d1, d3 \n"  // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of
                                                      // RGB24.
      "bgt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc""memory""d1""d2""d3"  // Clobber List
  );
}

// Assembly fragment that expands 8 RGB565 pixels into 8-bit B, G and R
// channels.
//   Input:   q0 holds eight 16-bit pixels (RRRRRGGG GGGBBBBB).
//   Output:  d0 = B, d1 = G, d2 = R. Each channel is widened to 8 bits by
//            placing its bits at the top and copying its own upper bits into
//            the vacated low bits.
//   Scratch: d4, d5 and d6 are overwritten.
// The instruction order matters: d1 and d2 are only written after the values
// they previously held have been consumed.
#define RGB565TOARGB                                                        \
  "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG           */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6   */ \
  "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5   */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B                    */ \
  "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2   */ \
  "vorr.u8 d2, d1, d5 \n" /* R                    */ \
  "vorr.u8 d1, d4, d6 \n" /* G                    */

void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=85 H=89 G=86

¤ Dauer der Verarbeitung: 0.26 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.






                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge