/*
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon armv8 64 bit.
#if !defined (LIBYUV_DISABLE_NEON) && defined (__aarch64__)
// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 bytes from [%0] into v0, then 4 bytes from [%1] into 32-bit lane 0
// of v1 and 4 bytes from [%2] into lane 1, post-incrementing each pointer.
// Result: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V.
#define READYUV422 \
"ld1 {v0.8b}, [%0], #8 \n" \
"ld1 {v1.s}[0], [%1], #4 \n" \
"ld1 {v1.s}[1], [%2], #4 \n"
// Read 8 Y, 8 U and 8 V from 444.
// The 8 U bytes land in the low half of v1 and the 8 V bytes in the high half.
// uaddlp sums each adjacent byte pair and rshrn #1 halves the sums with
// rounding, so the result has the same layout as READYUV422:
// v0.8b = 8 Y, v1.8b = 4 averaged U followed by 4 averaged V.
#define READYUV444 \
"ld1 {v0.8b}, [%0], #8 \n" \
"ld1 {v1.d}[0], [%1], #8 \n" \
"ld1 {v1.d}[1], [%2], #8 \n" \
"uaddlp v1.8h, v1.16b \n" \
"rshrn v1.8b, v1.8h, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128.
// Only [%0] is read (and post-incremented by 8); every byte of v1.8b is set
// to the constant 128.
#define READYUV400 \
"ld1 {v0.8b}, [%0], #8 \n" \
"movi v1.8b , #128 \n"
// Read 8 Y and 4 UV from NV12.
// Loads 8 Y bytes from [%0] and 8 interleaved chroma bytes from [%1] into v2.
// uzp1 gathers the even bytes (U) into v1, uzp2 gathers the odd bytes (V) into
// v3, and ins copies the 4 V bytes into the second 32-bit lane of v1.
// Result: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V. Clobbers v2 and v3.
#define READNV12 \
"ld1 {v0.8b}, [%0], #8 \n" \
"ld1 {v2.8b}, [%1], #8 \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
// Read 8 Y and 4 VU from NV21.
// Same as READNV12 except the chroma bytes are stored V first: the even bytes
// (V) go to v3, the odd bytes (U) go to v1, and ins then places the 4 V bytes
// in the second 32-bit lane of v1.
// Result: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V. Clobbers v2 and v3.
#define READNV21 \
"ld1 {v0.8b}, [%0], #8 \n" \
"ld1 {v2.8b}, [%1], #8 \n" \
"uzp1 v3.8b, v2.8b, v2.8b \n" \
"uzp2 v1.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
// Read 8 YUY2 pixels (16 bytes) from [%0].
// ld2 de-interleaves the bytes: even bytes (Y) go to v0, odd bytes
// (alternating U and V) go to v1. uzp2/uzp1 then separate V (into v3) from U
// (into v1), and ins places the 4 V bytes in the second 32-bit lane of v1.
// Result: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V. Clobbers v3.
#define READYUY2 \
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
"uzp2 v3.8b, v1.8b, v1.8b \n" \
"uzp1 v1.8b, v1.8b, v1.8b \n" \
"ins v1.s[1], v3.s[0] \n"
// Read 8 UYVY pixels (16 bytes) from [%0].
// ld2 de-interleaves the bytes: even bytes (alternating U and V) go to v2,
// odd bytes (Y) go to v3. Y is copied to v0, then uzp1/uzp2 separate U (into
// v1) from V (into v3) and ins places the 4 V bytes in the second 32-bit lane
// of v1.
// Result: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V. Clobbers v2 and v3.
#define READUYVY \
"ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
"orr v0.8b, v3.8b, v3.8b \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
// Loads the conversion constants used by YUVTORGB. Expects the named asm
// operands kUVBiasBGR, kYToRgb, kUVToRB and kUVToG to hold addresses.
//   v24, v25, v26: three consecutive 16-bit values at kUVBiasBGR, each
//                  replicated to all 8 lanes (added to B, G, R respectively).
//   v31:           one 32-bit value at kYToRgb replicated to 4 lanes.
//   v27, v28:      16 halfwords at kUVToRB, de-interleaved (even -> v27,
//                  odd -> v28).
//   v29, v30:      16 halfwords at kUVToG, de-interleaved (even -> v29,
//                  odd -> v30).
// Any asm statement using this macro writes v24-v31.
#define YUVTORGB_SETUP \
"ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
"ld1r {v31.4s}, [%[kYToRgb]] \n" \
"ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
"ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
// clang-format off
// Converts 8 pixels of YUV to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y, v1.8b = 4 U followed by 4 V (as produced by the READ*
//          macros), and v24-v31 as loaded by YUVTORGB_SETUP.
// Outputs: vR.8b, vG.8b, vB.8b (register names supplied by the caller).
// Scratch: v0, v1, v2, v3, v5, v6, v7.
// Steps:
//   1. Y is widened to 32 bits, multiplied by v31 and narrowed back to 16 bits
//      taking the upper 16 bits of each product (saturating).
//   2. Each chroma byte is duplicated (byte * 0x0101) so one U/V sample covers
//      two pixels, then U and V are widened to 8 halfwords each.
//   3. B = v24 + Y + v27 * U, G = v25 + Y - (v29 * U + v30 * V),
//      R = v26 + Y + v28 * V, using saturating adds/subtracts.
//   4. Each channel is shifted right by 6 and narrowed to unsigned 8 bits with
//      saturation.
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
"shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
"ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
"ushll v0.4s, v0.4h, #0 \n" \
"mul v3.4s, v3.4s, v31.4s \n" \
"mul v0.4s, v0.4s, v31.4s \n" \
"sqshrun v0.4h, v0.4s, #16 \n" \
"sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
"uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
"mul v3.8h, v27.8h, v1.8h \n" \
"mul v5.8h, v29.8h, v1.8h \n" \
"mul v6.8h, v30.8h, v2.8h \n" \
"mul v7.8h, v28.8h, v2.8h \n" \
"sqadd v6.8h, v6.8h, v5.8h \n" \
"sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
"sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
"sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
"sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
"sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
"sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
"sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
"sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
// clang-format on
// Converts a row of I444 (planar Y, U and V, one chroma sample per pixel) to
// ARGB using the coefficients in yuvconstants.
// Each iteration reads 8 bytes from each of src_y, src_u and src_v and stores
// 8 pixels (32 bytes) to dst_argb in B, G, R, A byte order with A = 255.
// READYUV444 averages each horizontal pair of chroma samples first.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n" /* A */
      "1: \n"
      READYUV444
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of I422 (planar Y with half-width U and V) to ARGB using the
// coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes and 4 V bytes and stores 8 pixels
// (32 bytes) to dst_argb in B, G, R, A byte order with A = 255.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n" /* A */
      "1: \n"
      READYUV422
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of I422 plus a separate alpha plane to ARGB using the
// coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes, 4 V bytes and 8 alpha bytes and
// stores 8 pixels (32 bytes) to dst_argb in B, G, R, A byte order; the alpha
// bytes are copied through unchanged.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      READYUV422
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "ld1 {v23.8b}, [%3], #8 \n"  // load 8 alpha bytes
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "prfm pldl1keep, [%3, 448] \n"
      "subs %w5, %w5, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of I422 to RGBA using the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes and 4 V bytes and stores 8 pixels
// (32 bytes) to dst_rgba in A, B, G, R byte order with A = 255 (alpha lives in
// v20, and the colour channels are shifted up one register to v21-v23).
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v20.8b, #255 \n" /* A */
      "1: \n"
      READYUV422
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v23, v22, v21)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_rgba),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of I422 to RGB24 using the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes and 4 V bytes and stores 8 pixels
// (24 bytes) to dst_rgb24 in B, G, R byte order.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      READYUV422
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "subs %w4, %w4, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_u),      // %1
        "+r"(src_v),      // %2
        "+r"(dst_rgb24),  // %3
        "+r"(width)       // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Packs 8 pixels into RGB565.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output:  v0.8h, each halfword holding R in bits 15..11, G in bits 10..5 and
//          B in bits 4..0.
// Each channel is moved to the top byte of a halfword, then sri shifts G and B
// in below R while keeping the bits already placed. Overwrites v20 and v21.
#define ARGBTORGB565 \
"shll v0.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
// clang-format off
// Converts a row of I422 to RGB565 using the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes and 4 V bytes, converts them to
// 8-bit B, G, R, packs them with ARGBTORGB565 and stores 8 pixels (16 bytes).
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      READYUV422
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
      ARGBTORGB565
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_u),       // %1
        "+r"(src_v),       // %2
        "+r"(dst_rgb565),  // %3
        "+r"(width)        // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Packs 8 pixels into ARGB1555.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output:  v0.8h, each halfword holding the top bit of A in bit 15, R in bits
//          14..10, G in bits 9..5 and B in bits 4..0.
// Overwrites v20, v21 and v22; v23 is only read.
#define ARGBTOARGB1555 \
"shll v0.8h, v23.8b, #8 \n" /* A */ \
"shll v22.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v22.8h, #1 \n" /* AR */ \
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
// Converts a row of I422 to ARGB1555 using the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes and 4 V bytes, converts them to
// 8-bit B, G, R, packs them with ARGBTOARGB1555 (alpha bit always set because
// v23 = 255) and stores 8 pixels (16 bytes).
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n"  // Alpha; only read by ARGBTOARGB1555.
      "1: \n"
      READYUV422
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
      ARGBTOARGB1555
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB1555.
      "b.gt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb1555),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// clang-format on
// Packs 8 pixels into ARGB4444 by keeping the upper 4 bits of each channel.
// Output: v0.16b, 16 bytes alternating (G high nibble | B low nibble) and
// (A high nibble | R low nibble), i.e. 8 pixels of 2 bytes each.
// Overwrites v1 and v20-v23, so the caller must reload alpha (v23) before each
// use.
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
"ushr v20.8b, v20.8b, #4 \n" /* B */ \
"bic v21.8b, v21.8b, v4.8b \n" /* G */ \
"ushr v22.8b, v22.8b, #4 \n" /* R */ \
"bic v23.8b, v23.8b, v4.8b \n" /* A */ \
"orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
// Converts a row of I422 to ARGB4444 using the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes, 4 U bytes and 4 V bytes, converts them to
// 8-bit B, G, R, packs them with ARGBTOARGB4444 (alpha nibble = 0xf) and
// stores 8 pixels (16 bytes).
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v4.16b, #0x0f \n"  // bits to clear with bic.
      "1: \n"
      READYUV422
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w4, %w4, #8 \n"
      // Alpha is reloaded every iteration because ARGBTOARGB4444 modifies v23.
      "movi v23.8b, #255 \n"
      ARGBTOARGB4444
      "prfm pldl1keep, [%1, 128] \n"
      "prfm pldl1keep, [%2, 128] \n"
      "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB4444.
      "b.gt 1b \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb4444),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of I400 (Y only) to ARGB using the coefficients in
// yuvconstants; READYUV400 substitutes 128 for every U and V sample.
// Each iteration reads 8 Y bytes and stores 8 pixels (32 bytes) to dst_argb in
// B, G, R, A byte order with A = 255.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n"  // Alpha
      "1: \n"
      READYUV400
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Expands a row of 8-bit grey values to ARGB without any colour conversion:
// each source byte is copied to B, G and R, and A is set to 255.
// Each iteration reads 8 bytes from src_y and stores 8 pixels (32 bytes) to
// dst_argb. The counter is tested after the store, so the body always runs at
// least once.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile (
"movi v23.8b, #255 \n" // Alpha
"1: \n"
"ld1 {v20.8b}, [%0], #8 \n" // load 8 Y into B
"prfm pldl1keep, [%0, 448] \n"
"orr v21.8b, v20.8b, v20.8b \n" // G = Y
"orr v22.8b, v20.8b, v20.8b \n" // R = Y
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r" (src_y), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v20" , "v21" , "v22" , "v23" );
}
// Converts a row of NV12 (Y plane plus interleaved UV plane) to ARGB using the
// coefficients in yuvconstants.
// Each iteration reads 8 Y bytes and 8 interleaved chroma bytes and stores
// 8 pixels (32 bytes) to dst_argb in B, G, R, A byte order with A = 255.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n"  // Alpha
      "1: \n"
      READNV12
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_uv),    // %1
        "+r"(dst_argb),  // %2
        "+r"(width)      // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of NV21 (Y plane plus interleaved VU plane) to ARGB using the
// coefficients in yuvconstants.
// Each iteration reads 8 Y bytes and 8 interleaved chroma bytes and stores
// 8 pixels (32 bytes) to dst_argb in B, G, R, A byte order with A = 255.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n"  // Alpha
      "1: \n"
      READNV21
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_vu),    // %1
        "+r"(dst_argb),  // %2
        "+r"(width)      // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of NV12 (Y plane plus interleaved UV plane) to RGB24 using
// the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes and 8 interleaved chroma bytes and stores
// 8 pixels (24 bytes) to dst_rgb24 in B, G, R byte order.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      READNV12
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_uv),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of NV21 (Y plane plus interleaved VU plane) to RGB24 using
// the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes and 8 interleaved chroma bytes and stores
// 8 pixels (24 bytes) to dst_rgb24 in B, G, R byte order.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      READNV21
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of NV12 (Y plane plus interleaved UV plane) to RGB565 using
// the coefficients in yuvconstants.
// Each iteration reads 8 Y bytes and 8 interleaved chroma bytes, converts them
// to 8-bit B, G, R, packs them with ARGBTORGB565 and stores 8 pixels
// (16 bytes).
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
      YUVTORGB_SETUP
      "1: \n"
      READNV12
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      ARGBTORGB565
      "prfm pldl1keep, [%1, 256] \n"
      "subs %w3, %w3, #8 \n"
      "st1 {v0.8h}, [%2], 16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : "+r"(src_y),       // %0
        "+r"(src_uv),      // %1
        "+r"(dst_rgb565),  // %2
        "+r"(width)        // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of packed YUY2 to ARGB using the coefficients in
// yuvconstants.
// Each iteration reads 16 bytes (8 pixels) from src_yuy2 and stores 8 pixels
// (32 bytes) to dst_argb in B, G, R, A byte order with A = 255.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n"  // Alpha
      "1: \n"
      READYUY2
      "prfm pldl1keep, [%0, 448] \n"
      YUVTORGB(v22, v21, v20)
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Converts a row of packed UYVY to ARGB using the coefficients in
// yuvconstants.
// Each iteration reads 16 bytes (8 pixels) from src_uyvy and stores 8 pixels
// (32 bytes) to dst_argb in B, G, R, A byte order with A = 255.
// The counter is tested after the store, so the body always runs at least once
// and output is written in whole groups of 8 pixels.
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
      YUVTORGB_SETUP
      "movi v23.8b, #255 \n"  // Alpha
      "1: \n"
      READUYVY
      YUVTORGB(v22, v21, v20)
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
      "b.gt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      // v31 is written by YUVTORGB_SETUP, so it must be listed as clobbered.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31"
  );
}
// Reads 16 pairs of UV and writes even bytes to dst_u and odd bytes to dst_v.
// Each iteration consumes 32 source bytes and stores 16 bytes to each
// destination. width counts UV pairs; the counter is tested after the stores,
// so the body always runs at least once and works in whole groups of 16.
void SplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"prfm pldl1keep, [%0, 448] \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
"b.gt 1b \n"
: "+r" (src_uv), // %0
"+r" (dst_u), // %1
"+r" (dst_v), // %2
"+r" (width) // %3 // Output registers
: // Input registers
: "cc" , "memory" , "v0" , "v1" // Clobber List
);
}
// Reads 16 U's and 16 V's and writes out 16 interleaved pairs of UV.
// Each iteration consumes 16 bytes from each source and stores 32 bytes to
// dst_uv. width counts UV pairs; the counter is tested after the store, so the
// body always runs at least once and works in whole groups of 16.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
: "+r" (src_u), // %0
"+r" (src_v), // %1
"+r" (dst_uv), // %2
"+r" (width) // %3 // Output registers
: // Input registers
: "cc" , "memory" , "v0" , "v1" // Clobber List
);
}
// Reads 16 packed 3-byte pixels and writes them to planar dst_r, dst_g, dst_b.
// The first byte of every pixel goes to dst_r, the second to dst_g and the
// third to dst_b. Each iteration consumes 48 source bytes and stores 16 bytes
// to each plane. The counter is tested after the stores, so the body always
// runs at least once and works in whole groups of 16 pixels.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
"prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"b.gt 1b \n"
: "+r" (src_rgb), // %0
"+r" (dst_r), // %1
"+r" (dst_g), // %2
"+r" (dst_b), // %3
"+r" (width) // %4
: // Input registers
: "cc" , "memory" , "v0" , "v1" , "v2" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
// Each iteration consumes 16 bytes from each source plane and stores 48 bytes
// to dst_rgb with the bytes of each pixel in src_r, src_g, src_b order.
// The counter is tested after the store, so the body always runs at least once
// and works in whole groups of 16 pixels.
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load R
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v2.16b}, [%2], #16 \n"  // load B
      // One prefetch hint per source plane per iteration is sufficient.
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
      "b.gt 1b \n"
      : "+r"(src_r),    // %0
        "+r"(src_g),    // %1
        "+r"(src_b),    // %2
        "+r"(dst_rgb),  // %3
        "+r"(width)     // %4
      :                 // Input registers
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}
// Copies bytes from src to dst, 32 bytes per iteration.
// The counter is tested after the store, so the body always runs at least once
// and the number of bytes copied is a multiple of 32 (width rounded up).
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 32 bytes
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n" // store 32 bytes
"b.gt 1b \n"
: "+r" (src), // %0
"+r" (dst), // %1
"+r" (width) // %2 // Output registers
: // Input registers
: "cc" , "memory" , "v0" , "v1" // Clobber List
);
}
// SetRow fills dst with the 8 bit value v8 repeated.
// Stores 16 bytes per iteration and tests the counter afterwards, so the body
// always runs at least once and the number of bytes written is width rounded
// up to a multiple of 16.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
: "+r" (dst), // %0
"+r" (width) // %1
: "r" (v8) // %2
: "cc" , "memory" , "v0" );
}
// Fills dst with the 32 bit value v32 repeated; width counts 32 bit elements.
// Stores 4 elements (16 bytes) per iteration and tests the counter afterwards,
// so the body always runs at least once and the number of elements written is
// width rounded up to a multiple of 4.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
: "+r" (dst), // %0
"+r" (width) // %1
: "r" (v32) // %2
: "cc" , "memory" , "v0" );
}
// Shuffle table for reversing the bytes of a 16 byte vector (used with tbl).
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Writes the bytes of src to dst in reverse order.
// src is first advanced to the last 32 bytes of the row (src + width - 32).
// Each iteration loads those 32 bytes, reverses each 16 byte half with
// kShuffleMirror, stores the upper half first, and steps src back by 32.
// The counter is tested after the store, so the body always runs at least
// once; presumably width is a positive multiple of 32 - TODO confirm against
// callers.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (
// Start at end of source row.
"ld1 {v3.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q2, [%0, 16] \n" // upper 16 bytes
"ldr q1, [%0], -32 \n" // lower 16 bytes; src -= 32
"subs %w2, %w2, #32 \n" // 32 pixels per loop.
"tbl v0.16b, {v2.16b}, v3.16b \n"
"tbl v1.16b, {v1.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r" (src), // %0
"+r" (dst), // %1
"+r" (width) // %2
: "r" (&kShuffleMirror) // %3
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" );
}
// Shuffle table for reversing the order of 8 two-byte UV pairs in a 16 byte
// vector while keeping the byte order inside each pair.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
// Writes the UV pairs of src_uv to dst_uv in reverse order; width counts
// pairs. src_uv is first advanced to the last 32 bytes of the row
// (src_uv + width * 2 - 32). Each iteration mirrors 16 pairs (32 bytes) and
// steps the source back by 32. The counter is tested after the store, so the
// body always runs at least once.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile (
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #1 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n" // upper 16 bytes
"ldr q0, [%0], -32 \n" // lower 16 bytes; src -= 32
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r" (src_uv), // %0
"+r" (dst_uv), // %1
"+r" (width) // %2
: "r" (&kShuffleMirrorUV) // %3
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" );
}
// Reverses the order of the UV pairs in src_uv and splits them into planar
// dst_u and dst_v; width counts pairs.
// src_uv is first advanced to the last 32 bytes of the row
// (src_uv + width * 2 - 32). Each iteration mirrors 16 pairs with
// kShuffleMirrorUV, separates even bytes (U) from odd bytes (V) and stores 16
// bytes to each destination. The counter is tested after the stores, so the
// body always runs at least once.
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
// Start at end of source row.
"ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n" // upper 16 bytes
"ldr q0, [%0], -32 \n" // lower 16 bytes; src -= 32
"subs %w3, %w3, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"uzp1 v0.16b, v2.16b, v3.16b \n" // U
"uzp2 v1.16b, v2.16b, v3.16b \n" // V
"st1 {v0.16b}, [%1], #16 \n" // dst += 16
"st1 {v1.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r" (src_uv), // %0
"+r" (dst_u), // %1
"+r" (dst_v), // %2
"+r" (width) // %3
: "r" (&kShuffleMirrorUV) // %4
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" );
}
// Shuffle table for reversing the order of 4 four-byte ARGB pixels in a
// 16 byte vector while keeping the byte order inside each pixel.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
// Writes the 4-byte pixels of src_argb to dst_argb in reverse order; width
// counts pixels. src_argb is first advanced to the last 32 bytes of the row
// (src_argb + width * 4 - 32). Each iteration mirrors 8 pixels (32 bytes) and
// steps the source back by 32. The counter is tested after the store, so the
// body always runs at least once.
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile (
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n" // upper 16 bytes
"ldr q0, [%0], -32 \n" // lower 16 bytes; src -= 32
"subs %w2, %w2, #8 \n" // 8 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r" (src_argb), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
: "r" (&kShuffleMirrorARGB) // %3
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" );
}
// Writes the 3-byte pixels of src_rgb24 to dst_rgb24 in reverse order; width
// counts pixels.
// src_rgb24 is first advanced to the last 48 bytes of the row
// (src_rgb24 + width * 3 - 48). Each iteration de-interleaves 16 pixels into
// three 16 byte channel vectors, reverses each with kShuffleMirror and
// re-interleaves them on store, so the byte order inside each pixel is kept.
// The source steps back by 48 through register operand %3. The counter is
// tested after the store, so the body always runs at least once.
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width) {
asm volatile (
"ld1 {v3.16b}, [%4] \n" // shuffler
"add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #48 \n"
"1: \n"
"ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v0.16b, {v0.16b}, v3.16b \n"
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v2.16b, {v2.16b}, v3.16b \n"
"st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
"b.gt 1b \n"
: "+r" (src_rgb24), // %0
"+r" (dst_rgb24), // %1
"+r" (width) // %2
: "r" ((ptrdiff_t)-48), // %3
"r" (&kShuffleMirror) // %4
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" );
}
// Expands 3-byte RGB24 pixels to 4-byte ARGB pixels by appending an alpha byte
// of 255 after each pixel; the three colour bytes keep their order.
// Each iteration reads 24 bytes and stores 32 bytes (8 pixels). The counter is
// tested after the store, so the body always runs at least once.
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
// RGB24.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r" (src_rgb24), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v1" , "v2" , "v3" , "v4" // Clobber List
);
}
// Converts 3-byte RAW pixels (r, g, b byte order) to 4-byte ARGB pixels
// (b, g, r, a byte order) with alpha set to 255.
// Each iteration reads 24 bytes and stores 32 bytes (8 pixels). The counter is
// tested after the store, so the body always runs at least once.
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n"
: "+r" (src_raw), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" , "v5" // Clobber List
);
}
// Converts 3-byte RAW pixels (r, g, b byte order) to 4-byte RGBA pixels
// (a, b, g, r byte order) with alpha set to 255.
// Each iteration reads 24 bytes and stores 32 bytes (8 pixels). The counter is
// tested after the store, so the body always runs at least once.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile (
"movi v0.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v2.8b, v4.8b, v4.8b \n" // move g
"orr v1.8b, v5.8b, v5.8b \n" // move b
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"b.gt 1b \n"
: "+r" (src_raw), // %0
"+r" (dst_rgba), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" , "v5" // Clobber List
);
}
// Converts 3-byte RAW pixels (r, g, b byte order) to 3-byte RGB24 pixels
// (b, g, r byte order) by swapping the first and third byte of each pixel.
// Each iteration reads and stores 24 bytes (8 pixels). The counter is tested
// after the store, so the body always runs at least once.
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile (
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
"b.gt 1b \n"
: "+r" (src_raw), // %0
"+r" (dst_rgb24), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" // Clobber List
);
}
// Expands 8 RGB565 pixels to 8-bit channels.
// Input:   v0.8h = 8 RGB565 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R.
// Scratch: v4, v6.
// Each 5 or 6 bit field is moved to the top of a byte and its own upper bits
// are copied into the vacated low bits, so a full-scale field becomes 255.
#define RGB565TOARGB \
"shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
"shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
"ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
"orr v1.8b, v4.8b, v6.8b \n" /* G */ \
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
"xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
"shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
"ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
// Converts a row of RGB565 pixels to ARGB (b, g, r, a byte order) with alpha
// set to 255.
// Each iteration reads 16 bytes (8 pixels), expands them with RGB565TOARGB and
// stores 32 bytes. The counter is tested after the store, so the body always
// runs at least once.
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r" (src_rgb565), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" , "v6" // Clobber List
);
}
// Expands 8 ARGB1555 pixels to 8-bit channels.
// Input:   v0.8h = 8 ARGB1555 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
// The single alpha bit is sign-extended across the halfword, so A is 0 or 255.
// v3 is fully overwritten; any value the caller put there beforehand is lost.
#define ARGB1555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
\
"sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
"xtn2 v3.16b, v2.8h \n" \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
"dup v1.2D, v0.D[1] \n" /* G */ \
"dup v3.2D, v2.D[1] \n" /* A */
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input:   v0.8h = 8 pixels with 5 bits each of R, G and B in bits 14..0.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R.
// Scratch: v3 (overwritten; it does not hold alpha afterwards).
#define RGB555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R */ \
"dup v1.2D, v0.D[1] \n" /* G */
// Converts a row of ARGB1555 pixels to ARGB (b, g, r, a byte order).
// Each iteration reads 16 bytes (8 pixels), expands them with ARGB1555TOARGB
// and stores 32 bytes. Alpha comes from bit 15 of each source pixel: the macro
// produces it in v3 (0 or 255), so no constant alpha needs to be preloaded.
// The counter is tested after the store, so the body always runs at least
// once.
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile (
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
// Expands 8 ARGB4444 pixels to 8-bit channels.
// Input:   v0.8h = 8 ARGB4444 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
// Each 4 bit field is duplicated into both nibbles of its output byte.
// The upper halves of v2 and v3 are also written (they hold B and G).
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
"shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
"ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
"ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
"shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
"orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
"orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
"dup v0.2D, v2.D[1] \n" /* B */ \
"dup v1.2D, v3.D[1] \n" /* G */
// Converts a row of ARGB4444 pixels to ARGB (b, g, r, a byte order).
// Each iteration reads 16 bytes (8 pixels), expands them with ARGB4444TOARGB
// and stores 32 bytes; alpha is taken from the source pixels. The counter is
// tested after the store, so the body always runs at least once.
// NOTE(review): v4 is listed as clobbered but no instruction here appears to
// write it; harmless, but confirm before trimming the list.
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r" (src_argb4444), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" // Clobber List
);
}
// Converts 4-byte ARGB pixels to 3-byte RGB24 pixels by dropping the fourth
// (alpha) byte of each pixel; the first three bytes keep their order.
// Each iteration reads 32 bytes and stores 24 bytes (8 pixels). The counter is
// tested after the store, so the body always runs at least once.
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb24,
int width) {
asm volatile (
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24
"b.gt 1b \n"
: "+r" (src_argb), // %0
"+r" (dst_rgb24), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v1" , "v2" , "v3" , "v4" // Clobber List
);
}
// Converts 4-byte ARGB pixels (b, g, r, a byte order) to 3-byte RAW pixels
// (r, g, b byte order); alpha is dropped.
// Each iteration reads 32 bytes and stores 24 bytes (8 pixels). The counter is
// tested after the store, so the body always runs at least once.
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile (
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g (overwrites the loaded alpha)
"orr v5.8b, v1.8b, v1.8b \n" // mov b
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
"b.gt 1b \n"
: "+r" (src_argb), // %0
"+r" (dst_raw), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v1" , "v2" , "v3" , "v4" , "v5" // Clobber List
);
}
// Extracts the Y samples from a row of packed YUY2, 16 pixels per loop
// iteration. The even-indexed source bytes are the ones written out.
//   src_yuy2: source row; 32 bytes are read per iteration.
//   dst_y:    destination row; 16 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2: v0 = even bytes, v1 = odd bytes.
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_yuy2), // %0
"+r" (dst_y), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" // Clobber List
);
}
// Extracts the Y samples from a row of packed UYVY, 16 pixels per loop
// iteration. The odd-indexed source bytes are the ones written out.
//   src_uyvy: source row; 32 bytes are read per iteration.
//   dst_y:    destination row; 16 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile (
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY: v0 = even bytes, v1 = odd bytes.
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_uyvy), // %0
"+r" (dst_y), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" // Clobber List
);
}
// Splits the chroma out of a row of packed YUY2 into separate U and V rows,
// 16 pixels (8 U and 8 V samples) per loop iteration. Within every 4-byte
// source group, byte 1 goes to dst_u and byte 3 goes to dst_v.
//   src_yuy2: source row; 32 bytes are read per iteration.
//   dst_u:    U destination; 8 bytes are written per iteration.
//   dst_v:    V destination; 8 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2, one byte lane per register
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_yuy2), // %0
"+r" (dst_u), // %1
"+r" (dst_v), // %2
"+r" (width) // %3
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" // Clobber List
);
}
// Splits the chroma out of a row of packed UYVY into separate U and V rows,
// 16 pixels (8 U and 8 V samples) per loop iteration. Within every 4-byte
// source group, byte 0 goes to dst_u and byte 2 goes to dst_v.
//   src_uyvy: source row; 32 bytes are read per iteration.
//   dst_u:    U destination; 8 bytes are written per iteration.
//   dst_v:    V destination; 8 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY, one byte lane per register
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_uyvy), // %0
"+r" (dst_u), // %1
"+r" (dst_v), // %2
"+r" (width) // %3
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" // Clobber List
);
}
// Produces one row of U and one row of V from two vertically adjacent rows of
// packed YUY2, averaging the chroma of the two rows with rounding. Handles 16
// pixels (8 U and 8 V samples) per loop iteration.
//   src_yuy2:    first source row; 32 bytes are read per iteration.
//   stride_yuy2: byte offset from src_yuy2 to the second source row.
//   dst_u:       U destination; 8 bytes are written per iteration.
//   dst_v:       V destination; 8 bytes are written per iteration.
//   width:       pixel count. The loop body runs before the count is tested
//                and steps by 16, so callers presumably pass a positive
//                multiple of 16 - TODO confirm.
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
// Second source row, located stride_yuy2 bytes after the first.
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the first-row pointer
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U (rounding halving add)
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V (rounding halving add)
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_yuy2), // %0
"+r" (src_yuy2b), // %1
"+r" (dst_u), // %2
"+r" (dst_v), // %3
"+r" (width) // %4
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" , "v5" , "v6" ,
"v7" // Clobber List
);
}
// Produces one row of U and one row of V from two vertically adjacent rows of
// packed UYVY, averaging the chroma of the two rows with rounding. Handles 16
// pixels (8 U and 8 V samples) per loop iteration.
//   src_uyvy:    first source row; 32 bytes are read per iteration.
//   stride_uyvy: byte offset from src_uyvy to the second source row.
//   dst_u:       U destination; 8 bytes are written per iteration.
//   dst_v:       V destination; 8 bytes are written per iteration.
//   width:       pixel count. The loop body runs before the count is tested
//                and steps by 16, so callers presumably pass a positive
//                multiple of 16 - TODO confirm.
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
int stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
// Second source row, located stride_uyvy bytes after the first.
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the first-row pointer
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U (rounding halving add)
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V (rounding halving add)
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_uyvy), // %0
"+r" (src_uyvyb), // %1
"+r" (dst_u), // %2
"+r" (dst_v), // %3
"+r" (width) // %4
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" , "v5" , "v6" ,
"v7" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of a row of 4-byte pixels using a 16-entry byte index
// table, 4 pixels (16 bytes) per loop iteration.
//   src_argb: source row; 16 bytes are read per iteration.
//   dst_argb: destination row; 16 bytes are written per iteration.
//   shuffler: 16 bytes, read once before the loop. Output byte i of each
//             16-byte group is source byte shuffler[i] of the same group.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 4, so callers presumably pass a positive multiple of
//             4 - TODO confirm.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width) {
asm volatile (
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_argb), // %0
"+r" (dst_argb), // %1
"+r" (width) // %2
: "r" (shuffler) // %3
: "cc" , "memory" , "v0" , "v1" , "v2" // Clobber List
);
}
// Packs planar Y, U and V rows into YUY2 (byte order Y0 U Y1 V), 16 pixels per
// loop iteration. Each U/V sample is shared by two horizontally adjacent Ys.
//   src_y:    Y row; 16 bytes are read per iteration.
//   src_u:    U row; 8 bytes are read per iteration.
//   src_v:    V row; 8 bytes are read per iteration.
//   dst_yuy2: destination row; 32 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void I422ToYUY2Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width) {
asm volatile (
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys: v0 = even Ys, v1 = odd Ys
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the Y pointer
"orr v2.8b, v1.8b, v1.8b \n" // move odd Ys to v2 so v1 can hold U
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_y), // %0
"+r" (src_u), // %1
"+r" (src_v), // %2
"+r" (dst_yuy2), // %3
"+r" (width) // %4
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" );
}
// Packs planar Y, U and V rows into UYVY (byte order U Y0 V Y1), 16 pixels per
// loop iteration. Each U/V sample is shared by two horizontally adjacent Ys.
//   src_y:    Y row; 16 bytes are read per iteration.
//   src_u:    U row; 8 bytes are read per iteration.
//   src_v:    V row; 8 bytes are read per iteration.
//   dst_uyvy: destination row; 32 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void I422ToUYVYRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width) {
asm volatile (
"1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys: v1 = even Ys, v2 = odd Ys
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the Y pointer
"orr v3.8b, v2.8b, v2.8b \n" // move odd Ys to v3 so v2 can hold V
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_y), // %0
"+r" (src_u), // %1
"+r" (src_v), // %2
"+r" (dst_uyvy), // %3
"+r" (width) // %4
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" );
}
// Converts a row of 4-byte ARGB pixels to 2-byte RGB565, 8 pixels per loop
// iteration. The bit packing itself is done by the ARGBTORGB565 macro, which
// is defined elsewhere in this file.
//   src_argb:   source row; 32 bytes are read per iteration.
//   dst_rgb565: destination row; 16 bytes are written per iteration.
//   width:      pixel count. The loop body runs before the count is tested
//               and steps by 8, so callers presumably pass a positive
//               multiple of 8 - TODO confirm.
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb565,
int width) {
asm volatile (
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels,
// one channel per register
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565 // presumably consumes v20-v22 and leaves the packed pixels in v0
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_argb), // %0
"+r" (dst_rgb565), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v20" , "v21" , "v22" , "v23" );
}
// Converts a row of 4-byte ARGB pixels to 2-byte RGB565 with dithering,
// 8 pixels per loop iteration. Before packing, a per-column dither byte is
// added (with unsigned saturation) to the first three channels of each pixel;
// the fourth channel is left untouched. The bit packing itself is done by the
// ARGBTORGB565 macro, which is defined elsewhere in this file.
//   src_argb: source row; 32 bytes are read per iteration.
//   dst_rgb:  destination row; 16 bytes are written per iteration.
//   dither4:  four dither bytes packed in one 32-bit value. It is replicated
//             across the vector, so the same four bytes apply to pixels 0-3
//             and to pixels 4-7 of every group of 8.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 8, so callers presumably pass a positive multiple of
//             8 - TODO confirm.
//
// The source pointer and the width are modified inside the asm (post-increment
// load and subs), so they are declared as read-write "+r" operands; only
// dither4 is a pure input. The prefetch is issued on the source pointer, which
// is the stream being read.
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
                                int width) {
  asm volatile (
      "dup v1.4s, %w3 \n"  // dither4
      "1: \n"
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
      "prfm pldl1keep, [%0, 448] \n"  // prefetch ahead in the source row
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "uqadd v20.8b, v20.8b, v1.8b \n"  // saturating add of dither
      "uqadd v21.8b, v21.8b, v1.8b \n"
      "uqadd v22.8b, v22.8b, v1.8b \n"
      ARGBTORGB565
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"  // repeat while the remaining width is > 0
      : "+r" (src_argb),  // %0
        "+r" (dst_rgb),   // %1
        "+r" (width)      // %2
      : "r" (dither4)     // %3
      : "cc" , "memory" , "v0" , "v1" , "v20" , "v21" , "v22" , "v23" );
}
// Converts a row of 4-byte ARGB pixels to 2-byte ARGB1555, 8 pixels per loop
// iteration. The bit packing itself is done by the ARGBTOARGB1555 macro, which
// is defined elsewhere in this file.
//   src_argb:     source row; 32 bytes are read per iteration.
//   dst_argb1555: destination row; 16 bytes are written per iteration.
//   width:        pixel count. The loop body runs before the count is tested
//                 and steps by 8, so callers presumably pass a positive
//                 multiple of 8 - TODO confirm.
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb1555,
int width) {
asm volatile (
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels,
// one channel per register
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 // presumably consumes v20-v23 and leaves the packed pixels in v0
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_argb), // %0
"+r" (dst_argb1555), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v20" , "v21" , "v22" , "v23" );
}
// Converts a row of 4-byte ARGB pixels to 2-byte ARGB4444, 8 pixels per loop
// iteration. The bit packing itself is done by the ARGBTOARGB4444 macro, which
// is defined elsewhere in this file.
//   src_argb:     source row; 32 bytes are read per iteration.
//   dst_argb4444: destination row; 16 bytes are written per iteration.
//   width:        pixel count. The loop body runs before the count is tested
//                 and steps by 8, so callers presumably pass a positive
//                 multiple of 8 - TODO confirm.
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile (
"movi v4.16b, #0x0f \n" // low-nibble mask, set once before the loop;
// presumably used by ARGBTOARGB4444 to
// clear bits (bic) - TODO confirm.
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels,
// one channel per register
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 // presumably consumes v20-v23 and v4, leaving the result in v0
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_argb), // %0
"+r" (dst_argb4444), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v4" , "v20" , "v21" , "v22" , "v23" );
}
// Computes 8-bit luma from a row of 4-byte ARGB pixels, 8 pixels per loop
// iteration. Per pixel:
//   Y = sat8( sat8((25 * B + 129 * G + 66 * R + 128) >> 8) + 16 )
// where B, G, R are the first three bytes of the pixel and sat8 clamps to
// 0..255. The fourth byte (alpha) is ignored.
//   src_argb: source row; 32 bytes are read per iteration.
//   dst_y:    destination row; 8 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 8, so callers presumably pass a positive multiple of
//             8 - TODO confirm.
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile (
"movi v4.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B (v3 is reused as the 16-bit accumulator; alpha is dropped)
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
"uqadd v0.8b, v0.8b, v7.8b \n" // add the 16 offset with saturation
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_argb), // %0
"+r" (dst_y), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" , "v4" , "v5" , "v6" , "v7" );
}
// Copies the fourth byte (alpha) of every 4-byte ARGB pixel into a tightly
// packed row, 16 pixels per loop iteration.
//   src_argb: source row; 64 bytes are read per iteration.
//   dst_a:    destination row; 16 bytes are written per iteration.
//   width:    pixel count. The loop body runs before the count is tested and
//             steps by 16, so callers presumably pass a positive multiple of
//             16 - TODO confirm.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile (
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels, one channel per register
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer
"subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r" (src_argb), // %0
"+r" (dst_a), // %1
"+r" (width) // %2
:
: "cc" , "memory" , "v0" , "v1" , "v2" , "v3" // Clobber List
);
}
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile (
"movi v4.8b, #29 \n" // B * 0.1140 coefficient
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=79 H=83 G=80
¤ Dauer der Verarbeitung: 0.19 Sekunden
¤
*© Formatika GbR, Deutschland