Quellcodebibliothek Statistik Leitseite products/Sources/formale Sprachen/C/Firefox/third_party/aom/third_party/libyuv/source/   (Browser von der Mozilla Stiftung Version 136.0.1©)  Datei vom 10.2.2025 mit Größe 306 kB image not shown  

Quelle  row_gcc.cc   Sprache: C

 
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Each table holds one unsigned 8-bit weight per byte of a 4-byte pixel,
// repeated four times to fill a 16-byte vector.  One weight in every group of
// four is zero.
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPeg full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

// Same weights as kARGBToYJ moved up by one byte position, so the zero weight
// is the first byte of every group of four.
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Signed 8-bit weights, one per byte of a 4-byte pixel, repeated four times.
// The "J" variants carry larger magnitudes (up to 127 instead of 112).
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
// Same weight values as the ARGB tables with the byte order inside each
// group of four reversed.
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
// Same weight values as the ARGB tables with the first and third byte of each
// group swapped.
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
// Same weight values as the ARGB tables moved up by one byte position, so the
// zero weight comes first in each group.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

// 0x7e80 in each of the eight 16-bit lanes.
static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

// 128 in each of the sixteen bytes.
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// 0x8080 in each 16-bit lane, i.e. 0x80 in every byte.
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
// Entries are byte indices for pshufb.  Bytes 0..11 of the input are spread
// over the first three bytes of four 4-byte pixels; the fourth byte of each
// pixel is taken from input bytes 12..15 and is overwritten by the caller's
// alpha mask.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
// Like the RGB24 table, but the three bytes of each pixel are reversed.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
// The filler byte (input bytes 12..15) comes first in each pixel, followed by
// the reversed three-byte group.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u,  4u,  3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24.  First 8.
// An index of 128u has its high bit set, which makes pshufb emit a zero byte;
// only the low eight output bytes of these three tables are meaningful.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
// Drops every fourth input byte, packing 12 bytes into the low end of the
// vector; the top four output bytes are zeroed (index 128u).
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
// Same packing as above with the three kept bytes of each pixel reversed.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
// The first eight packed bytes land in output bytes 0..7, bytes 8..11 are
// zeroed, and the remaining four packed bytes land in output bytes 12..15.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
// 32-byte table: the same 16 indices in both halves, each even input byte
// selected twice.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
// Each pair of odd input bytes (1,3), (5,7), ... is selected twice in a row.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
// Each odd input byte selected twice.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
// Each pair of even input bytes (0,2), (4,6), ... is selected twice in a row.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
// Swaps the two bytes of every input pair and selects each swapped pair twice.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
// Expands 8-bit samples into 32-bit pixels: every source byte is replicated
// into all four bytes of a pixel and the top byte is then forced to 0xff.
//
// src_y:    source bytes, 8 consumed per loop pass.
// dst_argb: destination, 32 bytes written per loop pass (unaligned stores).
// width:    pixel count.  It is reduced by 8 per pass and the loop ends once
//           it is no longer positive, so the body always runs at least once
//           and a width that is not a multiple of 8 is rounded up.
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // all ones
      "pslld $0x18,%%xmm5 \n"     // 0xff000000 in every dword

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // load 8 source bytes
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"  // each byte doubled into a word
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"  // pixels 0..3: byte repeated 4 times
      "punpckhwd %%xmm1,%%xmm1 \n"  // pixels 4..7
      "por %%xmm5,%%xmm0 \n"        // top byte = 0xff
      "por %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      // One string per clobber: adjacent literals without commas would be
      // concatenated into a single, invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
// Widens packed 3-byte pixels to 4-byte pixels.  The three source bytes keep
// their order (shuffle table kShuffleMaskRGB24ToARGB) and the fourth byte of
// every output pixel is set to 0xff by OR-ing with 0xff000000.
//
// src_rgb24: source, 48 bytes (16 pixels) read per loop pass.
// dst_argb:  destination, 64 bytes written per loop pass.
// width:     pixel count; reduced by 16 per pass, loop ends once it is no
//            longer positive (the body always runs at least once).
//
// The shuffle table is loaded with movdqa, which needs a 16-byte aligned
// operand -- presumably guaranteed by the uvec8 type; confirm in row.h.
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"  // shuffle table

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"      // source bytes 0..15
      "movdqu 0x10(%0),%%xmm1 \n"  // source bytes 16..31
      "movdqu 0x20(%0),%%xmm3 \n"  // source bytes 32..47
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"  // source bytes 24..39 (pixels 8..11)
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"  // source bytes 12..27 (pixels 4..7)
      "pshufb %%xmm4,%%xmm0 \n"        // pixels 0..3
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"  // rotate: bytes 36..47 first (px 12..15)
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Widens packed 3-byte pixels to 4-byte pixels while reversing the order of
// the three source bytes (shuffle table kShuffleMaskRAWToARGB).  The fourth
// byte of every output pixel is set to 0xff.
//
// src_raw:  source, 48 bytes (16 pixels) read per loop pass.
// dst_argb: destination, 64 bytes written per loop pass.
// width:    pixel count; reduced by 16 per pass, loop ends once it is no
//           longer positive (the body always runs at least once).
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"  // shuffle table

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"      // source bytes 0..15
      "movdqu 0x10(%0),%%xmm1 \n"  // source bytes 16..31
      "movdqu 0x20(%0),%%xmm3 \n"  // source bytes 32..47
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"  // source bytes 24..39 (pixels 8..11)
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"  // source bytes 12..27 (pixels 4..7)
      "pshufb %%xmm4,%%xmm0 \n"        // pixels 0..3
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"  // rotate: bytes 36..47 first (px 12..15)
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_argb),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGB with different shuffler and A in low bits
//
// Widens packed 3-byte pixels to 4-byte pixels.  The shuffle table
// kShuffleMaskRAWToRGBA puts the reversed three source bytes in bytes 1..3 of
// each output pixel, and byte 0 is set to 0xff by OR-ing with 0x000000ff.
//
// src_raw:  source, 48 bytes (16 pixels) read per loop pass.
// dst_rgba: destination, 64 bytes written per loop pass.
// width:    pixel count; reduced by 16 per pass, loop ends once it is no
//           longer positive (the body always runs at least once).
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0x000000ff
      "psrld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"  // shuffle table

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"      // source bytes 0..15
      "movdqu 0x10(%0),%%xmm1 \n"  // source bytes 16..31
      "movdqu 0x20(%0),%%xmm3 \n"  // source bytes 32..47
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"  // source bytes 24..39 (pixels 8..11)
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"  // source bytes 12..27 (pixels 4..7)
      "pshufb %%xmm4,%%xmm0 \n"        // pixels 0..3
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"  // rotate: bytes 36..47 first (px 12..15)
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_rgba),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Reverses the byte order inside every 3-byte pixel, keeping the packed
// 3-byte layout.
//
// Each loop pass handles 24 bytes (8 pixels).  Three overlapping 16-byte
// loads at offsets 0, 4 and 8 are shuffled with their own table and the low
// 8 bytes of each result are stored back to back.
//
// src_raw:   source, advanced by 24 bytes per pass.
// dst_rgb24: destination, 24 bytes written per pass.
// width:     pixel count; reduced by 8 per pass, loop ends once it is no
//            longer positive (the body always runs at least once).
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm3 \n"  // table for output bytes 0..7
      "movdqa %4,%%xmm4 \n"  // table for output bytes 8..15
      "movdqa %5,%%xmm5 \n"  // table for output bytes 16..23

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"     // source bytes 0..15
      "movdqu 0x4(%0),%%xmm1 \n"  // source bytes 4..19
      "movdqu 0x8(%0),%%xmm2 \n"  // source bytes 8..23
      "lea 0x18(%0),%0 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"  // only the low 8 bytes of each result are kept
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Expands 16-bit 5:6:5 pixels to 32-bit pixels.
//
// For each 16-bit source word the top 5 bits, middle 6 bits and low 5 bits
// are each widened to 8 bits by a pmulhuw that replicates the field's upper
// bits into the new low bits.  The output bytes of a pixel are, in memory
// order: low field, middle field, top field, 0xff.
//
// src:   source, 16 bytes (8 pixels) read per loop pass.
// dst:   destination, 32 bytes written per loop pass.
// width: pixel count; reduced by 8 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
//
// dst is rebased to (dst - 2 * src) up front so the store address
// (%1,%0,2) tracks the destination while only src is advanced.
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"  // 5-bit -> 8-bit multiplier (0x0108/word)
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x20802080,%%eax \n"  // 6-bit -> 8-bit multiplier (0x2080/word)
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"  // 0xf800: top 5 bits
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xa,%%xmm4 \n"
      "psrlw $0x5,%%xmm4 \n"  // 0x07e0: middle 6 bits
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"  // 0xff00: opaque high byte
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"     // top field
      "psllw $0xb,%%xmm2 \n"      // low field moved to the top 5 bits
      "pmulhuw %%xmm5,%%xmm1 \n"  // widen to 8 bits
      "pmulhuw %%xmm5,%%xmm2 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"  // word = top field : low field
      "pand %%xmm4,%%xmm0 \n"     // middle field
      "pmulhuw %%xmm6,%%xmm0 \n"  // widen to 8 bits
      "por %%xmm7,%%xmm0 \n"      // word = 0xff : middle field
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"  // interleave into 4-byte pixels 0..3
      "punpckhbw %%xmm0,%%xmm2 \n"  // pixels 4..7
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

// Expands 16-bit 1:5:5:5 pixels to 32-bit pixels.
//
// The three 5-bit fields of each source word are widened to 8 bits with
// pmulhuw.  The single top bit is spread across a whole byte by an
// arithmetic right shift, giving 0xff or 0x00.  The output bytes of a pixel
// are, in memory order: low field, middle field, high field, top-bit byte.
//
// src:   source, 16 bytes (8 pixels) read per loop pass.
// dst:   destination, 32 bytes written per loop pass.
// width: pixel count; reduced by 8 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
//
// dst is rebased to (dst - 2 * src) up front so the store address
// (%1,%0,2) tracks the destination while only src is advanced.
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"  // 5-bit -> 8-bit multiplier, field at top
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x42004200,%%eax \n"  // 5-bit -> 8-bit multiplier, field at bit 5
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"  // 0xf800
      "movdqa %%xmm3,%%xmm4 \n"
      "psrlw $0x6,%%xmm4 \n"  // 0x03e0: middle field
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"  // 0xff00
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "psllw $0x1,%%xmm1 \n"   // high field moved to the top 5 bits
      "psllw $0xb,%%xmm2 \n"   // low field moved to the top 5 bits
      "pand %%xmm3,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"  // widen to 8 bits
      "pmulhuw %%xmm5,%%xmm1 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"  // word = high field : low field
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"     // middle field
      "psraw $0x8,%%xmm2 \n"      // sign-extend the top bit over the high byte
      "pmulhuw %%xmm6,%%xmm0 \n"  // widen to 8 bits
      "pand %%xmm7,%%xmm2 \n"     // keep only the high byte (0xff or 0x00)
      "por %%xmm2,%%xmm0 \n"      // word = top-bit byte : middle field
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"  // interleave into 4-byte pixels 0..3
      "punpckhbw %%xmm0,%%xmm2 \n"  // pixels 4..7
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

// Expands 16-bit 4:4:4:4 pixels to 32-bit pixels by duplicating every nibble
// into both halves of an output byte (0xN -> 0xNN).
//
// src:   source, 16 bytes (8 pixels) read per loop pass.
// dst:   destination, 32 bytes written per loop pass.
// width: pixel count; reduced by 8 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
//
// dst is rebased to (dst - 2 * src) up front so the store address
// (%1,%0,2) tracks the destination while only src is advanced.
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0xf0f0f0f,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"  // 0x0f in every byte: low nibbles
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x4,%%xmm5 \n"  // 0xf0 in every byte: high nibbles
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"  // low nibble of each byte
      "pand %%xmm5,%%xmm2 \n"  // high nibble of each byte
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "psllw $0x4,%%xmm1 \n"
      "psrlw $0x4,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"  // low nibbles duplicated to full bytes
      "por %%xmm3,%%xmm2 \n"  // high nibbles duplicated to full bytes
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // interleave: pixels 0..3
      "punpckhbw %%xmm2,%%xmm1 \n"  // pixels 4..7
      "movdqu %%xmm0,0x00(%1,%0,2) \n"
      "movdqu %%xmm1,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Narrows 4-byte pixels to packed 3-byte pixels by dropping the fourth byte
// of every pixel; the three kept bytes stay in order
// (shuffle table kShuffleMaskARGBToRGB24).
//
// Each 16-byte input vector shuffles down to 12 valid bytes followed by four
// zero bytes; byte shifts and ORs then splice four such vectors into three
// full 16-byte output vectors.
//
// src:   source, 64 bytes (16 pixels) read per loop pass.
// dst:   destination, 48 bytes written per loop pass.
// width: pixel count; reduced by 16 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa %3,%%xmm6 \n"  // shuffle table

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"  // 12 packed bytes + 4 zero bytes each
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"   // remaining 8 bytes of vector 1
      "pslldq $0xc,%%xmm4 \n"   // first 4 bytes of vector 1 -> top
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"    // output 0 = 12 + 4
      "pslldq $0x8,%%xmm5 \n"   // first 8 bytes of vector 2 -> top
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"    // output 1 = 8 + 8
      "psrldq $0x8,%%xmm2 \n"   // remaining 4 bytes of vector 2
      "pslldq $0x4,%%xmm3 \n"   // 12 bytes of vector 3 -> top
      "por %%xmm3,%%xmm2 \n"    // output 2 = 4 + 12
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}

// Narrows 4-byte pixels to packed 3-byte pixels by dropping the fourth byte
// of every pixel and reversing the order of the three kept bytes
// (shuffle table kShuffleMaskARGBToRAW).
//
// Each 16-byte input vector shuffles down to 12 valid bytes followed by four
// zero bytes; byte shifts and ORs then splice four such vectors into three
// full 16-byte output vectors.
//
// src:   source, 64 bytes (16 pixels) read per loop pass.
// dst:   destination, 48 bytes written per loop pass.
// width: pixel count; reduced by 16 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa %3,%%xmm6 \n"  // shuffle table

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"  // 12 packed bytes + 4 zero bytes each
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"   // remaining 8 bytes of vector 1
      "pslldq $0xc,%%xmm4 \n"   // first 4 bytes of vector 1 -> top
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"    // output 0 = 12 + 4
      "pslldq $0x8,%%xmm5 \n"   // first 8 bytes of vector 2 -> top
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"    // output 1 = 8 + 8
      "psrldq $0x8,%%xmm2 \n"   // remaining 4 bytes of vector 2
      "pslldq $0x4,%%xmm3 \n"   // 12 bytes of vector 3 -> top
      "por %%xmm3,%%xmm2 \n"    // output 2 = 4 + 12
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24
// Dword indices: the three packed dwords of each 128-bit lane (0,1,2 and
// 4,5,6) are gathered first, the two zero dwords (3 and 7) go last.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

// Narrows 4-byte pixels to packed 3-byte pixels by dropping the fourth byte
// of every pixel (shuffle table kShuffleMaskARGBToRGB24), 32 pixels per pass.
//
// vpshufb packs 12 bytes per 128-bit lane, vpermd gathers them into 24
// contiguous bytes per register, and vpermq/vpor splice four such registers
// into three full 32-byte outputs.
//
// src:   source, 128 bytes (32 pixels) read per loop pass.
// dst:   destination, 96 bytes written per loop pass.
// width: pixel count; reduced by 32 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"  // shuffle table in both lanes
      "vmovdqa %4,%%ymm7 \n"         // vpermd indices

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24
// Byte indices for vpermt2b over a pair of 32-byte registers: 0..31 select
// from the first register, 32..63 from the second.  Indices whose low two
// bits are both set (3, 7, 11, ...) never appear, so the fourth byte of every
// 4-byte pixel is skipped.  Each table starts where the previous one stopped.
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

// Narrows 4-byte pixels to packed 3-byte pixels by dropping the fourth byte
// of every pixel, using one two-register byte permute per 32-byte output.
//
// src:   source, 128 bytes (32 pixels) read per loop pass.
// dst:   destination, 96 bytes written per loop pass.
// width: pixel count; reduced by 32 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa %3,%%ymm5 \n"  // indices for output bytes 0..31
      "vmovdqa %4,%%ymm6 \n"  // indices for output bytes 32..63
      "vmovdqa %5,%%ymm7 \n"  // indices for output bytes 64..95

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"  // select from ymm0:ymm1 into ymm0
      "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"  // select from ymm1:ymm2 into ymm1
      "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"  // select from ymm2:ymm3 into ymm2
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),                // %0
        "+r"(dst),                // %1
        "+r"(width)               // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
// Narrows 4-byte pixels to packed 3-byte pixels by dropping the fourth byte
// of every pixel and reversing the three kept bytes
// (shuffle table kShuffleMaskARGBToRAW), 32 pixels per pass.
//
// vpshufb packs 12 bytes per 128-bit lane, vpermd (indices from
// kPermdRGB24_AVX) gathers them into 24 contiguous bytes per register, and
// vpermq/vpor splice four such registers into three full 32-byte outputs.
//
// src:   source, 128 bytes (32 pixels) read per loop pass.
// dst:   destination, 96 bytes written per loop pass.
// width: pixel count; reduced by 32 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"  // shuffle table in both lanes
      "vmovdqa %4,%%ymm7 \n"         // vpermd indices

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

// Packs 32-bit pixels into 16-bit 5:6:5 pixels.
//
// From each source dword: the top 5 bits of byte 0 go to output bits 0..4,
// the top 6 bits of byte 1 to bits 5..10 and the top 5 bits of byte 2 to
// bits 11..15.  Byte 3 is discarded.
//
// src:   source, 16 bytes (4 pixels) read per loop pass.
// dst:   destination, 8 bytes written per loop pass.
// width: pixel count; reduced by 4 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"  // 0x0000001f
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"  // 0x000007e0
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"  // 0xfffff800

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"   // drop byte 3
      "psrld $0x3,%%xmm1 \n"   // byte 0 top bits -> bits 0..4
      "psrld $0x5,%%xmm2 \n"   // byte 1 top bits -> bits 5..10
      "psrad $0x10,%%xmm0 \n"  // byte 2 -> bits 8..15, sign-extended so the
                               // signed pack below does not saturate
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"  // dwords -> words
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory""cc""xmm0""xmm1""xmm2""xmm3""xmm4""xmm5""xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"
      "vpslld $0xb,%%ymm3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory""cc""xmm0""xmm1""xmm2""xmm3""xmm4""xmm5""xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// Packs 32-bit pixels into 16-bit 1:5:5:5 pixels.
//
// From each source dword: the top 5 bits of byte 0 go to output bits 0..4,
// the top 5 bits of byte 1 to bits 5..9, the top 5 bits of byte 2 to bits
// 10..14 and the top bit of byte 3 to bit 15.
//
// src:   source, 16 bytes (4 pixels) read per loop pass.
// dst:   destination, 8 bytes written per loop pass.
// width: pixel count; reduced by 4 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1b,%%xmm4 \n"  // 0x0000001f
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x5,%%xmm5 \n"  // 0x000003e0
      "movdqa %%xmm4,%%xmm6 \n"
      "pslld $0xa,%%xmm6 \n"  // 0x00007c00
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "pslld $0xf,%%xmm7 \n"  // 0xffff8000

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "psrad $0x10,%%xmm0 \n"  // top bit -> bit 15, sign-extended so the
                               // signed pack below does not saturate
      "psrld $0x3,%%xmm1 \n"   // byte 0 top bits -> bits 0..4
      "psrld $0x6,%%xmm2 \n"   // byte 1 top bits -> bits 5..9
      "psrld $0x9,%%xmm3 \n"   // byte 2 top bits -> bits 10..14
      "pand %%xmm7,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "pand %%xmm5,%%xmm2 \n"
      "pand %%xmm6,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"  // dwords -> words
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

// Packs 32-bit pixels into 16-bit 4:4:4:4 pixels by keeping the high nibble
// of every source byte.  Each pair of source bytes becomes one output byte:
// the even byte's nibble in the low half, the odd byte's nibble in the high
// half.
//
// src:   source, 16 bytes (4 pixels) read per loop pass.
// dst:   destination, 8 bytes written per loop pass.
// width: pixel count; reduced by 4 per pass, loop ends once it is no longer
//        positive (the body always runs at least once).
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xc,%%xmm4 \n"  // 0xf000: high nibble of odd bytes
      "movdqa %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm3 \n"  // 0x00f0: high nibble of even bytes

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm3,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "psrlq $0x4,%%xmm0 \n"  // 0x00f0 -> 0x000f (masked, so no bits cross)
      "psrlq $0x8,%%xmm1 \n"  // 0xf000 -> 0x00f0
      "por %%xmm1,%%xmm0 \n"  // both nibbles in the low byte of each word
      "packuswb %%xmm0,%%xmm0 \n"  // words -> bytes
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      // One string per clobber; unseparated literals would concatenate.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
(1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits so vpand can zero out the other
bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
could be used for Green - (1024+4) putting the 10 bit green in the lsb.  Alpha
would be a simple multiplier to shift it into position.  It wants a gap of 10
above the green.  Green is 10 bits, so there are 6 bits in the low short.  4
more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
and then a shift of 4 is a multiply of 16, so (4*16) = 64.  Then shift the
result left 10 to position the A and G channels.
*/


// Byte shuffle tables used with pshufb/vpshufb by the AR30 row converters.
// An index of 128 (high bit set) yields a zero byte, so each table moves
// source bytes 0 and 2 of every 4-byte pixel into the upper byte of a 16-bit
// lane whose lower byte is zero.
// kShuffleRB30: source byte 0 -> low word, source byte 2 -> high word.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

// kShuffleBR30: as above with source bytes 0 and 2 exchanged (used for ABGR).
static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

// 32-bit constants that the converters broadcast to every pixel. The upper
// 16 bits apply to the high word of a pixel, the lower 16 bits to the low word.
static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;  // 0x40400404
static const uint32_t kMaskRB10 = 0x3ff003ff;  // keeps bits 0-9 and 20-29
static const uint32_t kMaskAG10 = 0xc000ff00;  // keeps bits 8-15 and 30-31
static const uint32_t kMulAG10 = 64 * 65536 + 1028;  // 0x00400404

// Converts 4 ARGB pixels (16 bytes) per loop pass to 4 AR30 pixels (16 bytes):
// 2 bits of alpha in bits 30-31 and three 10 bit channels below it.
// src:   source pixels, read 16 bytes at a time.
// dst:   destination pixels, written 16 bytes at a time.
// width: pixel count; 4 pixels are consumed per pass and the loop repeats
//        while the remaining count is positive.
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"  // multiplier for RB
      "movd %5,%%xmm4 \n"  // mask for R10 B10
      "movd %6,%%xmm5 \n"  // mask for AG
      "movd %7,%%xmm6 \n"  // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"  // broadcast each 32-bit constant
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"  // %1 = dst - src, so (%1,%0) addresses dst

      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ARGB pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
      "pand %%xmm5,%%xmm0 \n"  // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4  B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"  // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}

// Converts 4 ABGR pixels (16 bytes) per loop pass to 4 AR30 pixels (16 bytes).
// Identical to the ARGB variant except that the kShuffleBR30 table exchanges
// source bytes 0 and 2 of each pixel before the multiply.
// src:   source pixels, read 16 bytes at a time.
// dst:   destination pixels, written 16 bytes at a time.
// width: pixel count; 4 pixels are consumed per pass and the loop repeats
//        while the remaining count is positive.
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"  // multiplier for RB
      "movd %5,%%xmm4 \n"  // mask for R10 B10
      "movd %6,%%xmm5 \n"  // mask for AG
      "movd %7,%%xmm6 \n"  // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"  // broadcast each 32-bit constant
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"  // %1 = dst - src, so (%1,%0) addresses dst

      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ABGR pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
      "pand %%xmm5,%%xmm0 \n"  // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4  B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"  // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
// Converts 8 ARGB pixels (32 bytes) per loop pass to 8 AR30 pixels (32 bytes).
// src:   source pixels, read 32 bytes at a time.
// dst:   destination pixels, written 32 bytes at a time.
// width: pixel count; 8 pixels are consumed per pass and the loop repeats
//        while the remaining count is positive.
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4 \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"  // mask for AG
      "vbroadcastss %7,%%ymm6 \n"  // multiplier for AG
      "sub %0,%1 \n"  // %1 = dst - src, so (%1,%0) addresses dst

      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ARGB pixels
      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4  B10
      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
      "vpslld $10,%%ymm0,%%ymm0 \n"  // A2 x10 G10 x10
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
      "vmovdqu %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
      "add $0x20,%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
// Converts 8 ABGR pixels (32 bytes) per loop pass to 8 AR30 pixels (32 bytes).
// Identical to the ARGB variant except that the kShuffleBR30 table exchanges
// source bytes 0 and 2 of each pixel before the multiply.
// src:   source pixels, read 32 bytes at a time.
// dst:   destination pixels, written 32 bytes at a time.
// width: pixel count; 8 pixels are consumed per pass and the loop repeats
//        while the remaining count is positive.
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4 \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"  // mask for AG
      "vbroadcastss %7,%%ymm6 \n"  // multiplier for AG
      "sub %0,%1 \n"  // %1 = dst - src, so (%1,%0) addresses dst

      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ABGR pixels
      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4  B10
      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
      "vpslld $10,%%ymm0,%%ymm0 \n"  // A2 x10 G10 x10
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
      "vmovdqu %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
      "add $0x20,%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif

// clang-format off

// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// RGBTOY expands to the asm loop shared by the SSSE3 "to Y" row functions.
// Register contract expected from the enclosing asm statement:
//   %0 = source pointer, %1 = destination pointer, %2 = remaining pixel count,
//   xmm4 = per-byte channel coefficients (treated as unsigned by pmaddubsw),
//   xmm5 = value subtracted from every source byte before the multiply.
// xmm0-xmm3 and xmm6 are scratch. Each pass reads 64 source bytes and writes
// 16 result bytes, and the loop repeats while the remaining count is positive.
// round parameter is register containing value to add before shift.
#define RGBTOY(round)                            \
  "1: \n" \
  "movdqu (%0),%%xmm0 \n" /* fetch 4 x 16 source bytes */ \
  "movdqu 0x10(%0),%%xmm1 \n" \
  "movdqu 0x20(%0),%%xmm2 \n" \
  "movdqu 0x30(%0),%%xmm3 \n" \
  "psubb %%xmm5,%%xmm0 \n" /* subtract xmm5 from every source byte */ \
  "psubb %%xmm5,%%xmm1 \n" \
  "psubb %%xmm5,%%xmm2 \n" \
  "psubb %%xmm5,%%xmm3 \n" \
  "movdqu %%xmm4,%%xmm6 \n" /* coefficients are the destination operand */ \
  "pmaddubsw %%xmm0,%%xmm6 \n" /* coeff * source byte, adjacent pairs summed */ \
  "movdqu %%xmm4,%%xmm0 \n" \
  "pmaddubsw %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm4,%%xmm1 \n" \
  "pmaddubsw %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm4,%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "lea 0x40(%0),%0 \n" \
  "phaddw %%xmm0,%%xmm6 \n" /* one 16-bit sum per 4 source bytes */ \
  "phaddw %%xmm2,%%xmm1 \n" \
  "prefetcht0 1280(%0) \n" \
  "paddw %%" #round ",%%xmm6 \n" /* add the round register before the shift */ \
  "paddw %%" #round ",%%xmm1 \n" \
  "psrlw $0x8,%%xmm6 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm6 \n" /* 16 words -> 16 bytes */ \
  "movdqu %%xmm6,(%1) \n" /* store 16 result bytes */ \
  "lea 0x10(%1),%1 \n" \
  "sub $0x10,%2 \n" \
  "jg 1b \n"

// RGBTOY_AVX2 expands to the asm loop shared by the AVX2 "to Y" row functions.
// Register contract expected from the enclosing asm statement:
//   %0 = source pointer, %1 = destination pointer, %2 = remaining pixel count,
//   ymm4 = per-byte channel coefficients, ymm5 = value subtracted from every
//   source byte before the multiply, ymm6 = dword indices for vpermd,
//   round = name of the ymm register added to each 16-bit sum before the shift.
// ymm0-ymm3 are scratch. Each pass reads 128 source bytes and writes 32 result
// bytes, and the loop repeats while the remaining count is positive. The
// expansion ends with vzeroupper, so callers do not need to add their own.
#define RGBTOY_AVX2(round)                                       \
  "1: \n"                 \
  "vmovdqu (%0),%%ymm0 \n"                 \
  "vmovdqu 0x20(%0),%%ymm1 \n"                 \
  "vmovdqu 0x40(%0),%%ymm2 \n"                 \
  "vmovdqu 0x60(%0),%%ymm3 \n"                 \
  "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" /* subtract ymm5 from every byte */ \
  "vpsubb %%ymm5, %%ymm1, %%ymm1 \n"                 \
  "vpsubb %%ymm5, %%ymm2, %%ymm2 \n"                 \
  "vpsubb %%ymm5, %%ymm3, %%ymm3 \n"                 \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" /* coeff * byte, pairs summed */ \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n"                 \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n"                 \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n"                 \
  "lea 0x80(%0),%0 \n"                 \
  "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */  \
  "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"                 \
  "prefetcht0 1280(%0) \n"                 \
  "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */             \
  "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
  "vpsrlw $0x8,%%ymm0,%%ymm0 \n"                 \
  "vpsrlw $0x8,%%ymm2,%%ymm2 \n"                 \
  "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */  \
  "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
  "vmovdqu %%ymm0,(%1) \n" /* store 32 result bytes */ \
  "lea 0x20(%1),%1 \n"                 \
  "sub $0x20,%2 \n"                 \
  "jg 1b \n"                 \
  "vzeroupper \n"

// clang-format on

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Loads the constants the RGBTOY loop expects: xmm4 = kARGBToY coefficients,
// xmm5 = kSub128 (subtracted from every source byte), xmm7 = kAddY16 (added to
// each 16-bit sum before the shift by 8).
// width is the pixel count; 16 pixels are consumed per loop pass and the loop
// repeats while the remaining count is positive.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
// xmm4 = kARGBToYJ coefficients; xmm5 = kSub128, which RGBTOY uses both as
// the value subtracted from the source bytes and as the rounding register.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same loop as ARGBToYJRow but with the kRGBAToYJ coefficient table.
// xmm4 = kRGBAToYJ coefficients; xmm5 = kSub128, which RGBTOY uses both as
// the value subtracted from the source bytes and as the rounding register.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Dword indices that restore pixel order after the per-lane vphaddw and
// vpackuswb steps in RGBTOY_AVX2.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// Loads the constants the RGBTOY_AVX2 loop expects: ymm4 = kARGBToY
// coefficients, ymm5 = kSub128, ymm7 = kAddY16 (rounding register),
// ymm6 = kPermdARGBToY_AVX.
// width is the pixel count; 32 pixels are consumed per loop pass and the loop
// repeats while the remaining count is positive.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu %6,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
// Loads the constants the RGBTOY_AVX2 loop expects: ymm4 = kABGRToY
// coefficients, ymm5 = kSub128, ymm7 = kAddY16 (rounding register),
// ymm6 = kPermdARGBToY_AVX.
// width is the pixel count; 32 pixels are consumed per loop pass and the loop
// repeats while the remaining count is positive.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu %6,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
// ymm4 = kARGBToYJ coefficients; ymm5 = kSub128, which RGBTOY_AVX2 uses both
// as the value subtracted from the source bytes and as the rounding register;
// ymm6 = kPermdARGBToY_AVX.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu %5,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
// ymm4 = kRGBAToYJ coefficients; ymm5 = kSub128, which RGBTOY_AVX2 uses both
// as the value subtracted from the source bytes and as the rounding register;
// ymm6 = kPermdARGBToY_AVX.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu %5,%%ymm6 \n"

      // RGBTOY_AVX2 already ends with vzeroupper, so none is appended here.
      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      // Clobbers are separate comma separated strings; adjacent literals
      // would merge into one invalid register name.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),                     // %5
        "m"(kARGBToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory""cc""xmm0""xmm1""xmm2""xmm6""xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"
      "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kARGBToV),                     // %6
        "m"(kARGBToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory""cc""xmm0""xmm1""xmm2""xmm3""xmm4""xmm5""xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=88 H=89 G=88

¤ Dauer der Verarbeitung: 0.42 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.