Quelle SkBlitter_ARGB32.cpp

Sprache: C

/*
* Copyright 2006 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/

#include "include/core/SkColor.h"
#include "include/core/SkColorPriv.h"
#include "include/core/SkColorType.h"
#include "include/core/SkPaint.h"
#include "include/core/SkPixmap.h"
#include "include/core/SkRect.h"
#include "include/core/SkTypes.h"
#include "include/private/SkColorData.h"
#include "include/private/base/SkCPUTypes.h"
#include "include/private/base/SkDebug.h"
#include "include/private/base/SkMalloc.h"
#include "include/private/base/SkTo.h"
#include "src/base/SkUtils.h"
#include "src/base/SkVx.h"
#include "src/core/SkBlitMask.h"
#include "src/core/SkBlitRow.h"
#include "src/core/SkCoreBlitters.h"
#include "src/core/SkMask.h"
#include "src/core/SkMemset.h"
#include "src/shaders/SkShaderBase.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>

static inline int upscale_31_to_32(int value) {
    SkASSERT((unsigned)value <= 31);
    return value + (value >> 4);
}

static inline int blend_32(int src, int dst, int scale) {
    SkASSERT((unsigned)src <= 0xFF);
    SkASSERT((unsigned)dst <= 0xFF);
    SkASSERT((unsigned)scale <= 32);
    return dst + ((src - dst) * scale >> 5);
}

static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
                                     SkPMColor dst, uint16_t mask) {
    if (mask == 0) {
        return dst;
    }

    /*  We want all of these in 5bits, hence the shifts in case one of them
     *  (green) is 6bits.
     */
    int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
    int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
    int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);

    // Now upscale them to 0..32, so we can use blend32
    maskR = upscale_31_to_32(maskR);
    maskG = upscale_31_to_32(maskG);
    maskB = upscale_31_to_32(maskB);

    // srcA has been upscaled to 256 before passed into this function
    maskR = maskR * srcA >> 8;
    maskG = maskG * srcA >> 8;
    maskB = maskB * srcA >> 8;

    int dstA = SkGetPackedA32(dst);
    int dstR = SkGetPackedR32(dst);
    int dstG = SkGetPackedG32(dst);
    int dstB = SkGetPackedB32(dst);

    // Subtract 1 from srcA to bring it back to [0-255] to compare against dstA, alpha needs to
    // use either the min or the max of the LCD coverages. See https:/skbug.com/40037823
    int maskA = (srcA-1) < dstA ? std::min(maskR, std::min(maskG, maskB))
                                : std::max(maskR, std::max(maskG, maskB));

    return SkPackARGB32(blend_32(0xFF, dstA, maskA),
                        blend_32(srcR, dstR, maskR),
                        blend_32(srcG, dstG, maskG),
                        blend_32(srcB, dstB, maskB));
}

static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
                                           SkPMColor dst, uint16_t mask,
                                           SkPMColor opaqueDst) {
    if (mask == 0) {
        return dst;
    }

    if (0xFFFF == mask) {
        return opaqueDst;
    }

    /*  We want all of these in 5bits, hence the shifts in case one of them
     *  (green) is 6bits.
     */
    int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
    int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
    int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);

    // Now upscale them to 0..32, so we can use blend32
    maskR = upscale_31_to_32(maskR);
    maskG = upscale_31_to_32(maskG);
    maskB = upscale_31_to_32(maskB);

    int dstA = SkGetPackedA32(dst);
    int dstR = SkGetPackedR32(dst);
    int dstG = SkGetPackedG32(dst);
    int dstB = SkGetPackedB32(dst);

    // Opaque src alpha always uses the max of the LCD coverages.
    int maskA = std::max(maskR, std::max(maskG, maskB));

    // LCD blitting is only supported if the dst is known/required
    // to be opaque
    return SkPackARGB32(blend_32(0xFF, dstA, maskA),
                        blend_32(srcR, dstR, maskR),
                        blend_32(srcG, dstG, maskG),
                        blend_32(srcB, dstB, maskB));
}

// TODO: rewrite at least the SSE code here.  It's miserable.

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <emmintrin.h>

    // The following (left) shifts cause the top 5 bits of the mask components to
    // line up with the corresponding components in an SkPMColor.
    // Note that the mask's RGB16 order may differ from the SkPMColor order.
    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    #endif

    static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
        //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
        // mask stores 16-bit values (compressed three channels) interleaved with zeros.
        // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
        __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_B32_SHIFT));

        // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
        __m128i aMin = _mm_min_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
                       _mm_min_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                    _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
        __m128i aMax = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
                       _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                    _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
        // srcA has been biased to [0-256], so compare srcA against (dstA+1)
        __m128i a = _mm_cmplt_epi32(srcA,
                                    _mm_and_si128(
                                            _mm_add_epi32(dst, _mm_set1_epi32(1 << SK_A32_SHIFT)),
                                            _mm_set1_epi32(SK_A32_MASK)));
        // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
        a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));

        // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
        //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
        mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of eight
        // 16-bit values, padded by zero.
        __m128i maskLo, maskHi;
        // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
        maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
        // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
        maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

        // Upscale from 0..31 to 0..32
        // (allows to replace division by left-shift further down)
        // Left-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
        maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

        // Multiply each component of maskLo and maskHi by srcA
        maskLo = _mm_mullo_epi16(maskLo, srcA);
        maskHi = _mm_mullo_epi16(maskHi, srcA);

        // Left shift mask components by 8 (divide by 256)
        maskLo = _mm_srli_epi16(maskLo, 8);
        maskHi = _mm_srli_epi16(maskHi, 8);

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
        __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
        // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
        __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

        // mask = (src - dst) * mask
        maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
        maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

        // mask = (src - dst) * mask >> 5
        maskLo = _mm_srai_epi16(maskLo, 5);
        maskHi = _mm_srai_epi16(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
        __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

        // Pack into 4 32bit dst pixels.
        // resultLo and resultHi contain eight 16-bit components (two pixels) each.
        // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
        // clamping to 255 if necessary.
        return _mm_packus_epi16(resultLo, resultHi);
    }

    static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // mask stores 16-bit values (shown as high and low bytes) interleaved with
        // zeros
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
        __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_B32_SHIFT));

        // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
        __m128i a = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
                    _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                 _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));

        // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
        //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
        mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of eight
        // 16-bit values, padded by zero.
        __m128i maskLo, maskHi;
        // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
        maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
        // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
        maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

        // Upscale from 0..31 to 0..32
        // (allows to replace division by left-shift further down)
        // Left-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
        maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
        __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
        // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
        __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

        // mask = (src - dst) * mask
        maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
        maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

        // mask = (src - dst) * mask >> 5
        maskLo = _mm_srai_epi16(maskLo, 5);
        maskHi = _mm_srai_epi16(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
        __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

        // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
        // clamping to 255 if necessary.
        return _mm_packus_epi16(resultLo, resultHi);
    }

    void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
        if (width <= 0) {
            return;
        }

        int srcA = SkColorGetA(src);
        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);

        srcA = SkAlpha255To256(srcA);

        if (width >= 4) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            while (((size_t)dst & 0x0F) != 0) {
                *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
                mask++;
                dst++;
                width--;
            }

            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Set alpha to 0xFF and replicate source four times in SSE register.
            __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
            // Interleave with zeros to get two sets of four 16-bit values.
            src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
            // Set srcA_sse to contain eight copies of srcA, padded with zero.
            // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
            __m128i srcA_sse = _mm_set1_epi16(srcA);
            while (width >= 4) {
                // Load four destination pixels into dst_sse.
                __m128i dst_sse = _mm_load_si128(d);
                // Load four 16-bit masks into lower half of mask_sse.
                __m128i mask_sse = _mm_loadl_epi64((const __m128i*)mask);

                // Check whether masks are equal to 0 and get the highest bit
                // of each byte of result, if masks are all zero, we will get
                // pack_cmp to 0xFFFF
                int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                                 _mm_setzero_si128()));

                // if mask pixels are not all zero, we will blend the dst pixels
                if (pack_cmp != 0xFFFF) {
                    // Unpack 4 16bit mask pixels to
                    // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                    //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                    mask_sse = _mm_unpacklo_epi16(mask_sse,
                                                  _mm_setzero_si128());

                    // Process 4 32bit dst pixels
                    __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
                    _mm_store_si128(d, result);
                }

                d++;
                mask += 4;
                width -= 4;
            }

            dst = reinterpret_cast<SkPMColor*>(d);
        }

        while (width > 0) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }
    }

    void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                                   SkColor src, int width, SkPMColor opaqueDst) {
        if (width <= 0) {
            return;
        }

        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);

        if (width >= 4) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            while (((size_t)dst & 0x0F) != 0) {
                *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
                mask++;
                dst++;
                width--;
            }

            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Set alpha to 0xFF and replicate source four times in SSE register.
            __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
            // Set srcA_sse to contain eight copies of srcA, padded with zero.
            // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
            src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
            while (width >= 4) {
                // Load four destination pixels into dst_sse.
                __m128i dst_sse = _mm_load_si128(d);
                // Load four 16-bit masks into lower half of mask_sse.
                __m128i mask_sse = _mm_loadl_epi64((const __m128i*)mask);

                // Check whether masks are equal to 0 and get the highest bit
                // of each byte of result, if masks are all zero, we will get
                // pack_cmp to 0xFFFF
                int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                                 _mm_setzero_si128()));

                // if mask pixels are not all zero, we will blend the dst pixels
                if (pack_cmp != 0xFFFF) {
                    // Unpack 4 16bit mask pixels to
                    // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                    //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                    mask_sse = _mm_unpacklo_epi16(mask_sse,
                                                  _mm_setzero_si128());

                    // Process 4 32bit dst pixels
                    __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
                    _mm_store_si128(d, result);
                }

                d++;
                mask += 4;
                width -= 4;
            }

            dst = reinterpret_cast<SkPMColor*>(d);
        }

        while (width > 0) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }
    }

#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

    #define NEON_A (SK_A32_SHIFT / 8)
    #define NEON_R (SK_R32_SHIFT / 8)
    #define NEON_G (SK_G32_SHIFT / 8)
    #define NEON_B (SK_B32_SHIFT / 8)

    static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
        int16x8_t src_wide, dst_wide;

        src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
        dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));

        src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);

        dst_wide += vshrq_n_s16(src_wide, 5);

        return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
    }

    void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
                               SkColor color, int width,
                               SkPMColor opaqueDst) {
        int colR = SkColorGetR(color);
        int colG = SkColorGetG(color);
        int colB = SkColorGetB(color);

        uint8x8_t vcolA = vdup_n_u8(0xFF);
        uint8x8_t vcolR = vdup_n_u8(colR);
        uint8x8_t vcolG = vdup_n_u8(colG);
        uint8x8_t vcolB = vdup_n_u8(colB);

        while (width >= 8) {
            uint8x8x4_t vdst;
            uint16x8_t vmask;
            uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

            vdst = vld4_u8((uint8_t*)dst);
            vmask = vld1q_u16(src);

            // Get all the color masks on 5 bits
            vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
            vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                 SK_B16_BITS + SK_R16_BITS + 1);
            vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

            // Upscale to 0..32
            vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
            vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
            vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
            // Opaque srcAlpha always uses the max of the 3 LCD coverage values
            vmaskA = vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB));

            vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
            vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
            vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
            vdst.val[NEON_A] = blend_32_neon(vcolA, vdst.val[NEON_A], vmaskA);

            vst4_u8((uint8_t*)dst, vdst);

            dst += 8;
            src += 8;
            width -= 8;
        }

        // Leftovers
        for (int i = 0; i < width; i++) {
            dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
        }
    }

    void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                        SkColor color, int width, SkPMColor) {
        int colA = SkColorGetA(color);
        int colR = SkColorGetR(color);
        int colG = SkColorGetG(color);
        int colB = SkColorGetB(color);

        // srcA in [0-255] to compare vs dstA
        uint16x8_t vcolACmp = vdupq_n_u16(colA);
        colA = SkAlpha255To256(colA);

        uint16x8_t vcolA = vdupq_n_u16(colA); // srcA in [0-256] to combine with coverage
        uint8x8_t vcolR = vdup_n_u8(colR);
        uint8x8_t vcolG = vdup_n_u8(colG);
        uint8x8_t vcolB = vdup_n_u8(colB);

        while (width >= 8) {
            uint8x8x4_t vdst;
            uint16x8_t vmask;
            uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

            vdst = vld4_u8((uint8_t*)dst);
            vmask = vld1q_u16(src);

            // Get all the color masks on 5 bits
            vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
            vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                 SK_B16_BITS + SK_R16_BITS + 1);
            vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

            // Upscale to 0..32
            vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
            vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
            vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

            vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
            vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
            vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

            // Select either the min or the max of the RGB mask values, depending on if the src
            // alpha is less than the dst alpha.
            vmaskA = vbslq_u16(vcleq_u16(vcolACmp, vmovl_u8(vdst.val[NEON_A])), // srcA < dstA
                               vminq_u16(vmaskR, vminq_u16(vmaskG, vmaskB)),    // ? min(r,g,b)
                               vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB)));   // : max(r,g,b)

            vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
            vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
            vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
            // vmaskA already includes vcolA so blend against 0xFF
            vdst.val[NEON_A] = blend_32_neon(vdup_n_u8(0xFF), vdst.val[NEON_A], vmaskA);
            vst4_u8((uint8_t*)dst, vdst);

            dst += 8;
            src += 8;
            width -= 8;
        }

        for (int i = 0; i < width; i++) {
            dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
        }
    }

#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

    // The following (left) shifts cause the top 5 bits of the mask components to
    // line up with the corresponding components in an SkPMColor.
    // Note that the mask's RGB16 order may differ from the SkPMColor order.
    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvslli_w(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvslli_w(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvslli_w(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_B16x5_B32x5_SHIFT))
    #endif

    static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
        //         0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
        //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
        //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
        //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
        // mask stores 16-bit values (compressed three channels) interleaved with zeros.
        // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
        //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
        //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)

        __m256i xv_zero = __lasx_xvldi(0);

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
        //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
        __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
                                   __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7R, 0)
        __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
                                   __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
        __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
                                   __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));

        // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
        __m256i aMin = __lasx_xvmin_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                       __lasx_xvmin_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                      __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
        __m256i aMax = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                       __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                      __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
        // srcA has been biased to [0-256], so compare srcA against (dstA+1)
        __m256i a = __lasx_xvmskltz_w(srcA -
                                    __lasx_xvand_v(
                                           __lasx_xvadd_w(dst,
                                                          __lasx_xvreplgr2vr_w(1 << SK_A32_SHIFT)),
                                           __lasx_xvreplgr2vr_w(SK_A32_MASK)));
        // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
        a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));

        // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (m0A, m0R, m0G, m0B, m1R, m1R, m1G, m1B,
        //         m2A, m2R, m2G, m2B, m3R, m3R, m3G, m3B,
        //         m4A, m4R, m4G, m4B, m5R, m5R, m5G, m5B,
        //         m6A, m6R, m6G, m6B, m7R, m7R, m7G, m7B)
        mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of sixteen
        // 16-bit values, padded by zero.
        __m256i maskLo, maskHi;
        // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
        //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
        maskLo = __lasx_xvilvl_b(xv_zero, mask);
        // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
        //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
        maskHi = __lasx_xvilvh_b(xv_zero, mask);

        // Upscale from 0..31 to 0..32
        // (allows to replace division by left-shift further down)
        // Left-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
        maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

        // Multiply each component of maskLo and maskHi by srcA
        maskLo = __lasx_xvmul_h(maskLo, srcA);
        maskHi = __lasx_xvmul_h(maskHi, srcA);

        // Left shift mask components by 8 (divide by 256)
        maskLo = __lasx_xvsrli_h(maskLo, 8);
        maskHi = __lasx_xvsrli_h(maskHi, 8);

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
        //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
        __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
        // dstLo = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0)
        //          d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
        __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);

        // mask = (src - dst) * mask
        maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
        maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

        // mask = (src - dst) * mask >> 5
        maskLo = __lasx_xvsrai_h(maskLo, 5);
        maskHi = __lasx_xvsrai_h(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
        __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

        // Pack into 8 32bit dst pixels.
        // resultLo and resultHi contain sixteen 16-bit components (four pixels) each.
        // Merge into one LASX regsiter with 32 8-bit values (eight pixels),
        // clamping to 255 if necessary.
        __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
        __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
        return __lasx_xvpickev_b(tmph, tmpl);
    }

    static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
        //         0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // mask stores 16-bit values (shown as high and low bytes) interleaved with
        // zeros
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
        //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
        //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)

        __m256i xv_zero = __lasx_xvldi(0);

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
        //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
        __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
                                   __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0,
        //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
        __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
                                   __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B,
        //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
        __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
                                   __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));

        // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
        __m256i a = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                    __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                   __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));

        // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3,
        // p4, p5, p6, p7)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
        //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
        //         m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
        //         m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
        mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the 32 8-bit values from mask into two sets of sixteen
        // 16-bit values, padded by zero.
        __m256i maskLo, maskHi;
        // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
        //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
        maskLo = __lasx_xvilvl_b(xv_zero, mask);
        // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
        //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
        maskHi = __lasx_xvilvh_b(xv_zero, mask);

        // Upscale from 0..31 to 0..32
        // (allows to replace division by left-shift further down)
        // Left-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
        maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0,
        //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
        __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
        // dstLo = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0,
        // dstLo = (d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
        __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);

        // mask = (src - dst) * mask
        maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
        maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

        // mask = (src - dst) * mask >> 5
        maskLo = __lasx_xvsrai_h(maskLo, 5);
        maskHi = __lasx_xvsrai_h(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
        __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

        // Merge into one SSE regsiter with 32 8-bit values (eight pixels),
        // clamping to 255 if necessary.
        __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
        __m256i tmph = __lasx_xvsat_hu(resultHi, 7);

        return __lasx_xvpickev_b(tmph, tmpl);
    }

    void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
        if (width <= 0) {
            return;
        }

        int srcA = SkColorGetA(src);
        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);
        __m256i xv_zero = __lasx_xvldi(0);

        srcA = SkAlpha255To256(srcA);
        if (width >= 8) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            while (((size_t)dst & 0x0F) != 0) {
                *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
                mask++;
                dst++;
                width--;
            }

            __m256i *d = reinterpret_cast<__m256i*>(dst);
            // Set alpha to 0xFF and replicate source eight times in LASX register.
            unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
            __m256i src_lasx = __lasx_xvreplgr2vr_w(skpackargb32);
            // Interleave with zeros to get two sets of eight 16-bit values.
            src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
            // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
            // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
            //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
            __m256i srcA_lasx = __lasx_xvreplgr2vr_h(srcA);

            while (width >= 8) {
                // Load eight destination pixels into dst_lasx.
                __m256i dst_lasx = __lasx_xvld(d, 0);
                // Load eight 16-bit masks into lower half of mask_lasx.
                __m256i mask_lasx = __lasx_xvld(mask, 0);
                mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};

                int pack_cmp = __lasx_xbz_v(mask_lasx);
                // if mask pixels are not all zero, we will blend the dst pixels
                if (pack_cmp != 1) {
                    // Unpack 8 16bit mask pixels to
                    // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                    //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
                    //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
                    //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
                    mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);

                    // Process 8 32bit dst pixels
                    __m256i result = blend_lcd16_lasx(src_lasx, dst_lasx, mask_lasx, srcA_lasx);
                    __lasx_xvst(result, d, 0);
                }
                d++;
                mask += 8;
                width -= 8;
            }
            dst = reinterpret_cast<SkPMColor*>(d);
        }

        while (width > 0) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }
    }

    void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                               SkColor src, int width, SkPMColor opaqueDst) {
        if (width <= 0) {
            return;
        }

        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);
        __m256i xv_zero = __lasx_xvldi(0);

        if (width >= 8) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            while (((size_t)dst & 0x0F) != 0) {
                *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
                mask++;
                dst++;
                width--;
            }

            __m256i *d = reinterpret_cast<__m256i*>(dst);
            // Set alpha to 0xFF and replicate source four times in LASX register.
            unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
            __m256i src_lasx = __lasx_xvreplgr2vr_w(sk_pack_argb32);
            // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
            // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
            //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
            src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);

            while (width >= 8) {
                // Load eight destination pixels into dst_lasx.
                __m256i dst_lasx = __lasx_xvld(d, 0);
                // Load eight 16-bit masks into lower half of mask_lasx.
                __m256i mask_lasx = __lasx_xvld(mask, 0);
                mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};

                int32_t pack_cmp = __lasx_xbz_v(mask_lasx);
                // if mask pixels are not all zero, we will blend the dst pixels
                if (pack_cmp != 1) {
                    // Unpack 8 16bit mask pixels to
                    // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                    //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
                    //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
                    //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
                    mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
                    // Process 8 32bit dst pixels
                    __m256i result = blend_lcd16_opaque_lasx(src_lasx, dst_lasx, mask_lasx);
                    __lasx_xvst(result, d, 0);
                }
                d++;
                mask += 8;
                width -= 8;
            }

            dst = reinterpret_cast<SkPMColor*>(d);
        }

        while (width > 0) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }
    }

#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

    // The following (left) shifts cause the top 5 bits of the mask components to
    // line up with the corresponding components in an SkPMColor.
    // Note that the mask's RGB16 order may differ from the SkPMColor order.
    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vslli_w(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vsrli_w(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vslli_w(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vsrli_w(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vslli_w(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vsrli_w(x, -SK_B16x5_B32x5_SHIFT))
    #endif

    static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
        //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
        // mask stores 16-bit values (compressed three channels) interleaved with zeros.
        // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

        __m128i v_zero = __lsx_vldi(0);

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
        __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
                                 __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
                                 __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
                                 __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));

        // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
        __m128i aMin = __lsx_vmin_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                       __lsx_vmin_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                    __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
        __m128i aMax = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                       __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                    __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
        // srcA has been biased to [0-256], so compare srcA against (dstA+1)
        __m128i a = __lsx_vmskltz_w(srcA -
                                    __lsx_vand_v(
                                          __lsx_vadd_w(dst,
                                                       __lsx_vreplgr2vr_w(1 << SK_A32_SHIFT)),
                                          __lsx_vreplgr2vr_w(SK_A32_MASK)));
        // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
        a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));

        // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
        //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
        mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of eight
        // 16-bit values, padded by zero.
        __m128i maskLo, maskHi;
        // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
        maskLo = __lsx_vilvl_b(v_zero, mask);
        // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
        maskHi = __lsx_vilvh_b(v_zero, mask);

        // Upscale from 0..31 to 0..32
        // (allows to replace division by left-shift further down)
        // Left-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
        maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

        // Multiply each component of maskLo and maskHi by srcA
        maskLo = __lsx_vmul_h(maskLo, srcA);
        maskHi = __lsx_vmul_h(maskHi, srcA);

        // Left shift mask components by 8 (divide by 256)
        maskLo = __lsx_vsrli_h(maskLo, 8);
        maskHi = __lsx_vsrli_h(maskHi, 8);

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
        __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
        // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
        __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

        // mask = (src - dst) * mask
        maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
        maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

        // mask = (src - dst) * mask >> 5
        maskLo = __lsx_vsrai_h(maskLo, 5);
        maskHi = __lsx_vsrai_h(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
        __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

        // Pack into 4 32bit dst pixels.
        // resultLo and resultHi contain eight 16-bit components (two pixels) each.
        // Merge into one LSX regsiter with sixteen 8-bit values (four pixels),
        // clamping to 255 if necessary.
        __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
        __m128i tmph = __lsx_vsat_hu(resultHi, 7);
        return __lsx_vpickev_b(tmph, tmpl);
    }

    static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // mask stores 16-bit values (shown as high and low bytes) interleaved with
        // zeros
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

        __m128i v_zero = __lsx_vldi(0);

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
        __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
                                 __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
                                 __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
                                 __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));

        // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
        __m128i a = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                    __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                 __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));

        // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
        //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
        mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of eight
        // 16-bit values, padded by zero.
        __m128i maskLo, maskHi;
        // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
        maskLo = __lsx_vilvl_b(v_zero, mask);
        // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
        maskHi = __lsx_vilvh_b(v_zero, mask);

        // Upscale from 0..31 to 0..32
        // (allows to replace division by left-shift further down)
        // Left-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
        maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
        __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
        // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
        __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

        // mask = (src - dst) * mask
        maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
        maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

        // mask = (src - dst) * mask >> 5
        maskLo = __lsx_vsrai_h(maskLo, 5);
        maskHi = __lsx_vsrai_h(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
        __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

        // Merge into one LSX regsiter with sixteen 8-bit values (four pixels),
        // clamping to 255 if necessary.
        __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
        __m128i tmph = __lsx_vsat_hu(resultHi, 7);
        return __lsx_vpickev_b(tmph, tmpl);
    }

    void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
        if (width <= 0) {
            return;
        }

        int srcA = SkColorGetA(src);
        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);
        __m128i v_zero = __lsx_vldi(0);

        srcA = SkAlpha255To256(srcA);
        if (width >= 4) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            while (((size_t)dst & 0x0F) != 0) {
                *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
                mask++;
                dst++;
                width--;
            }

            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Set alpha to 0xFF and replicate source eight times in LSX register.
            unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
            __m128i src_lsx = __lsx_vreplgr2vr_w(skpackargb32);
            // Interleave with zeros to get two sets of eight 16-bit values.
            src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
            // Set srcA_lsx to contain eight copies of srcA, padded with zero.
            // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
            __m128i srcA_lsx = __lsx_vreplgr2vr_h(srcA);

            while (width >= 4) {
                // Load eight destination pixels into dst_lsx.
                __m128i dst_lsx = __lsx_vld(d, 0);
                // Load four 16-bit masks into lower half of mask_lsx.
                __m128i mask_lsx = __lsx_vldrepl_d((void *)mask, 0);
                mask_lsx =  __lsx_vilvl_d(v_zero, mask_lsx);

                int pack_cmp = __lsx_bz_v(mask_lsx);
                // if mask pixels are not all zero, we will blend the dst pixels
                if (pack_cmp != 1) {
                    // Unpack 4 16bit mask pixels to
                    // mask_lsx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                    //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                    mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);

                    // Process 8 32bit dst pixels
                    __m128i result = blend_lcd16_lsx(src_lsx, dst_lsx, mask_lsx, srcA_lsx);
                    __lsx_vst(result, d, 0);
                }

                d++;
                mask += 4;
                width -= 4;
            }

            dst = reinterpret_cast<SkPMColor*>(d);
        }

        while (width > 0) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }
    }

    void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                               SkColor src, int width, SkPMColor opaqueDst) {
        if (width <= 0) {
            return;
        }

        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);
        __m128i v_zero = __lsx_vldi(0);

        if (width >= 4) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            while (((size_t)dst & 0x0F) != 0) {
                *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
                mask++;
                dst++;
                width--;
            }

            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Set alpha to 0xFF and replicate source four times in LSX register.
            unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
            __m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32);
            // Set srcA_lsx to contain eight copies of srcA, padded with zero.
            // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
            src_lsx = __lsx_vilvl_b(v_zero, src_lsx);

            while (width >= 4) {
                // Load four destination pixels into dst_lsx.
                __m128i dst_lsx = __lsx_vld(d, 0);
                // Load four 16-bit masks into lower half of mask_lsx.
                __m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
                mask_lsx =  __lsx_vilvl_d(v_zero, mask_lsx);

                int pack_cmp = __lsx_bz_v(mask_lsx);
                // if mask pixels are not all zero, we will blend the dst pixels
                if (pack_cmp != 1) {
                    // Unpack 4 16bit mask pixels to
                    mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);

                    // Process 8 32bit dst pixels
                    __m128i result = blend_lcd16_opaque_lsx(src_lsx, dst_lsx, mask_lsx);
                    __lsx_vst(result, d, 0);
                }
                d++;
                mask += 4;
                width -= 4;
            }

            dst = reinterpret_cast<SkPMColor*>(d);
        }

        while (width > 0) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }
    }

#else

    static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
                                      SkColor src, int width, SkPMColor) {
        int srcA = SkColorGetA(src);
        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);

        srcA = SkAlpha255To256(srcA);

        for (int i = 0; i < width; i++) {
            dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
        }
    }

    static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                                             SkColor src, int width,
                                             SkPMColor opaqueDst) {
        int srcR = SkColorGetR(src);
        int srcG = SkColorGetG(src);
        int srcB = SkColorGetB(src);

        for (int i = 0; i < width; i++) {
            dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
        }
    }

#endif

static bool blit_color(const SkPixmap& device,
                       const SkMask& mask,
                       const SkIRect& clip,
                       SkColor color) {
    int x = clip.fLeft,
        y = clip.fTop;

    if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
        SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
                                 (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
                                 color, clip.width(), clip.height());
        return true;
    }

    if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
        auto dstRow  = device.writable_addr32(x,y);
        auto maskRow = (const uint16_t*)mask.getAddr(x,y);

        auto blit_row = blit_row_lcd16;
        SkPMColor opaqueDst = 0;  // ignored unless opaque

        if (0xff == SkColorGetA(color)) {
            blit_row  = blit_row_lcd16_opaque;
            opaqueDst = SkPreMultiplyColor(color);
        }

        for (int height = clip.height(); height --> 0; ) {
            blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);

            dstRow  = (SkPMColor*)     ((      char*) dstRow + device.rowBytes());
            maskRow = (const uint16_t*)((const char*)maskRow +  mask.fRowBytes);
        }
        return true;
    }

    return false;
}

///////////////////////////////////////////////////////////////////////////////

static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
                            const SkIRect& clip, SkPMColor srcColor) {
    U8CPU alpha = SkGetPackedA32(srcColor);
    unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
    if (alpha != 255) {
        flags |= SkBlitRow::kGlobalAlpha_Flag32;
    }
    SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);

    int x = clip.fLeft;
    int y = clip.fTop;
    int width = clip.width();
    int height = clip.height();

    SkPMColor* dstRow = device.writable_addr32(x, y);
    const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));

    do {
        proc(dstRow, srcRow, width, alpha);
        dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
        srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
    } while (--height != 0);
}

//////////////////////////////////////////////////////////////////////////////////////

SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
        : INHERITED(device) {
    SkColor color = paint.getColor();
    fColor = color;

    fSrcA = SkColorGetA(color);
    unsigned scale = SkAlpha255To256(fSrcA);
    fSrcR = SkAlphaMul(SkColorGetR(color), scale);
    fSrcG = SkAlphaMul(SkColorGetG(color), scale);
    fSrcB = SkAlphaMul(SkColorGetB(color), scale);

    fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
}

#if defined _WIN32  // disable warning : local variable used without having been initialized
#pragma warning ( push )
#pragma warning ( disable : 4701 )
#endif

void SkARGB32_Blitter::blitH(int x, int y, int width) {
    SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());

    uint32_t* device = fDevice.writable_addr32(x, y);
    SkBlitRow::Color32(device, width, fPMColor);
}

void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
                                 const int16_t runs[]) {
    if (fSrcA == 0) {
        return;
    }

    uint32_t    color = fPMColor;
    uint32_t*   device = fDevice.writable_addr32(x, y);
    unsigned    opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case

    for (;;) {
        int count = runs[0];
        SkASSERT(count >= 0);
        if (count <= 0) {
            return;
        }
        unsigned aa = antialias[0];
        if (aa) {
            if ((opaqueMask & aa) == 255) {
                SkOpts::memset32(device, color, count);
            } else {
                uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
                SkBlitRow::Color32(device, count, sc);
            }
        }
        runs += count;
        antialias += count;
        device += count;
    }
}

void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
    uint32_t* device = fDevice.writable_addr32(x, y);
    SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)

    device[0] = SkBlendARGB32(fPMColor, device[0], a0);
    device[1] = SkBlendARGB32(fPMColor, device[1], a1);
}

void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
    uint32_t* device = fDevice.writable_addr32(x, y);
    SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)

    device[0] = SkBlendARGB32(fPMColor, device[0], a0);
    device = (uint32_t*)((char*)device + fDevice.rowBytes());
    device[0] = SkBlendARGB32(fPMColor, device[0], a1);
}

//////////////////////////////////////////////////////////////////////////////////////

#define solid_8_pixels(mask, dst, color)    \
    do {                                    \
        if (mask & 0x80) dst[0] = color;    \
        if (mask & 0x40) dst[1] = color;    \
        if (mask & 0x20) dst[2] = color;    \
        if (mask & 0x10) dst[3] = color;    \
        if (mask & 0x08) dst[4] = color;    \
        if (mask & 0x04) dst[5] = color;    \
        if (mask & 0x02) dst[6] = color;    \
        if (mask & 0x01) dst[7] = color;    \
    } while (0)

#define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
#define SK_BLITBWMASK_ARGS                  , SkPMColor color
#define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"

#define blend_8_pixels(mask, dst, sc, dst_scale)                            \
    do {                                                                    \
        if (mask & 0x80) { dst[0] = sc + SkAlphaMulQ(dst[0], dst_scale); }  \
        if (mask & 0x40) { dst[1] = sc + SkAlphaMulQ(dst[1], dst_scale); }  \
        if (mask & 0x20) { dst[2] = sc + SkAlphaMulQ(dst[2], dst_scale); }  \
        if (mask & 0x10) { dst[3] = sc + SkAlphaMulQ(dst[3], dst_scale); }  \
        if (mask & 0x08) { dst[4] = sc + SkAlphaMulQ(dst[4], dst_scale); }  \
        if (mask & 0x04) { dst[5] = sc + SkAlphaMulQ(dst[5], dst_scale); }  \
        if (mask & 0x02) { dst[6] = sc + SkAlphaMulQ(dst[6], dst_scale); }  \
        if (mask & 0x01) { dst[7] = sc + SkAlphaMulQ(dst[7], dst_scale); }  \
    } while (0)

#define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
#define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
#define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"

void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
    SkASSERT(mask.fBounds.contains(clip));
    SkASSERT(fSrcA != 0xFF);

    if (fSrcA == 0) {
        return;
    }

    if (blit_color(fDevice, mask, clip, fColor)) {
        return;
    }

    switch (mask.fFormat) {
        case SkMask::kBW_Format:
            SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
            break;
        case SkMask::kARGB32_Format:
            SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
            break;
        default:
            SK_ABORT("Mask format not handled.");
    }
}

void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
                                       const SkIRect& clip) {
    SkASSERT(mask.fBounds.contains(clip));

    if (blit_color(fDevice, mask, clip, fColor)) {
        return;
    }

    switch (mask.fFormat) {
        case SkMask::kBW_Format:
            SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
            break;
        case SkMask::kARGB32_Format:
            SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
            break;
        default:
            SK_ABORT("Mask format not handled.");
    }
}

void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
    uint32_t* device = fDevice.writable_addr32(x, y);
    SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)

    device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
    device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
}

void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
    uint32_t* device = fDevice.writable_addr32(x, y);
    SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)

    device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
    device = (uint32_t*)((char*)device + fDevice.rowBytes());
    device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
}

///////////////////////////////////////////////////////////////////////////////

void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
    if (alpha == 0 || fSrcA == 0) {
        return;
    }

    uint32_t* device = fDevice.writable_addr32(x, y);
    uint32_t  color = fPMColor;

    if (alpha != 255) {
        color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
    }

    unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
    size_t rowBytes = fDevice.rowBytes();
    while (--height >= 0) {
        device[0] = color + SkAlphaMulQ(device[0], dst_scale);
        device = (uint32_t*)((char*)device + rowBytes);
    }
}

void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
    SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());

    if (fSrcA == 0) {
        return;
    }

    uint32_t*   device = fDevice.writable_addr32(x, y);
    uint32_t    color = fPMColor;
    size_t      rowBytes = fDevice.rowBytes();

    if (SkGetPackedA32(fPMColor) == 0xFF) {
        SkOpts::rect_memset32(device, color, width, rowBytes, height);
    } else {
        while (height --> 0) {
            SkBlitRow::Color32(device, width, color);
            device = (uint32_t*)((char*)device + rowBytes);
        }
    }
}

#if defined _WIN32
#pragma warning ( pop )
#endif

///////////////////////////////////////////////////////////////////////

void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
                                       const int16_t runs[]) {
    uint32_t*   device = fDevice.writable_addr32(x, y);
    SkPMColor   black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);

    for (;;) {
        int count = runs[0];
        SkASSERT(count >= 0);
        if (count <= 0) {
            return;
        }
        unsigned aa = antialias[0];
        if (aa) {
            if (aa == 255) {
                SkOpts::memset32(device, black, count);
            } else {
                SkPMColor src = aa << SK_A32_SHIFT;
                unsigned dst_scale = 256 - aa;
                int n = count;
                do {
                    --n;
                    device[n] = src + SkAlphaMulQ(device[n], dst_scale);
                } while (n > 0);
            }
        }
        runs += count;
        antialias += count;
        device += count;
    }
}

void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
    uint32_t* device = fDevice.writable_addr32(x, y);
    SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)

    device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
    device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
}

void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
    uint32_t* device = fDevice.writable_addr32(x, y);
    SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)

    device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
    device = (uint32_t*)((char*)device + fDevice.rowBytes());
    device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
}

///////////////////////////////////////////////////////////////////////////////

SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
        const SkPaint& paint, SkShaderBase::Context* shaderContext)
    : INHERITED(device, paint, shaderContext)
{
    fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));

    SkASSERT(paint.isSrcOver());

    int flags = 0;
    if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
        flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
    }
    // we call this on the output from the shader
    fProc32 = SkBlitRow::Factory32(flags);
    // we call this on the output from the shader + alpha from the aa buffer
    fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);

    fShadeDirectlyIntoDevice =
            SkToBool(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
}

SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
    sk_free(fBuffer);
}

void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
    SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());

    uint32_t* device = fDevice.writable_addr32(x, y);

    if (fShadeDirectlyIntoDevice) {
        fShaderContext->shadeSpan(x, y, device, width);
    } else {
        SkPMColor*  span = fBuffer;
        fShaderContext->shadeSpan(x, y, span, width);
        fProc32(device, span, width, 255);
    }
}

void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
    SkASSERT(x >= 0 && y >= 0 &&
             x + width <= fDevice.width() && y + height <= fDevice.height());

    uint32_t*  device = fDevice.writable_addr32(x, y);
    size_t     deviceRB = fDevice.rowBytes();
    auto*      shaderContext = fShaderContext;
    SkPMColor* span = fBuffer;

    if (fShadeDirectlyIntoDevice) {
        do {
            shaderContext->shadeSpan(x, y, device, width);
            y += 1;
            device = (uint32_t*)((char*)device + deviceRB);
        } while (--height > 0);
    } else {
        SkBlitRow::Proc32 proc = fProc32;
        do {
            shaderContext->shadeSpan(x, y, span, width);
            proc(device, span, width, 255);
            y += 1;
            device = (uint32_t*)((char*)device + deviceRB);
        } while (--height > 0);
    }
}

void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
                                        const int16_t runs[]) {
    SkPMColor* span = fBuffer;
    uint32_t*  device = fDevice.writable_addr32(x, y);
    auto*      shaderContext = fShaderContext;

    if (fShadeDirectlyIntoDevice || (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
        for (;;) {
            int count = *runs;
            if (count <= 0) {
                break;
            }
            int aa = *antialias;
            if (aa) {
                if (aa == 255) {
                    // cool, have the shader draw right into the device
                    shaderContext->shadeSpan(x, y, device, count);
                } else {
                    shaderContext->shadeSpan(x, y, span, count);
                    fProc32Blend(device, span, count, aa);
                }
            }
            device += count;
            runs += count;
            antialias += count;
            x += count;
        }
    } else {
        for (;;) {
            int count = *runs;
            if (count <= 0) {
                break;
            }
            int aa = *antialias;
            if (aa) {
                shaderContext->shadeSpan(x, y, span, count);
                if (aa == 255) {
                    fProc32(device, span, count, 255);
                } else {
                    fProc32Blend(device, span, count, aa);
                }
            }
            device += count;
            runs += count;
            antialias += count;
            x += count;
        }
    }
}

using U32  = skvx::Vec< 4, uint32_t>;
using U8x4 = skvx::Vec<16, uint8_t>;
using U8   = skvx::Vec< 4, uint8_t>;

static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
                  U8x4 (*kernel)(U8x4,U8x4,U8x4)) {

    auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
        U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
        return sk_bit_cast<U32>(kernel(sk_bit_cast<U8x4>(dst),
                                       sk_bit_cast<U8x4>(src),
                                       cov_splat));
    };
    while (n >= 4) {
        apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
        dst += 4;
        src += 4;
        cov += 4;
        n   -= 4;
    }
    while (n --> 0) {
        *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
        dst++;
        src++;
        cov++;
    }
}

static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
    auto cov = (const uint8_t*)mask;
    drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
        U8x4 s_aa  = skvx::approx_scale(s, c),
             alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
        return s_aa + skvx::approx_scale(d, 255 - alpha);
    });
}

static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
    auto cov = (const uint8_t*)mask;
    drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
        return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>(  c  )
                           + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
    });
}

static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
    auto src_alpha_blend = [](int s, int d, int sa, int m) {
        return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
    };

    auto upscale_31_to_255 = [](int v) {
        return (v << 3) | (v >> 2);
    };

    auto mask = (const uint16_t*)vmask;
    for (int i = 0; i < n; ++i) {
        uint16_t m = mask[i];
        if (0 == m) {
            continue;
        }

        SkPMColor s = src[i];
        SkPMColor d = dst[i];

        int srcA = SkGetPackedA32(s);
        int srcR = SkGetPackedR32(s);
        int srcG = SkGetPackedG32(s);
        int srcB = SkGetPackedB32(s);

        srcA += srcA >> 7;

        // We're ignoring the least significant bit of the green coverage channel here.
        int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
        int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
        int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);

        // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
        maskR = upscale_31_to_255(maskR);
        maskG = upscale_31_to_255(maskG);
        maskB = upscale_31_to_255(maskB);

        // This LCD blit routine only works if the destination is opaque.
        dst[i] = SkPackARGB32(0xFF,
                              src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
                              src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
                              src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
    }
}

static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
    auto mask = (const uint16_t*)vmask;

    for (int i = 0; i < n; ++i) {
        uint16_t m = mask[i];
        if (0 == m) {
            continue;
        }

        SkPMColor s = src[i];
        SkPMColor d = dst[i];

        int srcR = SkGetPackedR32(s);
        int srcG = SkGetPackedG32(s);
        int srcB = SkGetPackedB32(s);

        // We're ignoring the least significant bit of the green coverage channel here.
        int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
        int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
        int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);

        // Now upscale them to 0..32, so we can use blend_32.
        maskR = upscale_31_to_32(maskR);
        maskG = upscale_31_to_32(maskG);
        maskB = upscale_31_to_32(maskB);

        // This LCD blit routine only works if the destination is opaque.
        dst[i] = SkPackARGB32(0xFF,
                              blend_32(srcR, SkGetPackedR32(d), maskR),
                              blend_32(srcG, SkGetPackedG32(d), maskG),
                              blend_32(srcB, SkGetPackedB32(d), maskB));
    }
}

void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
    SkASSERT(mask.fBounds.contains(clip));

    void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;

    bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);

    if (mask.fFormat == SkMask::kA8_Format && opaque) {
        blend_row = blend_row_A8_opaque;
    } else if (mask.fFormat == SkMask::kA8_Format) {
        blend_row = blend_row_A8;
    } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
        blend_row = blend_row_LCD16_opaque;
    } else if (mask.fFormat == SkMask::kLCD16_Format) {
        blend_row = blend_row_lcd16;
    } else {
        this->INHERITED::blitMask(mask, clip);
        return;
    }

    const int x = clip.fLeft;
    const int width = clip.width();
    int y = clip.fTop;
    int height = clip.height();

    char* dstRow = (char*)fDevice.writable_addr32(x, y);
    const size_t dstRB = fDevice.rowBytes();
    const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
    const size_t maskRB = mask.fRowBytes;

    SkPMColor* span = fBuffer;
    SkASSERT(blend_row);
    do {
        fShaderContext->shadeSpan(x, y, span, width);
        blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
        dstRow += dstRB;
        maskRow += maskRB;
        y += 1;
    } while (--height > 0);
}

void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
    SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());

    uint32_t* device = fDevice.writable_addr32(x, y);
    size_t    deviceRB = fDevice.rowBytes();

    if (fShadeDirectlyIntoDevice) {
        if (255 == alpha) {
            do {
                fShaderContext->shadeSpan(x, y, device, 1);
                y += 1;
                device = (uint32_t*)((char*)device + deviceRB);
            } while (--height > 0);
        } else {
            do {
                SkPMColor c;
                fShaderContext->shadeSpan(x, y, &c, 1);
                *device = SkFourByteInterp(c, *device, alpha);
                y += 1;
                device = (uint32_t*)((char*)device + deviceRB);
            } while (--height > 0);
        }
    } else {
        SkPMColor* span = fBuffer;
        SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
        do {
            fShaderContext->shadeSpan(x, y, span, 1);
            proc(device, span, 1, alpha);
            y += 1;
            device = (uint32_t*)((char*)device + deviceRB);
        } while (--height > 0);
    }
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.35 Sekunden (vorverarbeitet am 2026-04-26) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.