/*
 * Copyright 2006 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
staticinline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
SkPMColor dst, uint16_t mask) { if (mask == 0) { return dst;
}
/* We want all of these in 5bits, hence the shifts in case one of them * (green) is 6bits.
*/ int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5); int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5); int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
// Now upscale them to 0..32, so we can use blend32
maskR = upscale_31_to_32(maskR);
maskG = upscale_31_to_32(maskG);
maskB = upscale_31_to_32(maskB);
// srcA has been upscaled to 256 before passed into this function
maskR = maskR * srcA >> 8;
maskG = maskG * srcA >> 8;
maskB = maskB * srcA >> 8;
int dstA = SkGetPackedA32(dst); int dstR = SkGetPackedR32(dst); int dstG = SkGetPackedG32(dst); int dstB = SkGetPackedB32(dst);
// Subtract 1 from srcA to bring it back to [0-255] to compare against dstA, alpha needs to // use either the min or the max of the LCD coverages. See https:/skbug.com/40037823 int maskA = (srcA-1) < dstA ? std::min(maskR, std::min(maskG, maskB))
: std::max(maskR, std::max(maskG, maskB));
staticinline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
SkPMColor dst, uint16_t mask,
SkPMColor opaqueDst) { if (mask == 0) { return dst;
}
if (0xFFFF == mask) { return opaqueDst;
}
/* We want all of these in 5bits, hence the shifts in case one of them * (green) is 6bits.
*/ int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5); int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5); int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
// Now upscale them to 0..32, so we can use blend32
maskR = upscale_31_to_32(maskR);
maskG = upscale_31_to_32(maskG);
maskB = upscale_31_to_32(maskB);
int dstA = SkGetPackedA32(dst); int dstR = SkGetPackedR32(dst); int dstG = SkGetPackedG32(dst); int dstB = SkGetPackedB32(dst);
// Opaque src alpha always uses the max of the LCD coverages. int maskA = std::max(maskR, std::max(maskG, maskB));
// LCD blitting is only supported if the dst is known/required // to be opaque return SkPackARGB32(blend_32(0xFF, dstA, maskA),
blend_32(srcR, dstR, maskR),
blend_32(srcG, dstG, maskG),
blend_32(srcB, dstB, maskB));
}
// TODO: rewrite at least the SSE code here. It's miserable.

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) { // In the following comments, the components of src, dst and mask are // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked // by an R, G, B, or A suffix. Components of one of the four pixels that // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for // example is the blue channel of the second destination pixel. Memory // layout is shown for an ARGB byte order in a color value.
// src and srcA store 8-bit values interleaved with zeros. // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, // srcA, 0, srcA, 0, srcA, 0, srcA, 0) // mask stores 16-bit values (compressed three channels) interleaved with zeros. // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
_mm_set1_epi32(0x1F << SK_R32_SHIFT));
// a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
__m128i aMin = _mm_min_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
_mm_min_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
_mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
__m128i aMax = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
_mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
_mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT))); // srcA has been biased to [0-256], so compare srcA against (dstA+1)
__m128i a = _mm_cmplt_epi32(srcA,
_mm_and_si128(
_mm_add_epi32(dst, _mm_set1_epi32(1 << SK_A32_SHIFT)),
_mm_set1_epi32(SK_A32_MASK))); // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));
// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an // 8-bit position // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B, // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
// Interleave R,G,B into the lower byte of word. // i.e. split the sixteen 8-bit values from mask into two sets of eight // 16-bit values, padded by zero.
__m128i maskLo, maskHi; // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
// Upscale from 0..31 to 0..32 // (allows to replace division by left-shift further down) // Left-shift each component by 4 and add the result back to that component, // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
// Multiply each component of maskLo and maskHi by srcA
maskLo = _mm_mullo_epi16(maskLo, srcA);
maskHi = _mm_mullo_epi16(maskHi, srcA);
// Left shift mask components by 8 (divide by 256)
maskLo = _mm_srli_epi16(maskLo, 8);
maskHi = _mm_srli_epi16(maskHi, 8);
// Add two pixels into result. // result = dst + ((src - dst) * mask >> 5)
__m128i resultLo = _mm_add_epi16(dstLo, maskLo);
__m128i resultHi = _mm_add_epi16(dstHi, maskHi);
// Pack into 4 32bit dst pixels. // resultLo and resultHi contain eight 16-bit components (two pixels) each. // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), // clamping to 255 if necessary. return _mm_packus_epi16(resultLo, resultHi);
}
static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) { // In the following comments, the components of src, dst and mask are // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked // by an R, G, B, or A suffix. Components of one of the four pixels that // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for // example is the blue channel of the second destination pixel. Memory // layout is shown for an ARGB byte order in a color value.
// src and srcA store 8-bit values interleaved with zeros. // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) // mask stores 16-bit values (shown as high and low bytes) interleaved with // zeros // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
_mm_set1_epi32(0x1F << SK_R32_SHIFT));
// a = max(r, g, b) since opaque src alpha uses max of LCD coverages
__m128i a = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
_mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
_mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an // 8-bit position // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B, // m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
// Interleave R,G,B into the lower byte of word. // i.e. split the sixteen 8-bit values from mask into two sets of eight // 16-bit values, padded by zero.
__m128i maskLo, maskHi; // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
// Upscale from 0..31 to 0..32 // (allows to replace division by left-shift further down) // Left-shift each component by 4 and add the result back to that component, // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
// Add two pixels into result. // result = dst + ((src - dst) * mask >> 5)
__m128i resultLo = _mm_add_epi16(dstLo, maskLo);
__m128i resultHi = _mm_add_epi16(dstHi, maskHi);
// Merge into one SSE regsiter with sixteen 8-bit values (four pixels), // clamping to 255 if necessary. return _mm_packus_epi16(resultLo, resultHi);
}
// Blit a row of LCD16 coverage values with a (possibly translucent) src color
// over 'dst', four pixels at a time with SSE2, scalar for the ragged edges.
// NOTE(review): the extracted text had lost the SkAlpha255To256 bias, the loop
// increments, and the scalar head/tail loops; restored here — blend_lcd16()
// documents that srcA must arrive biased to [0, 256]. Confirm against the
// original file.
void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);
    // blend_lcd16() expects srcA biased to [0, 256].
    srcA = SkAlpha255To256(srcA);

    if (width >= 4) {
        // Scalar-blend until dst is 16-byte aligned so we can use aligned loads.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        // Set srcA_sse to contain eight copies of srcA, padded with zero.
        __m128i srcA_sse = _mm_set1_epi16(srcA);

        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse.
            __m128i mask_sse = _mm_loadl_epi64((const __m128i*)mask);

            // Check whether masks are equal to 0 and get the highest bit
            // of each byte of result, if masks are all zero, we will get
            // pack_cmp to 0xFFFF
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                                             _mm_setzero_si128()));

            // if mask pixels are not all zero, we will blend the dst pixels
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Leftovers
    while (width > 0) {
        *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
__m128i *d = reinterpret_cast<__m128i*>(dst); // Set alpha to 0xFF and replicate source four times in SSE register.
__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); // Set srcA_sse to contain eight copies of srcA, padded with zero. // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); while (width >= 4) { // Load four destination pixels into dst_sse.
__m128i dst_sse = _mm_load_si128(d); // Load four 16-bit masks into lower half of mask_sse.
__m128i mask_sse = _mm_loadl_epi64((const __m128i*)mask);
// Check whether masks are equal to 0 and get the highest bit // of each byte of result, if masks are all zero, we will get // pack_cmp to 0xFFFF int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
_mm_setzero_si128()));
// if mask pixels are not all zero, we will blend the dst pixels if (pack_cmp != 0xFFFF) { // Unpack 4 16bit mask pixels to // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
mask_sse = _mm_unpacklo_epi16(mask_sse,
_mm_setzero_si128());
// Process 4 32bit dst pixels
__m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
_mm_store_si128(d, result);
}
// Leftovers for (int i = 0; i < width; i++) {
dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
}
}
// Portable (scalar) row blit: applies the LCD16 coverage in 'src' to every
// pixel of 'dst' using the single paint color 'color'.
// NOTE(review): the extracted text had several NEON statements (vdupq_n_u16,
// vbslq_u16 on undeclared vmaskA/vmaskR/vdst) fused into this scalar function
// — clearly fragments of a separate NEON path lost in extraction. They are
// removed here; the scalar loop below is self-contained. Confirm against the
// original file.
void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                    SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // blend_lcd16() expects the alpha biased to [0, 256].
    colA = SkAlpha255To256(colA);

    for (int i = 0; i < width; i++) {
        dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
// Blend eight pixels at once with per-channel LCD coverage (LoongArch LASX).
// NOTE(review): the extracted text had lost the r/g/b channel extraction, the
// mask/dst unpack, and the (src - dst) * mask >> 5 step named by its own
// comments; restored here by symmetry with the LSX version below — confirm
// against the original file (including the xvmskltz-based compare, preserved
// from the extracted text).
static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix; pixels processed in parallel are marked
    // with 0..7. Memory layout is shown for an ARGB byte order.
    __m256i v_zero = __lasx_xvldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
    __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
    __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
    __m256i aMin = __lasx_xvmin_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                  __lasx_xvmin_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                                 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    __m256i aMax = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                  __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                                 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    // srcA has been biased to [0-256], so compare srcA against (dstA+1)
    __m256i a = __lasx_xvmskltz_w(srcA -
                                  __lasx_xvand_v(
                                          __lasx_xvadd_w(dst,
                                                         __lasx_xvreplgr2vr_w(1 << SK_A32_SHIFT)),
                                          __lasx_xvreplgr2vr_w(SK_A32_MASK)));
    // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
    a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));

    // Pack the 8 16bit mask pixels into 8 32bit pixels: each component
    // (m0R, m0G, etc.) is a 5-bit value aligned to an 8-bit position.
    mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

    // Split the 8-bit mask components into 16-bit lanes, padded by zero.
    __m256i maskLo = __lasx_xvilvl_b(v_zero, mask);
    __m256i maskHi = __lasx_xvilvh_b(v_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
    maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = __lasx_xvmul_h(maskLo, srcA);
    maskHi = __lasx_xvmul_h(maskHi, srcA);

    // Left shift mask components by 8 (divide by 256)
    maskLo = __lasx_xvsrli_h(maskLo, 8);
    maskHi = __lasx_xvsrli_h(maskHi, 8);

    // Interleave the dst pixels the same way as the mask.
    __m256i dstLo = __lasx_xvilvl_b(v_zero, dst);
    __m256i dstHi = __lasx_xvilvh_b(v_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
    maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

    // mask >>= 5 (mask components are in 0..32, i.e. fixed-point /32)
    maskLo = __lasx_xvsrai_h(maskLo, 5);
    maskHi = __lasx_xvsrai_h(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
    __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

    // Pack into 8 32bit dst pixels, clamping to 255 if necessary.
    __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
    __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
    return __lasx_xvpickev_b(tmph, tmpl);
}
// Blend eight pixels at once with per-channel LCD coverage, opaque src (LASX).
// NOTE(review): the extracted text was truncated after the upscale step and
// was missing the r/g/b extraction; reconstructed here by symmetry with
// blend_lcd16_lasx() and blend_lcd16_opaque_lsx() — confirm against the
// original file.
static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
    // Components abbreviated as (s)rc, (d)st, (m)ask; pixels processed in
    // parallel are marked 0..7. Layout shown for an ARGB byte order.
    __m256i v_zero = __lasx_xvldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
    __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
    __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
    __m256i a = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                               __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                              __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));

    // Pack the 8 16bit mask pixels into 8 32bit pixels: each component is a
    // 5-bit value aligned to an 8-bit position.
    mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

    // Split the 8-bit mask components into 16-bit lanes, padded by zero.
    __m256i maskLo = __lasx_xvilvl_b(v_zero, mask);
    __m256i maskHi = __lasx_xvilvh_b(v_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
    maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

    // Interleave the dst pixels the same way as the mask.
    __m256i dstLo = __lasx_xvilvl_b(v_zero, dst);
    __m256i dstHi = __lasx_xvilvh_b(v_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
    maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

    // mask >>= 5 (mask components are in 0..32, i.e. fixed-point /32)
    maskLo = __lasx_xvsrai_h(maskLo, 5);
    maskHi = __lasx_xvsrai_h(maskHi, 5);

    // result = dst + ((src - dst) * mask >> 5)
    __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
    __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

    // Pack into 8 32bit dst pixels, clamping to 255 if necessary.
    __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
    __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
    return __lasx_xvpickev_b(tmph, tmpl);
}
// The following (left) shifts cause the top 5 bits of the mask components to // line up with the corresponding components in an SkPMColor. // Note that the mask's RGB16 order may differ from the SkPMColor order. #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
// Blend four pixels at once with per-channel LCD coverage (LoongArch LSX).
// NOTE(review): the extracted text had lost the g/b channel extraction, the
// dst unpack, and the (src - dst) * mask >> 5 step named by its own comments;
// restored here by symmetry with blend_lcd16_sse2() — confirm against the
// original file.
static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    __m128i v_zero = __lsx_vldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
    __m128i aMin = __lsx_vmin_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                __lsx_vmin_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                             __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    __m128i aMax = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                             __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    // srcA has been biased to [0-256], so compare srcA against (dstA+1)
    __m128i a = __lsx_vmskltz_w(srcA -
                                __lsx_vand_v(
                                        __lsx_vadd_w(dst,
                                                     __lsx_vreplgr2vr_w(1 << SK_A32_SHIFT)),
                                        __lsx_vreplgr2vr_w(SK_A32_MASK)));
    // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
    a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
    mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = __lsx_vilvl_b(v_zero, mask);
    // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = __lsx_vilvh_b(v_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
    maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = __lsx_vmul_h(maskLo, srcA);
    maskHi = __lsx_vmul_h(maskHi, srcA);

    // Left shift mask components by 8 (divide by 256)
    maskLo = __lsx_vsrli_h(maskLo, 8);
    maskHi = __lsx_vsrli_h(maskHi, 8);

    // Interleave the dst pixels the same way as the mask.
    __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
    __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
    maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

    // mask >>= 5 (mask components are in 0..32, i.e. fixed-point /32)
    maskLo = __lsx_vsrai_h(maskLo, 5);
    maskHi = __lsx_vsrai_h(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
    __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

    // Pack into 4 32bit dst pixels, clamping to 255 if necessary.
    __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
    __m128i tmph = __lsx_vsat_hu(resultHi, 7);
    return __lsx_vpickev_b(tmph, tmpl);
}
// Blend four pixels at once with per-channel LCD coverage, opaque src (LSX).
// NOTE(review): the extracted text was truncated after the upscale step and
// was missing the g/b extraction; reconstructed here by symmetry with
// blend_lcd16_opaque_sse2() — confirm against the original file.
static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix; pixels processed in parallel are marked
    // 0..3. Memory layout is shown for an ARGB byte order in a color value.

    // src stores 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved
    // with zeros
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    __m128i v_zero = __lsx_vldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
    __m128i a = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                             __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                          __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
    mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = __lsx_vilvl_b(v_zero, mask);
    // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = __lsx_vilvh_b(v_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
    maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

    // Interleave the dst pixels the same way as the mask.
    __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
    __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
    maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

    // mask >>= 5 (mask components are in 0..32, i.e. fixed-point /32)
    maskLo = __lsx_vsrai_h(maskLo, 5);
    maskHi = __lsx_vsrai_h(maskHi, 5);

    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
    __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

    // Pack into 4 32bit dst pixels, clamping to 255 if necessary.
    __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
    __m128i tmph = __lsx_vsat_hu(resultHi, 7);
    return __lsx_vpickev_b(tmph, tmpl);
}
__m128i *d = reinterpret_cast<__m128i*>(dst); // Set alpha to 0xFF and replicate source four times in LSX register. unsignedint sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
__m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32); // Set srcA_lsx to contain eight copies of srcA, padded with zero. // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
while (width >= 4) { // Load four destination pixels into dst_lsx.
__m128i dst_lsx = __lsx_vld(d, 0); // Load four 16-bit masks into lower half of mask_lsx.
__m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);
int pack_cmp = __lsx_bz_v(mask_lsx); // if mask pixels are not all zero, we will blend the dst pixels if (pack_cmp != 1) { // Unpack 4 16bit mask pixels to
mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
// NOTE(review): the following trailing text is extraction residue (a German
// website disclaimer), not part of this source file. Kept here as a comment,
// translated, so the file stays compilable:
// "The information on this website has been carefully compiled to the best of
//  our knowledge. However, no guarantee is given as to the completeness,
//  correctness, or quality of the information provided.
//  Note: the colored syntax display and the measurement are still experimental."