// Copyright 2015 Google Inc. All Rights Reserved. // // Use of this source code is governed by a BSD-style license // that can be found in the COPYING file in the root of the source // tree. An additional intellectual property rights grant can be found // in the file PATENTS. All contributing project authors may // be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // // SSE2 variant of methods for lossless encoder // // Author: Skal (pascal.massimino@gmail.com)
// Subtracts the green channel from the red and blue channels of each pixel,
// in place. Pixels are packed 32-bit ARGB words.
//   argb_data:  pixels to transform (read and overwritten).
//   num_pixels: number of pixels in 'argb_data'.
// Four pixels are handled per SSE2 iteration; any remaining tail is delegated
// to VP8LSubtractGreenFromBlueAndRed_C().
static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
                                             int num_pixels) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);                      // 0 a 0 g
    // Duplicate the green 16-bit lane over the alpha lane of every pixel.
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    // Byte-wise wrapping subtraction: alpha and green get 0 subtracted.
    const __m128i out = _mm_sub_epi8(in, C);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
  }
}
//------------------------------------------------------------------------------ // Color Transform
// Applies the color transform described by 'm' to 'num_pixels' ARGB pixels,
// in place: a green-derived delta is subtracted from red, and green- and
// red-derived deltas are subtracted from blue. Alpha and green are untouched.
//   m:          multipliers (green_to_red_, green_to_blue_, red_to_blue_).
//   argb_data:  pixels to transform (read and overwritten).
//   num_pixels: number of pixels in 'argb_data'.
// Four pixels are handled per SSE2 iteration; any remaining tail is delegated
// to VP8LTransformColor_C().
// NOTE(review): MK_CST_16 / CST_5b are defined outside this view; they are
// presumed to pack the multipliers into 16-bit lanes suited to
// _mm_mulhi_epi16 -- confirm against their definitions.
static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m,
                                uint32_t* WEBP_RESTRICT argb_data,
                                int num_pixels) {
  const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
                                     CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);       // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);                 // a 0 g 0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);  // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);         // r 0   b 0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);  // x db2 0 0
    const __m128i G = _mm_srli_epi32(F, 16);         // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);            // x dr  x db
    const __m128i I = _mm_and_si128(H, mask_rb);     // 0 dr  0 db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
  }
}
//------------------------------------------------------------------------------ #define SPAN 8 staticvoid CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb, int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue,
uint32_t histo[]) { const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0); const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue)); const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask int y; for (y = 0; y < tile_height; ++y) { const uint32_t* const src = argb + y * stride; int i, x; for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN]; const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0 const __m128i A1 = _mm_slli_epi16(in1, 8); const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 const __m128i B1 = _mm_and_si128(in1, mask_g); const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0 const __m128i C1 = _mm_mulhi_epi16(A1, mults_r); const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db const __m128i D1 = _mm_mulhi_epi16(B1, mults_g); const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b' const __m128i E1 = _mm_sub_epi8(in1, D1); const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db const __m128i F1 = _mm_srli_epi32(C1, 16); const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b' const __m128i G1 = _mm_sub_epi8(E1, F1); const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b const __m128i H1 = _mm_and_si128(G1, mask_b); const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
_mm_storeu_si128((__m128i*)values, I); for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{ constint left_over = tile_width & (SPAN - 1); if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
// Accumulates into 'histo' the histogram of the red channel after applying
// the green->red color transform to a tile.
//   argb:         top-left pixel of the tile.
//   stride:       distance, in pixels, between two consecutive rows.
//   tile_width:   tile width in pixels.
//   tile_height:  tile height in pixels.
//   green_to_red: green->red multiplier.
//   histo:        histogram indexed by the transformed red byte; entries are
//                 incremented, never reset here.
// Columns are processed SPAN at a time; the (tile_width % SPAN) right-most
// columns are delegated to VP8LCollectColorRedTransforms_C().
static void CollectColorRedTransforms_SSE2(
    const uint32_t* WEBP_RESTRICT argb, int stride,
    int tile_width, int tile_height,
    int green_to_red, uint32_t histo[]) {
  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask = _mm_set1_epi32(0xff);
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i A1 = _mm_and_si128(in1, mask_g);
      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0  | x r
      const __m128i B1 = _mm_srli_epi32(in1, 16);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0  | x dr
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x  | x r'
      const __m128i E1 = _mm_sub_epi8(B1, C1);
      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0  | 0 r'
      const __m128i F1 = _mm_and_si128(E1, mask);
      // Every 32-bit lane holds a value <= 0xff, so the signed-saturating
      // pack to 16 bits is lossless.
      const __m128i I = _mm_packs_epi32(F0, F1);
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                      left_over, tile_height,
                                      green_to_red, histo);
    }
  }
}
#undef SPAN
#undef MK_CST_16
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But // that's ok since the histogram values are less than 1<<28 (max picture size). staticvoid AddVector_SSE2(const uint32_t* WEBP_RESTRICT a, const uint32_t* WEBP_RESTRICT b,
uint32_t* WEBP_RESTRICT out, int size) { int i = 0; int aligned_size = size & ~15; // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of // 2). See the usage in VP8LHistogramAdd().
assert(size >= 16);
assert(size % 2 == 0);
staticvoid AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
uint32_t* WEBP_RESTRICT out, int size) { int i = 0; int aligned_size = size & ~15; // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of // 2). See the usage in VP8LHistogramAdd().
assert(size >= 16);
assert(size % 2 == 0);
staticint VectorMismatch_SSE2(const uint32_t* const array1, const uint32_t* const array2, int length) { int match_len;
if (length >= 12) {
__m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
__m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
match_len = 0; do { // Loop unrolling and early load both provide a speedup of 10% for the // current function. Also, max_limit can be MAX_LENGTH=4096 at most. const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); const __m128i B0 =
_mm_loadu_si128((const __m128i*)&array1[match_len + 4]); const __m128i B1 =
_mm_loadu_si128((const __m128i*)&array2[match_len + 4]); if (_mm_movemask_epi8(cmpA) != 0xffff) break;
match_len += 4;
// Predictor0: ARGB_BLACK. staticvoid PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_sub_epi8(src, black);
_mm_storeu_si128((__m128i*)&out[i], res);
} if (i != num_pixels) {
VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
}
(void)upper;
}
// Generates PredictorSub<X>_SSE2(), which writes to 'out' the byte-wise
// (wrapping) difference between each input pixel and a prediction loaded
// directly from memory.
//   X:  predictor index; also selects VP8LPredictorsSub_C[X] for the tail.
//   IN: lvalue expression, evaluated inside the loop (it may reference 'in',
//       'upper' and 'i'), naming the first of four consecutive predicted
//       pixels.
// Four pixels are handled per SSE2 iteration; any remaining tail is delegated
// to the plain-C implementation.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
static void PredictorSub##X##_SSE2(const uint32_t* const in,                  \
                                   const uint32_t* const upper,               \
                                   int num_pixels,                            \
                                   uint32_t* WEBP_RESTRICT const out) {       \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));              \
    const __m128i res = _mm_sub_epi8(src, pred);                              \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i),               \
                             num_pixels - i, out + i);                        \
  }                                                                           \
}
// Predictor10: avg(avg(L,TL), avg(T, TR)). staticvoid PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR, avgLTL, avg, res;
Average2_m128i(&T, &TR, &avgTTR);
Average2_m128i(&L, &TL, &avgLTL);
Average2_m128i(&avgTTR, &avgLTL, &avg);
res = _mm_sub_epi8(src, avg);
_mm_storeu_si128((__m128i*)&out[i], res);
} if (i != num_pixels) {
VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictor11: select.

// Computes, for each of the four 32-bit pixels, the sum of absolute
// differences of the four bytes of *A and *B.
//   A, B: the two groups of four pixels to compare.
//   out:  receives the four sums, one per 32-bit lane, in pixel order.
static void GetSumAbsDiff32_SSE2(const __m128i* const A,
                                 const __m128i* const B,
                                 __m128i* const out) {
  // _mm_sad_epu8 sums over 64-bit groups, so each pixel is widened to 64 bits.
  // We can unpack with any value on the upper 32 bits, provided it's the same
  // on both operands (so that their sum of abs diff is zero). Here we use *A.
  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
  // Each sum is at most 4 * 255, and the odd 32-bit lanes are zero, so the
  // signed-saturating pack yields exactly one 32-bit sum per pixel.
  *out = _mm_packs_epi32(s_lo, s_hi);
}
// Writes to 'out' the byte-wise (wrapping) difference between each input
// pixel and the "select" prediction: the left neighbour L when
// sum|L - TL| > sum|T - TL|, the top neighbour T otherwise.
//   in:         source pixels of the current row.
//   upper:      pixels of the row above.
//   num_pixels: number of pixels to process.
//   out:        destination for the residuals.
// NOTE(review): the loop reads in[i - 1] and upper[i - 1], so it assumes
// in[-1] and upper[-1] are readable -- confirm with callers.
// Four pixels are handled per SSE2 iteration; any remaining tail is delegated
// to VP8LPredictorsSub_C[11].
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);  // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);  // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);  // pred = (pb > pa) ? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
// Predictor12: ClampedSubSubtractFull. staticvoid PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i zero = _mm_setzero_si128(); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); const __m128i L_lo = _mm_unpacklo_epi8(L, zero); const __m128i L_hi = _mm_unpackhi_epi8(L, zero); const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); const __m128i T_lo = _mm_unpacklo_epi8(T, zero); const __m128i T_hi = _mm_unpackhi_epi8(T, zero); const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo); const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi); const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); const __m128i res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
} if (i != num_pixels) {
VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictors13: ClampedAddSubtractHalf staticvoid PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i zero = _mm_setzero_si128(); for (i = 0; i + 2 <= num_pixels; i += 2) { // we can only process two pixels at a time const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]); const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]); const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]); const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]); const __m128i L_lo = _mm_unpacklo_epi8(L, zero); const __m128i T_lo = _mm_unpacklo_epi8(T, zero); const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); const __m128i sum = _mm_add_epi16(T_lo, L_lo); const __m128i avg = _mm_srli_epi16(sum, 1); const __m128i A1 = _mm_sub_epi16(avg, TL_lo); const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg); const __m128i A2 = _mm_sub_epi16(A1, bit_fix); const __m128i A3 = _mm_srai_epi16(A2, 1); const __m128i A4 = _mm_add_epi16(avg, A3); const __m128i pred = _mm_packus_epi16(A4, A4); const __m128i res = _mm_sub_epi8(src, pred);
_mm_storel_epi64((__m128i*)&out[i], res);
} if (i != num_pixels) {
VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
}
}
//------------------------------------------------------------------------------ // Entry point
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.