/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

// SSEHelpers.h provides loadUnaligned128, used in the loops below.
#include "SSEHelpers.h"
/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1).
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * assuming we eliminate overflows and underflows, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1.
 *
 * Or to reverse the proof:
 * avg = ((sum >> 1) + carry + (d >> 1)) >> 1
 * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
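/* As a concrete 8-bit illustration of that last identity: take a = 3, b = 4.
 * Then g(3, 4) = (3 + 4) >> 1 = 3, while the rounded f(3, 4) =
 * (3 + 4 + 1) >> 1 = 4. Using the 8-bit complements ~3 = 252 and ~4 = 251:
 * f(252, 251) = (252 + 251 + 1) >> 1 = 252, and ~252 = 3, which equals
 * g(3, 4). This is what lets the SSE2 code below build truncated averages
 * out of _mm_avg_epu8, which computes the rounded average.
 */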
// Bitwise NOT of a 128-bit vector; the building block for the complement
// trick above.
MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg) {
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here, MSVC does not allow passing more than 3
 * __m128i arguments on the stack. And it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 * with just the inline directive.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i* a, __m128i* b, __m128i* c,
                                       __m128i* d) {
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm)                                      \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)),                 \
                                  _mm_castsi128_ps((argb)), (imm)));

  // Deinterleave the rows: put the even pixels of each row in *a/*c and the
  // odd pixels in *b/*d, so that matching lanes hold the four corners of
  // each 2x2 block.
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  // Sum and carry of the three-input half-adder described above.
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));
  __m128i carry = _mm_or_si128(
      _mm_and_si128(*a, *b),
      _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  // Average the sum with the fourth input and then with the carry, using the
  // complement trick to get truncating averages out of the rounding
  // _mm_avg_epu8.
  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}
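/* Minimal sketches of the two remaining averaging helpers the scaling loops
 * below rely on, built on the same complement trick; the shuffle ordering in
 * avg_sse2_8x1_4x1 is one reasonable layout for pairing adjacent pixels.
 */

// Truncated element-wise average of two vectors of pixels. Used both for
// 4x2 and for 4x1 input blocks.
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b) {
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

// Average horizontally adjacent pixel pairs: split the eight input pixels
// into their odd and even lanes, then average those element-wise.
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b) {
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a),
                                      _MM_SHUFFLE(2, 0, 2, 0)));
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(t), _mm_not_si128(b)));
}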
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c,
uint32_t d) {
uint32_t sum = a ^ b ^ c;
uint32_t carry = (a & b) | (a & c) | (b & c);
uint32_t mask = 0xfefefefe;
  // Not having a byte based average instruction means we should mask to
  // avoid underflow.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}
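/* A worked single-byte example of the masked average used above: with
 * sum = 0x01 and d = 0x03 in one byte lane, (sum ^ d) = 0x02, masking and
 * shifting gives 0x01, and (sum & d) = 0x01, for a total of 0x02, which is
 * indeed (1 + 3) >> 1. The 0xfefefefe mask clears the low bit of every byte
 * before the shift so no bit can leak into the neighbouring byte lane.
 */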
// Simple 2 pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b) {
uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
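/* Again a worked byte example: with a = 0x10 and b = 0x30 in one lane,
 * sum = a ^ b = 0x20, carry = a & b = 0x10, and the result is
 * (0x20 >> 1) + 0x10 = 0x20, which is (0x10 + 0x30) >> 1. This relies on
 * the identity a + b == (a ^ b) + 2 * (a & b).
 */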
namespace mozilla {
namespace gfx {

void ImageHalfScaler::HalfImage2D_SSE2(uint8_t* aSource, int32_t aSourceStride,
                                       const IntSize& aSourceSize,
                                       uint8_t* aDest, uint32_t aDestStride) {
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
__m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
__m128i* lowerRow =
(__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
__m128i a = _mm_load_si128(upperRow);
__m128i b = _mm_load_si128(upperRow + 1);
__m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // Only the upper row is aligned.
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      // Only the lower row is aligned.
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);
*storage++ = avg_sse2_8x2(&a, &b, &c, &d);
}
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
__m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
__m128i* lowerRow =
(__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
__m128i a = loadUnaligned128(upperRow);
__m128i b = loadUnaligned128(upperRow + 1);
__m128i c = loadUnaligned128(lowerRow);
__m128i d = loadUnaligned128(lowerRow + 1);
*storage++ = avg_sse2_8x2(&a, &b, &c, &d);
}
}
    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels, we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we only have to do this in the last row since overflowing
    // 8 pixels in an earlier row would appear to be harmless as it doesn't
    // touch invalid memory. Even when reading and writing to the same surface.
    // In practice we only do this when doing an additional downscale pass, and
    // in this situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t* upperRow = (uint32_t*)(aSource + (y * aSourceStride + x * Bpp));
      uint32_t* lowerRow =
          (uint32_t*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

      *unalignedStorage++ =
          Avg2x2(*upperRow, *(upperRow + 1), *lowerRow, *(lowerRow + 1));
    }
  }
}

void ImageHalfScaler::HalfImageVertical_SSE2(uint8_t* aSource,
                                             int32_t aSourceStride,
                                             const IntSize& aSourceSize,
                                             uint8_t* aDest,
                                             uint32_t aDestStride) {
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // Only the upper row is aligned.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      // Only the lower row is aligned.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
__m128i a = loadUnaligned128((__m128i*)upperRow);
__m128i b = _mm_load_si128((__m128i*)lowerRow);
*storage++ = avg_sse2_4x2_4x1(a, b);
}
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
__m128i a = loadUnaligned128((__m128i*)upperRow);
__m128i b = loadUnaligned128((__m128i*)lowerRow);
*storage++ = avg_sse2_4x2_4x1(a, b);
}
}
    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels, we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations are valid as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint32_t* upperRow = (uint32_t*)(aSource + (y * aSourceStride + x * 4));
      uint32_t* lowerRow =
          (uint32_t*)(aSource + ((y + 1) * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*upperRow, *lowerRow);
    }
  }
}
void ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t* aSource,
                                               int32_t aSourceStride,
                                               const IntSize& aSourceSize,
                                               uint8_t* aDest,
                                               uint32_t aDestStride) {
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i* storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
__m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
__m128i a = _mm_load_si128(pixels);
__m128i b = _mm_load_si128(pixels + 1);
*storage++ = avg_sse2_8x1_4x1(a, b);
}
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
__m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
__m128i a = loadUnaligned128(pixels);
__m128i b = loadUnaligned128(pixels + 1);
*storage++ = avg_sse2_8x1_4x1(a, b);
}
}
    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels, we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations are valid as in the previous function.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t* pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

}  // namespace gfx
}  // namespace mozilla