/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
void vpx_mbpost_proc_down_sse2(unsignedchar *dst, int pitch, int rows, int cols, int flimit) { int col; const __m128i zero = _mm_setzero_si128(); const __m128i f = _mm_set1_epi32(flimit);
DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
// 8 columns are processed at a time.
// If rows is less than 8 the bottom border extension fails.
assert(cols % 8 == 0);
assert(rows >= 8);
for (col = 0; col < cols; col += 8) { int row, i;
__m128i s = _mm_loadl_epi64((__m128i *)dst);
__m128i sum, sumsq_0, sumsq_1;
__m128i tmp_0, tmp_1;
__m128i below_context = _mm_setzero_si128();
s = _mm_unpacklo_epi8(s, zero);
for (i = 0; i < 8; ++i) {
_mm_store_si128((__m128i *)above_context + i, s);
}
// sum *= 9
sum = _mm_slli_epi16(s, 3);
sum = _mm_add_epi16(s, sum);
// Prime sum/sumsq for (i = 1; i <= 6; ++i) {
__m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
a = _mm_unpacklo_epi8(a, zero);
sum = _mm_add_epi16(sum, a);
a = _mm_mullo_epi16(a, a);
sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
}
if (row + 7 < rows) { // Instead of copying the end context we just stop loading when we get // to the last one.
below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
below_context = _mm_unpacklo_epi8(below_context, zero);
}
sum = _mm_sub_epi16(sum, above);
sum = _mm_add_epi16(sum, below_context);
// context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
// extend. Unfortunately we can't do below_sq - above_sq in 16 bits
// because x86 does not have unpack with sign extension.
above_sq = _mm_mullo_epi16(above, above);
sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.