/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
// Slide across window and add. staticINLINE int16x8_t slide_sum_s16(int16x8_t x) { // x = A B C D E F G H // // 0 A B C D E F G const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); // 0 0 A B C D E F const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), // 0 0 0 A B C D E
vec_slo(x, vec_splats((int8_t)(6 << 3)))); // 0 0 0 0 A B C D const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), // 0 0 0 0 0 A B C
vec_slo(x, vec_splats((int8_t)(10 << 3)))); // 0 0 0 0 0 0 A B const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), // 0 0 0 0 0 0 0 A
vec_slo(x, vec_splats((int8_t)(14 << 3)))); return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4));
}
// Slide across window and add. staticINLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { // 0 A C E // + 0 B D F
int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))),
vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); // 0 0 A C // + 0 0 B D
int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))),
vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); // 0 0 0 A // + 0 0 0 B
int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))),
vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3))));
sumsq_1 = vec_add(sumsq_1, xsq_even);
sumsq_2 = vec_add(sumsq_2, sumsq_3); return vec_add(sumsq_1, sumsq_2);
}
// Horizontal (across-a-row) post-proc filter, VSX version: for each pixel,
// a 15-tap sliding-window sum/sum-of-squares decides whether to replace the
// pixel with the windowed average (variance-gated by flimit).
//
// NOTE(review): this block appears TRUNCATED by a bad extraction and will
// not compile as-is:
//   - `unsignedchar` is a fused token; should be `unsigned char`.
//   - `xsq_even` / `xsq_odd` (used below) are never declared here; the code
//     that loads the row, computes squares, and the loop epilogue (including
//     the closing braces of both loops and of the function) is missing.
//     Restore from upstream libvpx vpx_dsp/ppc/deblock_vsx.c rather than
//     guessing — TODO confirm against upstream.
void vpx_mbpost_proc_across_ip_vsx(unsignedchar *src, int pitch, int rows, int cols, int flimit) { int row, col; const int32x4_t lim = vec_splats(flimit);
// 8 columns are processed at a time.
assert(cols % 8 == 0);
for (row = 0; row < rows; row++) { // The sum is signed and requires at most 13 bits. // (8 bits + sign) * 15 (4 bits)
int16x8_t sum; // The sum of squares requires at most 20 bits. // (16 bits + sign) * 15 (4 bits)
int32x4_t sumsq_even, sumsq_odd;
// Fill left context with first col.
int16x8_t left_ctx = vec_splats((int16_t)src[0]);
// Scalar seed for the window: first pixel counted 9 times (left border
// extension), plus rounding bias 16 in the squared sum.
int16_t s = src[0] * 9;
int32_t ssq = src[0] * src[0] * 9 + 16;
// Fill the next 6 columns of the sliding window with cols 2 to 7. for (col = 1; col <= 6; ++col) {
s += src[col];
ssq += src[col] * src[col];
} // Set this sum to every element in the window.
sum = vec_splats(s);
sumsq_even = vec_splats(ssq);
sumsq_odd = vec_splats(ssq);
for (col = 0; col < cols; col += 8) {
bool16x8_t mask;
int16x8_t filtered, masked;
uint8x16_t out;
// NOTE(review): xsq_even/xsq_odd are undefined at this point — the loads
// and square computations that should precede this call were lost.
const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); // A C E G // 0 B D F // 0 A C E // 0 0 B D // 0 0 A C // 0 0 0 B // 0 0 0 A
sumsq_even = vec_add(sumsq_even, sumsq_tmp); // B D F G // A C E G // 0 B D F // 0 A C E // 0 0 B D // 0 0 A C // 0 0 0 B // 0 0 0 A
// NOTE(review): function body ends abruptly here — the filtering, store,
// and closing braces are missing from this chunk.
sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd));
// Vertical (down-a-column) post-proc filter, VSX version: a 16-row sliding
// window per 8-column strip; each output row is variance-gated against
// flimit and replaced by the windowed average when below the limit.
//
// NOTE(review): this block appears TRUNCATED by a bad extraction:
//   - At the "Fill sliding window" line below, the `for (i = 0; i <= 8; ...)`
//     header has been fused into a comment, leaving its body orphaned.
//   - `sumsq_even`/`sumsq_odd` are read before being initialized (their
//     seeding code is missing), and `mask`, `filtered`, `masked`, `out`
//     and the row loop that declares/drives them are absent between the
//     window-fill section and the masking code.
//     Restore from upstream libvpx vpx_dsp/ppc/deblock_vsx.c rather than
//     guessing — TODO confirm against upstream.
void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, int flimit) { int col, row, i;
int16x8_t window[16]; const int32x4_t lim = vec_splats(flimit);
// 8 columns are processed at a time.
assert(cols % 8 == 0); // If rows is less than 8 the bottom border extension fails.
assert(rows >= 8);
for (col = 0; col < cols; col += 8) { // The sum is signed and requires at most 13 bits. // (8 bits + sign) * 15 (4 bits)
int16x8_t r1, sum; // The sum of squares requires at most 20 bits. // (16 bits + sign) * 15 (4 bits)
int32x4_t sumsq_even, sumsq_odd;
// NOTE(review): the loop header after "first row." below is inside a
// comment — the loop filling window[0..8] is effectively missing.
r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); // Fill sliding window with first row. for (i = 0; i <= 8; i++) {
window[i] = r1;
} // First 9 rows of the sliding window are the same. // sum = r1 * 9
sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16);
// Fill the next 6 rows of the sliding window with rows 2 to 7. for (i = 1; i <= 6; ++i) { const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst));
window[i + 8] = next_row;
sum = vec_add(sum, next_row);
// vec_mulo/vec_mule square the odd/even int16 lanes into int32 lanes.
// NOTE(review): sumsq_odd/sumsq_even are accumulated here without visible
// initialization — their seeding (r1 squared * 9) was lost in extraction.
sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row));
sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row));
}
// NOTE(review): the row loop and the computation of `mask`, `filtered`,
// `row` that this code depends on are missing between here and above.
// C: sumsq * 15 - sum * sum
mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
masked = vec_sel(window[8], filtered, mask);
// TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per // iteration
out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch),
load_merge);
vec_vsx_st(out, 0, dst + row * pitch);
// Optimization Note: Turns out that the following loop is faster than // using pointers to manage the sliding window. for (i = 1; i < 16; i++) {
window[i - 1] = window[i];
}
}
// Advance to the next 8-column strip.
dst += 8;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.