/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
// Widens a rectangle of 8-bit pixels into a 16-bit destination buffer,
// handling 16, 8 and 4 pixels per step.
// NOTE(review): this definition is truncated in this extraction -- the
// per-row pointer advance (src += sstride; dst += dstride;), the
// do/while(height) close and the function's closing brace are missing from
// the visible text. Recover the full body from upstream before editing.
void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { do { const uint8_t *src_ptr = src;
uint16_t *dst_ptr = dst;
// 16 pixels at a time: interleave each byte row with a zero vector so that
// each source byte lands in the low byte of a destination u16 lane.
// NOTE(review): this relies on little-endian u16 lane layout -- confirm the
// target before reusing elsewhere.
int w = 0; while (width - w >= 16) {
uint8x16_t row = vld1q_u8(src_ptr + w);
uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);
w += 16;
// 8-pixel step: widen explicitly with vmovl_u8.
} if (width - w >= 8) {
uint8x8_t row = vld1_u8(src_ptr + w);
vst1q_u16(dst_ptr + w, vmovl_u8(row));
w += 8;
// Scalar tail for a 4-pixel remainder.
} if (width - w == 4) { for (int i = w; i < w + 4; i++) {
dst_ptr[i] = src_ptr[i];
}
}
// NOTE(review): this region is a garbled splice of at least two different
// functions. The header at the end of the long comment line below reads
// "staticinline" (a lost space; this will not compile), and the body mixes
// the first statements of a direction-cost helper with the best-direction
// selection tail of a different function: `cost03`, `cost47`, `best_cost`,
// `best_dir`, `var` and `cost` are all used without being declared or
// computed here, and the declared `zero`, `partial2a`, `partial2b` are never
// used in the visible text. Do not edit in place -- recover the two original
// functions from upstream.
// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal // down-right, 6 is vertical). // // For each direction the lines are shifted so that we can perform a // basic sum on each vector element. For example, direction 5 is "south by // southeast", so we need to add the pixels along each line i below: // // 0 1 2 3 4 5 6 7 // 0 1 2 3 4 5 6 7 // 8 0 1 2 3 4 5 6 // 8 0 1 2 3 4 5 6 // 9 8 0 1 2 3 4 5 // 9 8 0 1 2 3 4 5 // 10 9 8 0 1 2 3 4 // 10 9 8 0 1 2 3 4 // // For this to fit nicely in vectors, the lines need to be shifted like so: // 0 1 2 3 4 5 6 7 // 0 1 2 3 4 5 6 7 // 8 0 1 2 3 4 5 6 // 8 0 1 2 3 4 5 6 // 9 8 0 1 2 3 4 5 // 9 8 0 1 2 3 4 5 // 10 9 8 0 1 2 3 4 // 10 9 8 0 1 2 3 4 // // In this configuration we can now perform SIMD additions to get the cost // along direction 5. Since this won't fit into a single 128-bit vector, we use // two of them to compute each half of the new configuration, and pad the empty // spaces with zeros. Similar shifting is done for other directions, except // direction 6 which is straightforward as it's the vertical direction. staticinline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
uint32_t cost[4]) { const int16x8_t zero = vdupq_n_s16(0);
// Special case for direction 2 as it's just a sum along each line.
int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);
// NOTE(review): from here on the text belongs to a different function -- the
// uint32x4_t vectors `cost03`/`cost47` used below are not the int16x8_t
// arrays declared above, and are never defined in the visible code.
// Find max cost as well as its index to get best_dir. // The max cost needs to be propagated in the whole vector to find its // position in the original cost vectors cost03 and cost47.
uint32x4_t cost07 = vmaxq_u32(cost03, cost47); #if AOM_ARCH_AARCH64
best_cost = vmaxvq_u32(cost07);
uint32x4_t max_cost = vdupq_n_u32(best_cost);
uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
vreinterpretq_u8_u32(
vceqq_u32(max_cost, cost47)) } }; // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL)); // Get the lowest 8 bit of each 32-bit elements and reverse them.
uint8x8_t tbl = vqtbl2_u8(costs, idx);
uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
// Count leading zero bytes to turn the match mask into a lane index.
best_dir = aom_clzll(a) >> 3; #else
// Armv7 fallback: pairwise max to find the overall maximum cost.
uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
cost64 = vpmax_u32(cost64, cost64);
uint32x4_t max_cost = vcombine_u32(cost64, cost64);
best_cost = vget_lane_u32(cost64, 0);
uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
vmovn_u32(vceqq_u32(max_cost, cost47)));
uint8x8_t idx =
vand_u8(vmovn_u16(costs),
vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL))); int sum = horizontal_add_u8x8(idx);
best_dir = get_msb(sum ^ (sum - 1)); #endif
// Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out.
*var = best_cost - cost[(best_dir + 4) & 7]; // We'd normally divide by 840, but dividing by 1024 is close enough // for what we're going to do with this.
*var >>= 10; return best_dir;
}
// Finds the filtering direction and variance for two adjacent 8x8 blocks by
// running the single-block direction search on each block independently.
// Results for block i are written through the i-th var/dir output pointers.
void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  const uint16_t *imgs[2] = { img1, img2 };
  int32_t *vars[2] = { var_out_1st, var_out_2nd };
  int *dirs[2] = { out_dir_1st_8x8, out_dir_2nd_8x8 };
  for (int i = 0; i < 2; i++) {
    dirs[i][0] = cdef_find_dir(imgs[i], stride, vars[i], coeff_shift);
  }
}
// NOTE(review): orphan fragment. The statements below are interior pieces of
// a CDEF filter function whose header and filtering arithmetic are missing
// from this extraction: two repetitions of the primary/secondary clamp-max
// tracking (presumably the 8-wide and 4-wide row paths) followed by a row
// loop's closing statements. `pri_src`, `sec_src`, `max`,
// `cdef_large_value_mask`, `in`, `dst8` and `h` are all declared in the
// missing enclosing scope. Do not edit until the full function is recovered.
// The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max.
uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
vreinterpretq_u8_u16(pri_src[1]));
uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
vreinterpretq_u8_u16(pri_src[3]));
pri_max0 = vmaxq_u8(pri_max0, pri_max1);
max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
cdef_large_value_mask));
// The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max.
uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
vreinterpretq_u8_u16(sec_src[1]));
uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
vreinterpretq_u8_u16(sec_src[3]));
uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
vreinterpretq_u8_u16(sec_src[5]));
uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
vreinterpretq_u8_u16(sec_src[7]));
sec_max0 = vmaxq_u8(sec_max0, sec_max1);
sec_max2 = vmaxq_u8(sec_max2, sec_max3);
sec_max0 = vmaxq_u8(sec_max0, sec_max2);
max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
cdef_large_value_mask));
// NOTE(review): second copy of the same max-tracking sequence; in the
// original file this would be the other block-width path of the same
// function.
// The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max.
uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
vreinterpretq_u8_u16(pri_src[1]));
uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
vreinterpretq_u8_u16(pri_src[3]));
pri_max0 = vmaxq_u8(pri_max0, pri_max1);
max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
cdef_large_value_mask));
// The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max.
uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
vreinterpretq_u8_u16(sec_src[1]));
uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
vreinterpretq_u8_u16(sec_src[3]));
uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
vreinterpretq_u8_u16(sec_src[5]));
uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
vreinterpretq_u8_u16(sec_src[7]));
sec_max0 = vmaxq_u8(sec_max0, sec_max1);
sec_max2 = vmaxq_u8(sec_max2, sec_max3);
sec_max0 = vmaxq_u8(sec_max0, sec_max2);
max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
cdef_large_value_mask));
// Advance two rows per iteration (4-wide path stores two rows at a time).
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
h -= 2;
} while (h != 0);
}
}
// NOTE(review): cdef_filter_8_1_neon (the primary-strength-only variant,
// judging by which parameters are cast to void) is truncated in this
// extraction. Only the signature, the unused-parameter casts and the closing
// statements of a row loop survive; the filter core and the declarations of
// `dst8` and `h` are missing. Recover the full body from upstream.
void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
// Secondary-filter parameters are unused in this variant.
(void)sec_strength;
(void)sec_damping;
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
h -= 2;
} while (h != 0);
}
}
// NOTE(review): cdef_filter_8_2_neon (the secondary-strength-only variant,
// judging by which parameters are cast to void) is truncated in this
// extraction. Only the signature, the unused-parameter casts and the closing
// statements of a row loop survive; the filter core and the declarations of
// `dst8` and `h` are missing. Recover the full body from upstream.
void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
// Primary-filter parameters are unused in this variant.
(void)pri_strength;
(void)pri_damping;
(void)coeff_shift;
in += 2 * CDEF_BSTRIDE;
dst8 += 2 * dstride;
h -= 2;
} while (h != 0);
}
}
// CDEF kernel for the no-filtering case (both strengths zero): copy the
// 16-bit intermediate buffer `in` (row pitch CDEF_BSTRIDE) to the 8-bit
// destination, saturating each value to [0, 255] via vqmovn_u16.
//
// dest         : uint8_t destination, row pitch `dstride`.
// block_width  : 8 or 4; selects the full-row or paired-4-wide path.
// block_height : number of rows; assumed even in the 4-wide path, which
//                stores two rows per iteration.
// All filter parameters are ignored.
//
// Fixes vs. previous revision: removed the contradictory
// `(void)block_width;` cast (the parameter is used by the branch below) and
// hoisted the duplicated `dst8` declaration out of both branches.
void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
  // Filtering parameters are irrelevant in the pure-copy case.
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  uint8_t *dst8 = (uint8_t *)dest;
  if (block_width == 8) {
    // Full-width rows: one 8-lane vector per row.
    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      vst1_u8(dst8, vqmovn_u16(s));
      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    // 4-wide blocks: gather two rows into one vector, store them with the
    // destination stride.
    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u8x4_strided_x2(dst8, dstride, vqmovn_u16(s));
      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
// NOTE(review): cdef_filter_16_0_neon (primary + secondary filtering, 16-bit
// path) is badly garbled in this extraction: the tap-offset declarations
// below read "constint" (lost spaces; will not compile), the entire filter
// arithmetic, clamping and store are missing from the row loop, and the loop
// advances `in`/`dst16` by two rows while only one row is loaded. Recover
// the full body from upstream before editing.
void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
// Mask used to strip the CDEF_VERY_LARGE "padding" flag from sample values.
uint16x8_t max, min; const uint16x8_t cdef_large_value_mask =
vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); constint po1 = cdef_directions[dir][0]; constint po2 = cdef_directions[dir][1]; constint s1o1 = cdef_directions[dir + 2][0]; constint s1o2 = cdef_directions[dir + 2][1]; constint s2o1 = cdef_directions[dir - 2][0]; constint s2o2 = cdef_directions[dir - 2][1]; constint *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; constint *sec_taps = cdef_sec_taps;
// Reduce damping so the strength's MSB position fits the shift range.
if (pri_strength) {
pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
} if (sec_strength) {
sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
}
if (block_width == 8) {
uint16_t *dst16 = (uint16_t *)dest;
int h = block_height; do {
int16x8_t sum = vdupq_n_s16(0);
uint16x8_t s = vld1q_u16(in);
// Seed the clamp range with the centre sample.
max = min = s;
// NOTE(review): filter taps, clamping and the result store are missing
// here in the extracted text.
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
h -= 2;
} while (h != 0);
}
}
// NOTE(review): cdef_filter_16_1_neon (primary-only, 16-bit path) is
// truncated in this extraction. Only the signature, the unused-parameter
// casts and the closing statements of a row loop survive; the filter core
// and the declarations of `dst16` and `h` are missing. Recover the full body
// from upstream.
void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
// Secondary-filter parameters are unused in this variant.
(void)sec_strength;
(void)sec_damping;
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
h -= 2;
} while (h != 0);
}
}
// NOTE(review): cdef_filter_16_2_neon (secondary-only, 16-bit path) is
// truncated in this extraction. Only the signature, the unused-parameter
// casts and the closing statements of a row loop survive; the filter core
// and the declarations of `dst16` and `h` are missing. Recover the full body
// from upstream.
void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
// Primary-filter parameters are unused in this variant.
(void)pri_strength;
(void)pri_damping;
(void)coeff_shift;
in += 2 * CDEF_BSTRIDE;
dst16 += 2 * dstride;
h -= 2;
} while (h != 0);
}
}
// CDEF kernel for the no-filtering case (both strengths zero), 16-bit path:
// copy rows straight from the intermediate buffer `in` (pitch CDEF_BSTRIDE)
// to the uint16_t destination (pitch `dstride`). Filter parameters are
// ignored.
void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) {
  // All filtering parameters are unused in the copy-only case.
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    // Full-width path: one 8-lane vector copied per row.
    uint16_t *dst = (uint16_t *)dest;
    int rows = block_height;
    do {
      const uint16x8_t pixels = vld1q_u16(in);
      vst1q_u16(dst, pixels);
      in += CDEF_BSTRIDE;
      dst += dstride;
      rows--;
    } while (rows != 0);
  } else {
    // 4-wide path: gather two rows into one vector, scatter them back with
    // the destination stride.
    uint16_t *dst = (uint16_t *)dest;
    int rows = block_height;
    do {
      const uint16x8_t pixels = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u16x4_strided_x2(dst, dstride, pixels);
      in += 2 * CDEF_BSTRIDE;
      dst += 2 * dstride;
      rows -= 2;
    } while (rows != 0);
  }
}
/* NOTE(review): the trailing text below this point in the original capture is
 * residue from the web tool that produced this extraction, not part of the
 * source file. German original, translated: "Measurement V0.5 -- processing
 * duration: 0.17 seconds (preprocessed). The information on this website was
 * carefully compiled to the best of our knowledge; however, neither
 * completeness, nor correctness, nor quality of the provided information is
 * guaranteed. Remark: the syntax colouring and the measurement are still
 * experimental." Remove this note when the file is restored from upstream. */