/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include <immintrin.h> // AVX2
static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2( int M, int N, const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
__m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
__m256i sum_ref0, sum_ref1, sum_ref2; int i, j; const uint8_t *ref0, *ref1, *ref2; const __m256i zero = _mm256_setzero_si256();
for (int i = 0; i < N; i += 2) { // load src and all refs
src_reg = yy_loadu2_128(src + src_stride, src);
ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
// sum of the absolute differences between every ref-i to src
ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
// sum every ref-i
sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
for (int i = 0; i < N; i += 2) { // load src and all refs
src_reg = yy_loadu2_128(src + src_stride, src);
ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
// sum of the absolute differences between every ref-i to src
ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
// sum every ref-i
sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.