/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/* partial A is a 16-bit vector of the form: [x8 - - x1 | x16 - - x9] and
 * partial B has the form: [0 y1 - y7 | 0 y9 - y15]. This function computes
 * (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 on
 * each 128-bit lane. Here the C1..C8 constants are in const1 and const2.
 * Both input vectors are clobbered; the folded 32-bit sums are returned. */
static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala,
                                            __m256i *partialb,
                                            const __m256i *const1,
                                            const __m256i *const2) {
  // Byte-shuffle mask that reverses the eight 16-bit elements within each
  // 128-bit half of the 256-bit register.
  static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908,
                                             0x03020504, 0x0f0e0100,
                                             0x0b0a0d0c, 0x07060908,
                                             0x03020504, 0x0f0e0100 };
  __m256i tmp;
  /* Reverse partial B. */
  *partialb = _mm256_shuffle_epi8(
      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = *partiala;
  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
  /* Square and add the corresponding x and y values. */
  *partiala = _mm256_madd_epi16(*partiala, *partiala);
  *partialb = _mm256_madd_epi16(*partialb, *partialb);
  /* Multiply by constant. */
  *partiala = _mm256_mullo_epi32(*partiala, *const1);
  *partialb = _mm256_mullo_epi32(*partialb, *const2);
  /* Sum all results. */
  *partiala = _mm256_add_epi32(*partiala, *partialb);
  return *partiala;
}
// Find the best CDEF filtering direction for two adjacent 8x8 blocks, and
// the variance difference versus the orthogonal direction for each.
// Outputs: *out_dir_1st_8x8/*out_dir_2nd_8x8 (best direction index 0..7) and
// *var_out_1st/*var_out_2nd (direction-strength estimate, scaled down by 10
// bits).
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  // Per-direction cost accumulators for the two 8x8 blocks.
  int32_t cost_first_8x8[8];
  int32_t cost_second_8x8[8];
  // Used to store the best cost for 2 8x8's.
  int32_t best_cost[2] = { 0 };
  // Best direction for 2 8x8's.
  int best_dir[2] = { 0 };

  // NOTE(review): the direction-search stage that reads img1/img2 (using
  // stride and coeff_shift) and fills cost_first_8x8, cost_second_8x8,
  // best_cost and best_dir appears to be missing from this chunk — as
  // written the cost arrays are read uninitialized below, which is
  // undefined behavior. Restore the missing computation from the upstream
  // implementation before shipping.

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var_out_1st >>= 10;
  *var_out_2nd >>= 10;
  *out_dir_1st_8x8 = best_dir[0];
  *out_dir_2nd_8x8 = best_dir[1];
}
// Copy a (width x height) rectangle of 8-bit pixels into a 16-bit
// destination, zero-extending each pixel. Rows are processed two at a time
// (height must be even), with progressively narrower SIMD widths
// (16, 8, then 4 pixels per row) and a scalar loop for any remaining columns.
//
// NOTE(review): each SIMD bucket executes at most once, so columns are fully
// covered only for width < 32 (e.g. CDEF block widths) — confirm against
// callers before using with wider rectangles.
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      // Interleaving with zero widens each byte to 16 bits.
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    int i = 0;
    do {
      // NOTE(review): the int32_t* casts below assume src rows may be read as
      // 32-bit words (alignment/strict-aliasing is relied on upstream as
      // well); a memcpy-based load would be the strictly conforming form.
      __m128i row0 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
      __m128i row1 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels one at a time.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}
/* (Extraction residue from a German code-viewer web page — not part of the
 * original source; preserved here, commented out so the file stays valid C:)
 *
 * Messung V0.5
 * - Dauer der Verarbeitung: 0.10 Sekunden (vorverarbeitet)
 * - Die Informationen auf dieser Webseite wurden nach bestem Wissen
 *   sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch
 *   Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.
 * Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell.
 */