/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#if CONFIG_AV1_HIGHBITDEPTH
// Accumulate one strip of the auto-covariance matrix H: multiply the pixel
// pair packed in *dgd_ijkl against 8 overlapping pixel pairs taken from dgd,
// and add the 8 resulting products (widened to 64 bits) onto dst[0..7].
// dst must be 32-byte aligned (yy_load_256/yy_store_256 are aligned
// accesses — TODO(review): confirm against their definitions elsewhere in
// this file).
static inline void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
                                        const __m256i *shuffle,
                                        const __m256i *dgd_ijkl) {
  // Load two 128-bit chunks from dgd
  const __m256i s0 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)),
      _mm_loadu_si128((__m128i *)(dgd + 4)), 1);
  // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd
  // indices). The weird order is so the shuffle stays within 128-bit lanes.

  // Shuffle 16x u16 values within lanes according to the mask:
  // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4]
  // (Actually we shuffle u8 values as there's no 16-bit shuffle)
  const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle);
  // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices)

  // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit
  // integers, then horizontally add pairs of these integers resulting in 8x
  // 32-bit integers
  const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1);
  // d0 = [a b c d] [e f g h] as u32

  // Take the lower-half of d0, extend to u64, add it on to dst (H)
  const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
  // d0l = [a b] [c d] as u64
  const __m256i dst0 = yy_load_256(dst);
  yy_store_256(dst, _mm256_add_epi64(d0l, dst0));

  // Take the upper-half of d0, extend to u64, add it on to dst (H)
  const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
  // d0h = [e f] [g h] as u64
  const __m256i dst1 = yy_load_256(dst + 4);
  yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
}
staticinlinevoid acc_stat_highbd_win7_one_line_avx2( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m256i *shuffle, int32_t *sumX,
int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { int j, k, l; constint wiener_win = WIENER_WIN; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly.
assert(h_start % 2 == 0); constint h_end_even = h_end & ~1; constint has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint16_t X1 = src[j]; const uint16_t X2 = src[j + 1];
*sumX += X1 + X2; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) {
int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; const uint16_t D2 = dgd_ijk[l + 1];
sumY[k][l] += D1 + D2;
M_int[k][l] += D1 * X1 + D2 * X2;
// Load two u16 values from dgd_ijkl combined as a u32, // then broadcast to 8x u32 slots of a 256 const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
// The `acc_stat_highbd_avx2` function wants its input to have // interleaved copies of two pixels, but we only have one. However, the // pixels are (effectively) used as inputs to a multiply-accumulate. So // if we set the extra pixel slot to 0, then it is effectively ignored. const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
staticinlinevoid acc_stat_highbd_win5_one_line_avx2( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m256i *shuffle, int32_t *sumX,
int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { int j, k, l; constint wiener_win = WIENER_WIN_CHROMA; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly.
assert(h_start % 2 == 0); constint h_end_even = h_end & ~1; constint has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint16_t X1 = src[j]; const uint16_t X2 = src[j + 1];
*sumX += X1 + X2; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) {
int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; const uint16_t D2 = dgd_ijk[l + 1];
sumY[k][l] += D1 + D2;
M_int[k][l] += D1 * X1 + D2 * X2;
// Load two u16 values from dgd_ijkl combined as a u32, // then broadcast to 8x u32 slots of a 256 const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
// The `acc_stat_highbd_avx2` function wants its input to have // interleaved copies of two pixels, but we only have one. However, the // pixels are (effectively) used as inputs to a multiply-accumulate. So // if we set the extra pixel slot to 0, then it is effectively ignored. const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
// Zeroing mask for partial 8-bit loads. When 32 int8_t values are loaded but
// fewer than 32 are needed, ANDing with 16 bytes taken from inside this table
// clears the surplus lanes: loading 16 bytes starting at mask_8bit[16 - n]
// (for n <= 16) yields n leading all-ones bytes followed by zeros.
static const int8_t mask_8bit[32] = {
  -1, -1, -1, -1, -1, -1, -1, -1,  // all-ones bytes (keep)
  -1, -1, -1, -1, -1, -1, -1, -1,  // all-ones bytes (keep)
  0,  0,  0,  0,  0,  0,  0,  0,   // zero bytes (discard)
  0,  0,  0,  0,  0,  0,  0,  0,   // zero bytes (discard)
};
// Zeroing mask for partial 16-bit loads. When 16 int16_t values are loaded
// but fewer than 16 are needed, ANDing with 16 elements taken from inside
// this table clears the surplus lanes: loading 16 elements starting at
// mask_16bit[16 - n] (for n <= 16) yields n leading all-ones values followed
// by zeros.
static const int16_t mask_16bit[32] = {
  -1, -1, -1, -1, -1, -1, -1, -1,  // all-ones elements (keep)
  -1, -1, -1, -1, -1, -1, -1, -1,  // 16 elements = 32 bytes total
  0,  0,  0,  0,  0,  0,  0,  0,   // zero elements (discard)
  0,  0,  0,  0,  0,  0,  0,  0,   // 16 elements = 32 bytes total
};
// When width is not multiple of 32, it still loads 32 and to make the data // which is extra (beyond required) as zero using the below mask. if (wd_beyond_mul32 >= 16) {
mask_low = _mm_set1_epi8(-1);
mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32]));
} else {
mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32]));
mask_high = _mm_setzero_si128();
} const __m256i mask =
_mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1);
int32_t proc_ht = 0; do { // Process width in multiple of 32.
int32_t proc_wd = 0; while (proc_wd < wd_mul32) { const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); const __m256i sad_0 = _mm256_sad_epu8(s_0, zero);
ss = _mm256_add_epi32(ss, sad_0);
proc_wd += 32;
}
// Process the remaining width. if (wd_beyond_mul32) { const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); const __m256i s_m_0 = _mm256_and_si256(s_0, mask); const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero);
ss = _mm256_add_epi32(ss, sad_0);
}
src_temp += stride;
proc_ht++;
} while (proc_ht < height);
const uint32_t sum = calc_sum_of_register(ss); const uint8_t avg = sum / (width * height); return avg;
}
// Fill (src-avg) or (dgd-avg) buffers. Note that when n = (width % 16) is not // 0, it writes (16 - n) more data than required. staticinlinevoid sub_avg_block_avx2(const uint8_t *src, int32_t src_stride,
uint8_t avg, int32_t width,
int32_t height, int16_t *dst,
int32_t dst_stride, int use_downsampled_wiener_stats) { const __m256i avg_reg = _mm256_set1_epi16(avg);
int32_t proc_ht = 0; do { int ds_factor =
use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; if (use_downsampled_wiener_stats &&
(height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
ds_factor = height - proc_ht;
}
int32_t proc_wd = 0; while (proc_wd < width) { const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd)); const __m256i ss = _mm256_cvtepu8_epi16(s); const __m256i d = _mm256_sub_epi16(ss, avg_reg);
_mm256_storeu_si256((__m256i *)(dst + proc_wd), d);
proc_wd += 16;
}
// Fills lower-triangular elements of H buffer from upper triangular elements of // the same staticinlinevoid fill_lower_triag_elements_avx2(const int32_t wiener_win2,
int64_t *const H) { for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
__m256i in[4], out[4];
in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1));
in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1));
in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1));
in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1));
// The buffers H(auto-covariance) and M(cross-correlation) are used to estimate // the filter tap values required for wiener filtering. Here, the buffer H is of // size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size // (wiener_window_size*wiener_window_size). H is a symmetric matrix where the // value above the diagonal (upper triangle) are equal to the values below the // diagonal (lower triangle). The calculation of elements/stats of H(upper // triangle) and M is done in steps as described below where each step fills // specific values of H and M. // Once the upper triangular elements of H matrix are derived, the same will be // copied to lower triangular using the function // fill_lower_triag_elements_avx2(). // Example: Wiener window size = // WIENER_WIN_CHROMA (5) M buffer = [M0 M1 M2 ---- M23 M24] H buffer = Hxy // (x-row, y-column) [H00 H01 H02 ---- H023 H024] [H10 H11 H12 ---- H123 H124] // [H30 H31 H32 ---- H323 H324] // [H40 H41 H42 ---- H423 H424] // [H50 H51 H52 ---- H523 H524] // [H60 H61 H62 ---- H623 H624] // || // || // [H230 H231 H232 ---- H2323 H2324] // [H240 H241 H242 ---- H2423 H2424] // In Step 1, whole M buffers (i.e., M0 to M24) and the first row of H (i.e., // H00 to H024) is filled. The remaining rows of H buffer are filled through // steps 2 to 6. staticvoid compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride, const int16_t *const s, int32_t s_stride,
int32_t width, int v_start, int v_end,
int64_t *const M, int64_t *const H, int use_downsampled_wiener_stats) { const int32_t wiener_win = WIENER_WIN_CHROMA; const int32_t wiener_win2 = wiener_win * wiener_win; // Amount of width which is beyond multiple of 16. This case is handled // appropriately to process only the required width towards the end. const int32_t wd_mul16 = width & ~15; const int32_t wd_beyond_mul16 = width - wd_mul16; const __m256i mask =
_mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); int downsample_factor;
// Step 1: Full M (i.e., M0 to M24) and first row H (i.e., H00 to H024) // values are filled here. Here, the loop over 'j' is executed for values 0 // to 4 (wiener_win-1). When the loop executed for a specific 'j', 5 values of // M and H are filled as shown below. // j=0: M0-M4 and H00-H04, j=1: M5-M9 and H05-H09 are filled etc,. int j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d;
__m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
__m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
downsample_factor =
use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int proc_ht = v_start; do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_MH_VALUES(d_t + j + proc_wd)
// The below steps are designed to fill remaining rows of H buffer. Here, aim // is to fill only upper triangle elements correspond to each row and lower // triangle elements are copied from upper-triangle elements. Also, as // mentioned in Step 1, the core function is designed to fill 5 // elements/stats/values of H buffer. // // Step 2: Here, the rows 1, 6, 11, 16 and 21 are filled. As we need to fill // only upper-triangle elements, H10 from row1, H60-H64 and H65 from row6,etc, // are need not be filled. As the core function process 5 values, in first // iteration of 'j' only 4 values to be filled i.e., H11-H14 from row1,H66-H69 // from row6, etc. for (int i = 1; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN_CHROMA)
do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN5
}
// Step 3: Here, the rows 2, 7, 12, 17 and 22 are filled. As we need to fill // only upper-triangle elements, H20-H21 from row2, H70-H74 and H75-H76 from // row7, etc, are need not be filled. As the core function process 5 values, // in first iteration of 'j' only 3 values to be filled i.e., H22-H24 from // row2, H77-H79 from row7, etc. for (int i = 2; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN_CHROMA)
do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN5
}
// Step 4: Here, the rows 3, 8, 13, 18 and 23 are filled. As we need to fill // only upper-triangle elements, H30-H32 from row3, H80-H84 and H85-H87 from // row8, etc, are need not be filled. As the core function process 5 values, // in first iteration of 'j' only 2 values to be filled i.e., H33-H34 from // row3, H88-89 from row8, etc. for (int i = 3; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN_CHROMA)
do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN5
}
// Step 5: Here, the rows 4, 9, 14, 19 and 24 are filled. As we need to fill // only upper-triangle elements, H40-H43 from row4, H90-H94 and H95-H98 from // row9, etc, are need not be filled. As the core function process 5 values, // in first iteration of 'j' only 1 values to be filled i.e., H44 from row4, // H99 from row9, etc. for (int i = 4; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN_CHROMA) do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN5
}
// Step 6: Here, the rows 5, 10, 15 and 20 are filled. As we need to fill only // upper-triangle elements, H50-H54 from row5, H100-H104 and H105-H109 from // row10,etc, are need not be filled. The first iteration of 'j' fills H55-H59 // from row5 and H1010-H1014 from row10, etc. for (int i = 5; i < wiener_win2; i += wiener_win) { // Derive j'th iteration from where the H buffer filling needs to be // started.
j = i / wiener_win; int shift = 0; do { // Update the dgd pointers appropriately. int proc_ht = v_start; const int16_t *d_window = d + (i / wiener_win); const int16_t *d_current_row =
d + (i / wiener_win) + ((i % wiener_win) * d_stride);
downsample_factor =
use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
__m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + shift + proc_wd, 5)
// The buffers H(auto-covariance) and M(cross-correlation) are used to estimate // the filter tap values required for wiener filtering. Here, the buffer H is of // size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size // (wiener_window_size*wiener_window_size). H is a symmetric matrix where the // value above the diagonal (upper triangle) are equal to the values below the // diagonal (lower triangle). The calculation of elements/stats of H(upper // triangle) and M is done in steps as described below where each step fills // specific values of H and M. // Example: // Wiener window size = WIENER_WIN (7) // M buffer = [M0 M1 M2 ---- M47 M48] // H buffer = Hxy (x-row, y-column) // [H00 H01 H02 ---- H047 H048] // [H10 H11 H12 ---- H147 H148] // [H30 H31 H32 ---- H347 H348] // [H40 H41 H42 ---- H447 H448] // [H50 H51 H52 ---- H547 H548] // [H60 H61 H62 ---- H647 H648] // || // || // [H470 H471 H472 ---- H4747 H4748] // [H480 H481 H482 ---- H4847 H4848] // In Step 1, whole M buffers (i.e., M0 to M48) and the first row of H (i.e., // H00 to H048) is filled. The remaining rows of H buffer are filled through // steps 2 to 8. staticvoid compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride, const int16_t *const s, int32_t s_stride,
int32_t width, int v_start, int v_end,
int64_t *const M, int64_t *const H, int use_downsampled_wiener_stats) { const int32_t wiener_win = WIENER_WIN; const int32_t wiener_win2 = wiener_win * wiener_win; // Amount of width which is beyond multiple of 16. This case is handled // appropriately to process only the required width towards the end. const int32_t wd_mul16 = width & ~15; const int32_t wd_beyond_mul16 = width - wd_mul16; const __m256i mask =
_mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); int downsample_factor;
// Step 1: Full M (i.e., M0 to M48) and first row H (i.e., H00 to H048) // values are filled here. Here, the loop over 'j' is executed for values 0 // to 6. When the loop executed for a specific 'j', 7 values of M and H are // filled as shown below. // j=0: M0-M6 and H00-H06, j=1: M7-M13 and H07-H013 are filled etc,. int j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d;
__m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() };
__m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
downsample_factor =
use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int proc_ht = v_start; do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_MH_VALUES(d_t + j + proc_wd)
// The below steps are designed to fill remaining rows of H buffer. Here, aim // is to fill only upper triangle elements correspond to each row and lower // triangle elements are copied from upper-triangle elements. Also, as // mentioned in Step 1, the core function is designed to fill 7 // elements/stats/values of H buffer. // // Step 2: Here, the rows 1, 8, 15, 22, 29, 36 and 43 are filled. As we need // to fill only upper-triangle elements, H10 from row1, H80-H86 and H87 from // row8, etc. are need not be filled. As the core function process 7 values, // in first iteration of 'j' only 6 values to be filled i.e., H11-H16 from // row1 and H88-H813 from row8, etc. for (int i = 1; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN)
do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN7
}
// Step 3: Here, the rows 2, 9, 16, 23, 30, 37 and 44 are filled. As we need // to fill only upper-triangle elements, H20-H21 from row2, H90-H96 and // H97-H98 from row9, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 5 values to be filled // i.e., H22-H26 from row2 and H99-H913 from row9, etc. for (int i = 2; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN) do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN7
}
// Step 4: Here, the rows 3, 10, 17, 24, 31, 38 and 45 are filled. As we need // to fill only upper-triangle elements, H30-H32 from row3, H100-H106 and // H107-H109 from row10, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 4 values to be filled // i.e., H33-H36 from row3 and H1010-H1013 from row10, etc. for (int i = 3; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN)
do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN7
}
// Step 5: Here, the rows 4, 11, 18, 25, 32, 39 and 46 are filled. As we need // to fill only upper-triangle elements, H40-H43 from row4, H110-H116 and // H117-H1110 from row10, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 3 values to be filled // i.e., H44-H46 from row4 and H1111-H1113 from row11, etc. for (int i = 4; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN)
do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN7
}
// Step 6: Here, the rows 5, 12, 19, 26, 33, 40 and 47 are filled. As we need // to fill only upper-triangle elements, H50-H54 from row5, H120-H126 and // H127-H1211 from row12, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 2 values to be filled // i.e., H55-H56 from row5 and H1212-H1213 from row12, etc. for (int i = 5; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN) do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN7
}
// Step 7: Here, the rows 6, 13, 20, 27, 34, 41 and 48 are filled. As we need // to fill only upper-triangle elements, H60-H65 from row6, H130-H136 and // H137-H1312 from row13, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 1 value to be filled // i.e., H66 from row6 and H1313 from row13, etc. for (int i = 6; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started.
INITIALIZATION(WIENER_WIN) do {
UPDATE_DOWNSAMPLE_FACTOR
// Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd =
_mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
// process the remaining 'j' iterations.
j++;
CALCULATE_REMAINING_H_WIN7
}
// Step 8: Here, the rows 7, 14, 21, 28, 35 and 42 are filled. As we need // to fill only upper-triangle elements, H70-H75 from row7, H140-H146 and
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.21 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.