/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// This mask rearranges source pixels in the order shown below. // shuffle_src_layer0[0][8]: applied on source pixels 0 to 7. // shuffle_src_layer0[1][8]: applied on source pixels 7 to 14. // This shuffling is needed to process 3 5x5 blocks which need // source pixels in the following order. // 1st 5x5 block: source pixels needed are 0 to 4, // 2nd 5x5 block: source pixels needed are 4 to 8, // 3rd 5x5 block: source pixels needed are 8 to 12. // Source pixels are loaded like mentioned below. // load_src0 : 0, 1, 2, 3, 4, 5, 6, 7 // load_src1 : 7, 8, 9, 10, 11, 12, 13, 14 // After applying masks, source bytes will be in the order: // load_src0 : 0, 1, 2, 3, 4, 4, 5, 6 // consists 5 pixels needed for 1st 5x5 block and // first 3 pixels needed for 2nd 5x5 block. // load_src1 : 7, 8, 8, 9, 10, 11, 12, x // consists last 2 pixels needed for 2nd 5x5 block and // 5 pixels needed for 3rd 5x5 block.
DECLARE_ALIGNED(32, staticconst uint32_t,
shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
{ 0, 1, 1, 2, 3, 4, 5, 0 } };
// This mask rearrange the weights to match shuffled source pixels order.
DECLARE_ALIGNED(32, staticconst uint32_t,
shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
{ 3, 4, 0, 1, 2, 3, 4, 0 } };
// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2. // For layer 1 and layer 2, convolution happens at 2x2 as filter_width and // filter_height are equal to 2. So rearranging the weights in the // order shown below to match source pixels. Basically this mask replicates // the weights across the width of 2.
DECLARE_ALIGNED(32, staticconst uint32_t,
shuffle_weight_layer_1_and_2[2][8]) = {
{ 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
};
// After the stages of multiplication and accumulation, the output values // in the register will be jumbled. In order to store register into // output buffer in a proper way, the following mask is applied on output // register.
DECLARE_ALIGNED(32, staticconst uint32_t,
shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
// Load weights needed for layer 0 (for 5x5 block processing), // and fill the registers appropriately to match source pixel mapping. staticinlinevoid prepare_weights_for_5x5_convolve( constfloat *layer_config_weights, int off, float weight[5][8], constint cstep, __m256 *shuffle_weight, const __m256i weight_mask_0, const __m256i weight_mask_1) { for (int row = 0; row < 5; ++row) { for (int col = 0; col < 5; ++col) {
weight[row][col] = layer_config_weights[off];
off += cstep;
}
}
shuffle_weight[0] = _mm256_loadu_ps(weight[0]);
shuffle_weight[1] = _mm256_loadu_ps(weight[1]);
shuffle_weight[2] = _mm256_loadu_ps(weight[2]);
shuffle_weight[3] = _mm256_loadu_ps(weight[3]);
shuffle_weight[4] = _mm256_loadu_ps(weight[4]);
// Sort the output in order to store into output buffer.
load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
*out_accum = _mm256_add_ps(*out_accum, load_src[0]);
}
// Sort the output in order to store into output buffer.
load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
*out_accum = _mm256_add_ps(*out_accum, load_src[0]);
}
// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when // filter_width and filter_height are equal to 5. // CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config. // Based on the configuration set for each layer, the current encoder // always chooses the case of no_maxpool_padding_valid. // And also for layer 0 convolution happens at 5x5 level as the // filter_width and filter_height are set as 5. staticvoid cnn_convolve_no_maxpool_padding_valid_5x5_avx2( constfloat **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, constint cstep, constint channel_step) { constint kFilterWidth = 5; constint kFilterHeight = 5; constint kSkipWidth = 4; constint kSkipHeight = 4;
assert(layer_config->filter_width == kFilterWidth &&
layer_config->filter_height == kFilterHeight);
assert(layer_config->skip_width == kSkipWidth &&
layer_config->skip_height == kSkipHeight);
// Width needs to be moved to go to next iteration of processing 3 5x5 blocks. constint kSkipWidthForNextIter = kSkipWidth * 3;
// Minimum width required to process 3 5x5 blocks at a time. // min width (for processing 3 5x5 block) = 2*skip_width + filter_width // Here, skip_width specifies how much width we should move while processing // next block convolution and filter_width specifies for how many pixels // filter needs to be applied. constint kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { constfloat out_ch_bias = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) {
__m256 shuffle_weight[10];
// Weights needed are 5x5, for SIMD purpose made this array as 5x8. float weight[5][8] = { { 0 } }; int off = k * layer_config->out_channels + i;
// In layer 0, the convolution process happens at 5x5. // The weights needed for 5x5 block are same across the in-channels, // which is why the load of weights happens once for each in-channel.
prepare_weights_for_5x5_convolve(layer_config->weights, off, weight,
cstep, shuffle_weight, weight_mask_0,
weight_mask_1);
for (int h = 0, u = 0; h < in_height - kFilterHeight + 1;
h += kSkipHeight, ++u) { constint out_h = u * out_stride; int v = 0; int w = 0; int rem_width = in_width; // Processing 3 5x5 blocks at a time, if sufficient width is present. while (rem_width >= kMinWidthFor3_5x5Blocks) {
__m256 load_src_0, load_src_1;
__m256 accum_src_0 = _mm256_setzero_ps();
__m256 accum_src_1 = _mm256_setzero_ps(); constfloat *input_ptr = &input[k][h * in_stride + w];
PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS();
v += 1;
w += kSkipWidth;
rem_width -= kSkipWidth;
}
}
}
}
}
// AVX2 implementation for layer 1. staticinlinevoid cnn_convolve_no_maxpool_padding_valid_layer1_avx2( constfloat **input, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, constint cstep, constint channel_step) {
__m256i weight_mask[2];
__m256i shuffle_output_mask;
load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
constint kInHeight = 16; constint kFilterHeight = 2; constint kSkipHeight = 2; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
__m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); // out_accum registers are used to store the 2x2 convolve outputs // (calculated over input block size), which are accumulated across the // in_channels. As per the design, each iteration of for loop processes 8 // (horizontal) 2x2 blocks and stores in corresponding out_accum register // (as input size is 16x16, a total of 64 2x2 blocks are present and 8 // out_accum registers are enough to store the outputs). // Hence for loops corresponding to 'j' and 'h', below, run over the number // of out_accum registers.
__m256 out_accum[8]; for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg; for (int k = 0; k < layer_config->in_channels; ++k) {
__m256 shuffle_weight[2]; int off = k * layer_config->out_channels + i; // In layer 1, the convolution process happens at 2x2. // The weights needed for 2x2 block are same across the in-channels, // which is why the load of weights happens once for each in-channel.
prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
shuffle_weight, weight_mask);
for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
h += kSkipHeight, ++u) { constfloat *input_ptr = &input[k][h * in_stride];
perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight,
&out_accum[u], shuffle_output_mask);
}
} // Store output of layer 1. for (int j = 0; j < 8; ++j) {
_mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
}
}
}
// AVX2 implementation for layer 2. staticinlinevoid cnn_convolve_no_maxpool_padding_valid_layer2_avx2( constfloat **input, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, constint cstep, constint channel_step) {
__m256i weight_mask[2];
__m256i shuffle_output_mask;
load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
constint kInHeight = 8; constint kFilterHeight = 2; constint kSkipHeight = 2; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
__m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); // out_accum registers are used to store the 2x2 convolve outputs // (calculated over input block size), which are accumulated across the // in_channels. As per the design, each iteration of for loop processes 8 // (4 horizontal x 2 vertical) 2x2 blocks and stores in corresponding // out_accum register (as input size is 8x8, a total of 16 2x2 blocks are // present and 2 out_accum registers are enough to store the outputs). // Hence for loops corresponding to 'j' and 'h', below, run over the number // of out_accum registers.
__m256 out_accum[2];
// Height needs to be moved to go to next iteration of processing // while processing 2 2x2 blocks vertically. constint kSkipHeightForNextIter = kSkipHeight * 2; for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg; for (int k = 0; k < layer_config->in_channels; ++k) {
__m256 shuffle_weight[2]; int off = k * layer_config->out_channels + i; // In layer 2, the convolution process happens at 2x2. // The weights needed for 2x2 block are same across the in-channels, // which is why the load of weights happens once for each in-channel.
prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
shuffle_weight, weight_mask);
for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
h += kSkipHeightForNextIter, ++u) { constfloat *input_ptr = &input[k][h * in_stride];
perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride,
shuffle_weight, &out_accum[u],
shuffle_output_mask);
}
} // Store output of layer 2. for (int j = 0; j < 2; ++j) {
_mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
}
}
}
// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when // filter_width and filter_height are equal to 2. // As per the layer config set by av1_intra_mode_cnn_partition_cnn_config, // the filter_width and filter_height are equal to 2 for layer >= 1. So // convolution happens at 2x2 for layer >= 1. staticvoid cnn_convolve_no_maxpool_padding_valid_2x2_avx2( constfloat **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, constint cstep, constint channel_step) {
assert(layer_config->filter_width == 2 && layer_config->filter_height == 2);
assert(layer_config->skip_width == 2 && layer_config->skip_height == 2);
if (in_width == 16 && in_height == 16) { // This case of in_width and in_height equal to 16 corresponds to layer 1. // The output size of this layer is 8x8.
cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
input, in_stride, layer_config, output, out_stride, start_idx, cstep,
channel_step);
} elseif (in_width == 8 && in_height == 8) { // This case of in_width and in_height equal to 8 corresponds to layer 2. // The output size of this layer is 4x4.
cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
input, in_stride, layer_config, output, out_stride, start_idx, cstep,
channel_step);
} else { // For layer equal to 3 and 4, the input is of size 4x4 and 2x2 // respectively. Implementing SIMD for these cases might not be optimal, // which is why we call C path for layer >= 3.
av1_cnn_convolve_no_maxpool_padding_valid_c(
input, in_width, in_height, in_stride, layer_config, output, out_stride,
start_idx, cstep, channel_step);
}
}
// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). // As per the current encoder, av1_cnn_convolve function gets called for // block size equal to 64x64. av1_cnn_convolve() uses layer config values // set by av1_intra_mode_cnn_partition_cnn_config. The following are a few // details related to each layer's config parameters. // Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht // 0 64x64 16x16 5 5 4 4 // 1 16x16 8x8 2 2 2 2 // 2 8x8 4x4 2 2 2 2 // 3 4x4 2x2 2 2 2 2 // 4 2x2 1x1 2 2 2 2 // Here, // filter_wd = filter_width and filter_ht = filter_height, // skip_wd = skip_width and skip_ht = skip_height. void av1_cnn_convolve_no_maxpool_padding_valid_avx2( constfloat **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step) { if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
layer_config->skip_width == 4 && layer_config->skip_height == 4) {
cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
input, in_width, in_height, in_stride, layer_config, output, out_stride,
start_idx, cstep, channel_step);
} elseif (layer_config->filter_width == 2 &&
layer_config->filter_height == 2 &&
layer_config->skip_width == 2 && layer_config->skip_height == 2) {
cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
input, in_width, in_height, in_stride, layer_config, output, out_stride,
start_idx, cstep, channel_step);
} else {
av1_cnn_convolve_no_maxpool_padding_valid_c(
input, in_width, in_height, in_stride, layer_config, output, out_stride,
start_idx, cstep, channel_step);
}
}
Messung V0.5
¤ Dauer der Verarbeitung: 0.11 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.