/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
// In order to avoid the high-latency of swapping between FPU and SIMD // operations, we keep the result in a 128-bit register even though we only // care about a single value. staticvoid nn_propagate_8to1(constfloat *const inputs, constfloat *const weights,
__m128 *const output) { const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); const __m128 inputs_l = _mm_loadu_ps(inputs);
// Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. void av1_nn_predict_sse3(constfloat *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output) { float buf[2][NN_MAX_NODES_PER_LAYER]; int buf_index = 0; int num_inputs = nn_config->num_inputs;
if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { for (int out = 0; out < num_outputs; out += 8) {
__m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
__m128 out_l = _mm_loadu_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 4) {
av1_nn_propagate_4to8_sse3(&input_nodes[in],
&layer_weights[out * num_inputs + in],
&out_h, &out_l, num_inputs);
} if (!output_layer) nn_activate8(&out_h, &out_l);
_mm_storeu_ps(&output_nodes[out + 4], out_h);
_mm_storeu_ps(&output_nodes[out], out_l);
}
} elseif (num_inputs % 8 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) {
__m128 outputs = _mm_loadu_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 8) {
nn_propagate_8to4(&input_nodes[in],
&layer_weights[out * num_inputs + in], &outputs,
num_inputs);
} if (!output_layer) nn_activate4(&outputs);
_mm_storeu_ps(&output_nodes[out], outputs);
}
} elseif (num_inputs % 4 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) {
__m128 outputs = _mm_loadu_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 4) {
av1_nn_propagate_4to4_sse3(&input_nodes[in],
&layer_weights[out * num_inputs + in],
&outputs, num_inputs);
} if (!output_layer) nn_activate4(&outputs);
_mm_storeu_ps(&output_nodes[out], outputs);
}
} elseif (num_inputs % 8 == 0) { for (int out = 0; out < num_outputs; out++) {
__m128 total = _mm_load1_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 8) {
nn_propagate_8to1(&input_nodes[in],
&layer_weights[out * num_inputs + in], &total);
} if (!output_layer) nn_activate4(&total);
output_nodes[out] = _mm_cvtss_f32(total);
}
} elseif (num_inputs % 4 == 0) { for (int out = 0; out < num_outputs; out++) {
__m128 total = _mm_load1_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 4) {
av1_nn_propagate_4to1_sse3(
&input_nodes[in], &layer_weights[out * num_inputs + in], &total);
} if (!output_layer) nn_activate4(&total);
output_nodes[out] = _mm_cvtss_f32(total);
}
} else { // Use SSE instructions for scalar operations to avoid the latency of // swapping between SIMD and FPU modes. for (int out = 0; out < num_outputs; out++) {
__m128 total = _mm_load1_ps(&layer_bias[out]); for (int in_node = 0; in_node < num_inputs; in_node++) {
__m128 input = _mm_load1_ps(&input_nodes[in_node]);
__m128 weight =
_mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
total = _mm_add_ps(total, _mm_mul_ps(input, weight));
} if (!output_layer) nn_activate4(&total);
output_nodes[out] = _mm_cvtss_f32(total);
}
}
input_nodes = output_nodes;
num_inputs = num_outputs;
buf_index = 1 - buf_index;
} if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
}
// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential // Function. Neural Computation, 11(4):853–862, 1999. staticinline __m128 approx_exp(__m128 y) { #define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \
127 // Offset for the exponent according to IEEE floating point standard. #define C 60801 // Magic number controls the accuracy of approximation const __m128 multiplier = _mm_set1_ps(A); const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C);
y = _mm_mul_ps(y, multiplier);
y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset)); return y; #undef A #undef B #undef C
}
/*
 * NOTE(review): the German text below is a website disclaimer that was
 * accidentally appended during extraction; it is not part of this source
 * file. It is preserved inside a comment so the file remains valid C.
 * Translation: "The information on this website was carefully compiled to
 * the best of our knowledge. However, neither completeness, correctness,
 * nor quality of the provided information is guaranteed. Remark: the
 * colored syntax display and the measurement are still experimental."
 *
 * Die Informationen auf dieser Webseite wurden nach bestem Wissen
 * sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch
 * Richtigkeit, noch Qualität der bereit gestellten Informationen
 * zugesichert.
 * Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell.
 */