// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
// Constructor tail: validates the rank, computes output dimensions and
// input/output strides, and classifies the broadcast into the fast paths
// (isCopy / oneByN / nByOne) used by the packet accessors.
//
// The broadcasting op doesn't change the rank of the tensor. One can't
// broadcast a scalar and store the result in a scalar. Instead one should
// reshape the scalar into an N-D tensor with N >= 1 of 1 element first and
// then broadcast.
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
const InputDimensions& input_dims = m_impl.dimensions();

// `isCopy` stays true only when every broadcast factor is 1, i.e. the op
// degenerates to an identity copy.
isCopy = true;
for (int i = 0; i < NumDims; ++i) {
  eigen_assert(input_dims[i] > 0);
  m_dimensions[i] = input_dims[i] * m_broadcast[i];
  if (m_broadcast[i] != 1) {
    isCopy = false;
  }
}

// Strides follow the storage order: innermost dimension has stride 1.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
  m_inputStrides[0] = 1;
  m_outputStrides[0] = 1;
  for (int i = 1; i < NumDims; ++i) {
    m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
    m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
  }
} else {
  m_inputStrides[NumDims-1] = 1;
  m_outputStrides[NumDims-1] = 1;
  for (int i = NumDims-2; i >= 0; --i) {
    m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
    m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
  }
}

// oneByN: only the first dimension is broadcast (input shape [1, N...]).
if (input_dims[0] == 1) {
  oneByN = true;
  for (int i = 1; i < NumDims; ++i) {
    if (m_broadcast[i] != 1) {
      oneByN = false;
      break;
    }
  }
// BUGFIX: was the single token `elseif`, which is not valid C++ and cannot
// compile; C++ spells this as two keywords, `else if`.
} else if (input_dims[NumDims-1] == 1) {
  // nByOne: only the last dimension is broadcast (input shape [N..., 1]).
  nByOne = true;
  for (int i = 0; i < NumDims-1; ++i) {
    if (m_broadcast[i] != 1) {
      nByOne = false;
      break;
    }
  }
}

// Handle special format like NCHW, its input shape is '[1, N..., 1]' and
// broadcast shape is '[N, 1..., N]'
if (!oneByN && !nByOne) {
  if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
    nByOne = true;
    oneByN = true;
    for (int i = 1; i < NumDims-1; ++i) {
      if (m_broadcast[i] != 1) {
        nByOne = false;
        oneByN = false;
        break;
      }
    }
  }
}
}
// Fragment of a packet accessor (enclosing signature not visible here).
// Loads PacketSize coefficients for a broadcast where one input coefficient
// is repeated m_outputStrides[dim] times in the output.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
Index dim, inputIndex, outputOffset;
// `dim` selects the first non-innermost dimension in storage order.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
dim = 1;
} else {
dim = NumDims - 2;
}
// Split the linear output index into which input coefficient we are on
// (inputIndex) and the offset inside its repeated run (outputOffset).
inputIndex = index / m_outputStrides[dim];
outputOffset = index % m_outputStrides[dim]; if (outputOffset + PacketSize <= m_outputStrides[dim]) {
// Whole packet falls inside one repeated run: splat a single coefficient.
values[0] = m_impl.coeff(inputIndex); return internal::pload1<PacketReturnType>(values);
} else {
// Packet straddles run boundaries: walk coefficient by coefficient,
// advancing inputIndex each time a run of length m_outputStrides[dim] ends.
EIGEN_UNROLL_LOOP for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[dim]) {
values[i] = m_impl.coeff(inputIndex);
} else {
values[i] = m_impl.coeff(++inputIndex);
outputOffset = 0;
cur = 0;
}
} return internal::pload<PacketReturnType>(values);
}
}
// Ignore the LoadMode and always use unaligned loads since we can't guarantee // the alignment at compile time. template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// Remember the full index: the slow per-coefficient fallback below needs it.
const Index originalIndex = index;
Index inputIndex = 0;
// Walk dimensions from outermost to innermost (col-major), peeling off one
// output coordinate `idx` per dimension and mapping it to an input offset.
// index_statically_eq lets dimensions known to be 1 at compile time skip the
// modulo entirely.
EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq<Broadcast>(i, 1)) {
eigen_assert(idx < m_impl.dimensions()[i]);
inputIndex += idx * m_inputStrides[i];
} else { if (internal::index_statically_eq<InputDimensions>(i, 1)) {
eigen_assert(idx % m_impl.dimensions()[i] == 0);
} else {
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
}
}
index -= idx * m_outputStrides[i];
}
// Innermost (dimension 0) coordinate: same three-way split, but the result
// is kept separately because it decides whether a vectorized load is legal.
Index innermostLoc; if (internal::index_statically_eq<Broadcast>(0, 1)) {
eigen_assert(index < m_impl.dimensions()[0]);
innermostLoc = index;
} else { if (internal::index_statically_eq<InputDimensions>(0, 1)) {
eigen_assert(index % m_impl.dimensions()[0] == 0);
innermostLoc = 0;
} else {
innermostLoc = index % m_impl.dimensions()[0];
}
}
inputIndex += innermostLoc;
// Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { return m_impl.template packet<Unaligned>(inputIndex);
} else {
// Packet would run off the end of the innermost input dimension: gather
// coefficient by coefficient, falling back to the scalar path once we
// cross the wrap-around point.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[0]) {
values[i] = m_impl.coeff(inputIndex+i);
} else {
values[i] = coeffColMajor(originalIndex+i);
}
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt;
}
}
// Row-major counterpart of packetColMajor (the signature was lost at the
// fragment boundary above). Mirrors the col-major logic with the dimension
// walk reversed: outermost-to-innermost is i = 0 .. NumDims-2, and the
// innermost dimension is NumDims-1.
Index inputIndex = 0;
EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq<Broadcast>(i, 1)) {
eigen_assert(idx < m_impl.dimensions()[i]);
inputIndex += idx * m_inputStrides[i];
} else { if (internal::index_statically_eq<InputDimensions>(i, 1)) {
eigen_assert(idx % m_impl.dimensions()[i] == 0);
} else {
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
}
}
index -= idx * m_outputStrides[i];
}
// Innermost (last) dimension coordinate; decides vectorized-load legality.
Index innermostLoc; if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
eigen_assert(index < m_impl.dimensions()[NumDims-1]);
innermostLoc = index;
} else { if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
innermostLoc = 0;
} else {
innermostLoc = index % m_impl.dimensions()[NumDims-1];
}
}
inputIndex += innermostLoc;
// Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { return m_impl.template packet<Unaligned>(inputIndex);
} else {
// Slow path: the packet crosses the innermost input boundary; gather
// scalars, delegating wrapped elements to coeffRowMajor.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
values[i] = m_impl.coeff(inputIndex+i);
} else {
values[i] = coeffRowMajor(originalIndex+i);
}
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
  // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
  // tensors. But this might need further tuning.
  const size_t target_size = m_device.firstLevelCacheSize();
  // Combine what the wrapped expression needs with our own skewed,
  // L1-cache-sized block preference.
  const internal::TensorBlockResourceRequirements own_requirements =
      internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size);
  return internal::TensorBlockResourceRequirements::merge(
      m_impl.getResourceRequirements(), own_requirements);
}
// Prepare storage for the materialized broadcasting result. consttypename TensorBlock::Storage block_storage =
TensorBlock::prepareStorage(desc, scratch);
ScalarNoConst* materialized_output = block_storage.data();
// We potentially will need to materialize input blocks.
size_t materialized_input_size = 0;
ScalarNoConst* materialized_input = NULL;
// Initialize block broadcating iterator state for outer dimensions (outer // with regard to bcast dimension). Dimension in this array are always in // inner_most -> outer_most order (col major layout).
array<BlockBroadcastingIteratorState, NumDims> it; int idx = 0;
for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { const Index dim = IsColMajor ? i : NumDims - 1 - i;
it[idx].size = params.output_dims[dim];
it[idx].count = 0;
it[idx].output_stride = m_outputStrides[dim];
it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
// Write output into the beginning of `materialized_output`.
Index output_offset = 0;
// We will fill output block by broadcasting along the bcast dim, and // iterating over outer dimension. const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
Broadcast functor() const { return m_broadcast; } #ifdef EIGEN_USE_SYCL // binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
cl::sycl::handler& cgh) const {
m_impl.bind(cgh);
} #endif private: staticconstbool IsColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);
// We will build a general case block broadcasting on top of broadcasting // primitive that will do broadcasting only for the inner dimension(s) along // the first dimension smaller than the input size (it's called `bcast_dim`). // // Example: // dim: 0 1 2 (ColMajor) // input size: [9, 3, 6] // block size: [9, 2, 6] // // We will compute broadcasted block by iterating over the outer dimensions // before `bcast_dim` (only dimension `2` in this example) and computing // broadcasts along the `bcast_dim` (dimension `1` in this example).
// BlockBroadcastingParams holds precomputed parameters for broadcasting a // single block along the broadcasting dimension. Sizes and strides along the // `bcast_dim` might be invalid, they will be adjusted later in // `BroadcastBlockAlongBcastDim`. struct BlockBroadcastingParams {
Dimensions input_dims; // input expression dimensions
Dimensions output_dims; // output block sizes
Dimensions output_strides; // output block strides
int inner_dim_count; // count inner dimensions matching in size int bcast_dim; // broadcasting dimension index
Index bcast_dim_size; // broadcasting dimension size
Index inner_dim_size; // inner dimensions size
// Block sizes and strides for the input block where all dimensions before // `bcast_dim` are equal to `1`.
Dimensions input_block_sizes;
Dimensions input_block_strides;
// Block sizes and strides for blocks with extra dimensions and strides `0`.
BroadcastDimensions bcast_block_sizes;
BroadcastDimensions bcast_block_strides;
BroadcastDimensions bcast_input_strides;
};
// Per-dimension iteration state for walking the outer (non-broadcast)
// dimensions of the output block.
struct BlockBroadcastingIteratorState {
Index size;          // extent of this dimension in the output block
Index count;         // current position within this dimension
Index output_stride; // linear output stride of this dimension
Index output_span;   // output_stride * (size - 1): offset to undo on wrap
};
// Fragment of the params-building routine: records the broadcast dimension,
// then derives the input block sizes/strides and the 0-stride doubled-rank
// broadcast descriptor.

// First non-matching dimension is the broadcasting dimension.
eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
params.bcast_dim = dim;
params.bcast_dim_size = params.output_dims[dim];
break;
}

// Calculate the input block size for looking into the input.
// BUGFIX: `constint` below was a garbled token fusion; C++ requires the two
// tokens `const int`.
for (int i = 0; i < params.inner_dim_count; ++i) {
  const int dim = IsColMajor ? i : NumDims - i - 1;
  params.input_block_sizes[dim] = params.input_dims[dim];
}
for (int i = params.inner_dim_count; i < NumDims; ++i) {
  const int dim = IsColMajor ? i : NumDims - i - 1;
  params.input_block_sizes[dim] = 1;
}
params.input_block_strides =
    internal::strides<Layout>(params.input_block_sizes);

// Broadcast with the 0-stride trick: Create 1 extra dim for each
// broadcast, set the input stride to 0.
//
// When ColMajor:
//
// - bcast_block_sizes:
//   [d_0, b_0, d_1, b_1, ...]
//
// - bcast_block_strides:
//   [output_block_strides[0], output_block_strides[0] * d_0,
//    output_block_strides[1], output_block_strides[1] * d_1,
//   ...]
//
// - bcast_input_strides:
//   [input_block_strides[0], 0,
//    input_block_strides[1], 0,
//   ...].
//
for (int i = 0; i < params.inner_dim_count; ++i) {
  const int dim = IsColMajor ? i : NumDims - i - 1;
} else {
  // Keep track of the total number of the coefficients written to the
  // output block.
  Index num_output_coeffs = 0;

  // The general case. Let's denote the output block as
  //
  //   x[..., a:a+bcast_dim_size, :, ..., :]
  //
  // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
  // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
  // sub-blocks:
  //
  // (1) a:b, where b is the smallest multiple of
  //     input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
  //
  // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
  //     in [a, a+bcast_dim_size].
  //
  // (3) c:a+bcast_dim_size .
  //
  // Or, when b and c do not exist, we just need to process the whole block
  // together.

  // Find a.
  const Index bcast_dim_left_index =
      bcast_offset / m_outputStrides[params.bcast_dim];

  // Find b and c.
  const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];

  // First multiple after a. This is b when <= bcast_dim_left_index +
  // bcast_dim_size.
  const Index first_multiple =
      divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
      input_bcast_dim_size;

  if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
    // b exists, so does c. Find it.
    const Index last_multiple =
        (bcast_dim_left_index + params.bcast_dim_size) /
        input_bcast_dim_size * input_bcast_dim_size;
    // BUGFIX: `constint` below was a garbled token fusion; C++ requires the
    // two tokens `const int`.
    const int copy_bcast_dim =
        IsColMajor ? 2 * params.inner_dim_count
                   : 2 * NumDims - 2 * params.inner_dim_count - 1;
    const int broadcast_bcast_dim =
        IsColMajor ? 2 * params.inner_dim_count + 1
                   : 2 * NumDims - 2 * params.inner_dim_count - 2;

// ---------------------------------------------------------------------- //
// Materialize input block into a temporary memory buffer only if it's not
// already available in the arg block.
const ScalarNoConst* input_buffer = NULL;

if (input_block.data() != NULL) {
  // Input block already has raw data, there is no need to materialize it.
  input_buffer = input_block.data();
} else {
  // Otherwise we have to do block assignment into a temporary buffer.

  // Maybe reuse previously allocated buffer, or allocate a new one with a
  // scratch allocator.
  const size_t input_total_size = input_block_sizes.TotalSize();
  if (*materialized_input == NULL ||
      *materialized_input_size < input_total_size) {
    *materialized_input_size = input_total_size;
    void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
    *materialized_input = static_cast<ScalarNoConst*>(mem);
  }

// ---------------------------------------------------------------------- //
// Copy data from materialized input block to the materialized output, using
// given broadcast strides (strides with zeroes).
typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
    TensorBlockIO;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.