// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Traits specialization for the tensor contraction expression.
template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs
  // are different. gebp_traits picks the result scalar of multiplying an
  // LhsScalar by an RhsScalar.
  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
// Hand out pointers into the contiguous byte buffer `mem`, one packed Lhs
// block per (slice, m) and one packed Rhs block per (slice, n). The byte
// sizes in `sz` are alignment-rounded, so consecutive blocks stay aligned.
for (Index x = 0; x < num_slices; x++) {
  if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
  for (Index m = 0; m < num_lhs; m++) {
    lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
    mem += sz.lhs_size;
  }
  if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
  for (Index n = 0; n < num_rhs; n++) {
    rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
    mem += sz.rhs_size;
  }
}
private:
  // Byte sizes of a single packed Lhs / Rhs block, already rounded up to the
  // platform alignment (see ComputeLhsRhsBlockSizes below).
  struct BlockSizes {
    Index lhs_size;
    Index rhs_size;
  };
// Computes the storage needed for one packed Lhs block (bm x bk LhsScalar
// values) and one packed Rhs block (bk x bn RhsScalar values). Each size is
// rounded up to a multiple of the platform alignment so that consecutively
// laid out blocks all remain aligned.
EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, const Index bk, const Index bn) {
  // EIGEN_MAX_ALIGN_BYTES is clamped to at least 1 so the round-up below is
  // well defined (a value of 1 makes it a no-op).
  const Index alignment = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  BlockSizes sizes;
  sizes.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), alignment) * alignment;
  sizes.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), alignment) * alignment;
  return sizes;
}
};
// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
// ColMajor storage order. This property is guaranteed by the
// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
// multiplication for these blocks. Default tensor contraction uses
// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
// GeneralBlocPanelKernel.h for details).
//
// By specializing contraction kernels we can use other low level libraries to
// perform matrix multiplication, and still rely on Eigen contraction evaluator.
// This also includes full support in TensorContractionThreadPool, assuming that
// underlying gemm do not use it's own threading.
//
// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
//   multiplication, lhs tensor and rhs tensor respectively.
//
// - StorageIndex - index type for the tensor expressions. In practice almost
//   always is Eigen::Index.
//
// - OutputMapper provides access to the memory of the output matrix. In
//   practice it's always column major blas_data_mapper (it must be of ResScalar
//   type).
//
// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
//   view into the Lhs/Rhs tensor expressions. In practice it's
//   TensorContractionInputMapper, or some specialization of it based on the
//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
//   mapper).
template <typename ResScalar, typename LhsScalar, typename RhsScalar,
          typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel {
  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
  // (otherwise beta should be always equal to 1).
  enum { HasBeta = false };

  // NOTE(review): the constructor and the packing/invoke member functions of
  // this kernel are not visible in this chunk; only the cached dimensions
  // below appear here. Verify against the full file.

 private:
  // These are dimensions of the original Tensors, and selected block sizes.
  // The actual block sizes passed to all function above might be smaller
  // because of the partial blocks at the end.
  const StorageIndex m;
  const StorageIndex k;
  const StorageIndex n;
  const StorageIndex bm;
  const StorageIndex bk;
  const StorageIndex bn;
};
} // end namespace internal
// Tensor contraction params that should enable to get from output matrix
// 2-dimensional coordinates to the output tensor dimensions.
struct TensorContractionParams {
  // TensorContraction evaluator assumes that both tensors are in ColMajor
  // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
  bool swapped_arguments;
};
// Output kernel allows to fuse operations into the tensor contraction. // // Examples: // 1. Elementwise Relu transformation following Conv2D. // 2. AddBias to the Conv2D output channels dimension. // // The NoOpOutputKernel implements an output kernel that does absolutely nothing. struct NoOpOutputKernel { /** * Tensor contraction evaluator calls this kernel after finishing each block * of output matrix. Output blocks belong to the 2-dimensional output tensor. * * TensorContractionParams contains contraction dimensions information * required to map output 2-d space into the expected output tensor space * (potentially higher dimensional). * * \param[in] output_mapper Access to output tensor memory * \param[in] params Tensor contraction parameters * \param[in] i Index of a first row available through output_mapper * \param[in] j Index of a first column available through output_mapper * \param[in] num_rows Number of available rows * \param[in] num_cols Number of available columns
*/ template <typename Index, typename Scalar>
EIGEN_ALWAYS_INLINE voidoperator()( const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, const TensorContractionParams& params, Index i,
Index j, Index num_rows, Index num_cols) const {
EIGEN_UNUSED_VARIABLE(output_mapper);
EIGEN_UNUSED_VARIABLE(params);
EIGEN_UNUSED_VARIABLE(i);
EIGEN_UNUSED_VARIABLE(j);
EIGEN_UNUSED_VARIABLE(num_rows);
EIGEN_UNUSED_VARIABLE(num_cols);
}
};
// Most of the code is assuming that both input tensors are ColMajor. If the // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: // If we want to compute A * B = C, where A is LHS and B is RHS, the code // will pretend B is LHS and A is RHS. typedeftypename internal::conditional< static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedeftypename internal::conditional< static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
// Normalize the input dimensions and the contraction index pairs to the
// ColMajor convention used by the rest of the evaluator.
DSizes<Index, LDims> eval_left_dims;
DSizes<Index, RDims> eval_right_dims;
array<IndexPair<Index>, ContractDims> eval_op_indices;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
  // For ColMajor, we keep using the existing dimensions
  for (int i = 0; i < LDims; i++) {
    eval_left_dims[i] = m_leftImpl.dimensions()[i];
  }
  for (int i = 0; i < RDims; i++) {
    eval_right_dims[i] = m_rightImpl.dimensions()[i];
  }
  // We keep the pairs of contracting indices.
  for (int i = 0; i < ContractDims; i++) {
    eval_op_indices[i].first = op.indices()[i].first;
    eval_op_indices[i].second = op.indices()[i].second;
  }
} else {
  // For RowMajor, we need to reverse the existing dimensions
  for (int i = 0; i < LDims; i++) {
    eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
  }
  for (int i = 0; i < RDims; i++) {
    eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
  }
  // We need to flip all the pairs of contracting indices as well as
  // reversing the dimensions. Note that first/second are also exchanged,
  // matching the lhs/rhs swap performed for RowMajor inputs.
  for (int i = 0; i < ContractDims; i++) {
    eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
    eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
  }
}
// Check for duplicate axes and make sure the first index in eval_op_indices // is increasing. Using O(n^2) sorting is OK since ContractDims is small for (int i = 0; i < ContractDims; i++) { for (int j = i + 1; j < ContractDims; j++) {
eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
eval_op_indices[j].second != eval_op_indices[i].second && "contraction axes should be unique"); if (eval_op_indices[j].first < eval_op_indices[i].first) {
numext::swap(eval_op_indices[j], eval_op_indices[i]);
}
}
}
// ColMajor strides of the lhs tensor: dimension 0 has stride 1, each later
// stride is the cumulative product of the preceding dimension sizes.
array<Index, LDims> lhs_strides;
lhs_strides[0] = 1;
for (int i = 0; i < LDims-1; ++i) {
  lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
}

// Same construction for the rhs tensor.
array<Index, RDims> rhs_strides;
rhs_strides[0] = 1;
for (int i = 0; i < RDims-1; ++i) {
  rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
}
// Initialize the strides/sizes of the logical output matrix view: i indexes
// the non-contracting lhs dimensions, j the non-contracting rhs dimensions,
// k the contracting dimensions. Guard against zero-sized stride arrays.
if (m_i_strides.size() > 0) m_i_strides[0] = 1;
if (m_j_strides.size() > 0) m_j_strides[0] = 1;
if (m_k_strides.size() > 0) m_k_strides[0] = 1;

m_i_size = 1;
m_j_size = 1;
m_k_size = 1;

// To compute the dimension, we simply concatenate the non-contracting
// dimensions of the left and then the right tensor. Additionally, we also
// compute the strides corresponding to the left non-contracting
// dimensions and right non-contracting dimensions.
m_lhs_inner_dim_contiguous = true;
int dim_idx = 0;
Index nocontract_idx = 0;

for (int i = 0; i < LDims; i++) {
  // find if we are contracting on index i of left tensor
  bool contracting = false;
  for (int j = 0; j < ContractDims; j++) {
    if (eval_op_indices[j].first == i) {
      contracting = true;
      break;
    }
  }
  if (!contracting) {
    // add dimension size to output dimensions
    m_dimensions[dim_idx] = eval_left_dims[i];
    m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
    // A contracted dimension before this one shifts dim_idx away from i,
    // which means the lhs inner dimension is no longer contiguous.
    if (dim_idx != i) {
      m_lhs_inner_dim_contiguous = false;
    }
    if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
      m_i_strides[nocontract_idx+1] =
          m_i_strides[nocontract_idx] * eval_left_dims[i];
    } else {
      m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
    }
    dim_idx++;
    nocontract_idx++;
  }
}
// Same pass over the rhs tensor: append its non-contracting dimensions to
// the output and accumulate the j strides/size.
nocontract_idx = 0;
for (int i = 0; i < RDims; i++) {
  bool contracting = false;
  // find if we are contracting on index i of right tensor
  for (int j = 0; j < ContractDims; j++) {
    if (eval_op_indices[j].second == i) {
      contracting = true;
      break;
    }
  }
  if (!contracting) {
    m_dimensions[dim_idx] = eval_right_dims[i];
    if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
      m_j_strides[nocontract_idx+1] =
          m_j_strides[nocontract_idx] * eval_right_dims[i];
    } else {
      m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
    }
    m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
    dim_idx++;
    nocontract_idx++;
  }
}
// Now compute the strides corresponding to the contracting dimensions. We // assumed above that non-contracting axes are represented in the same order // in the matrix as they are in the tensor. This is not the case for // contracting axes. As the contracting axes must be of the same size in // each tensor, we'll only look at the first tensor here.
m_rhs_inner_dim_contiguous = true;
m_rhs_inner_dim_reordered = false; for (int i = 0; i < ContractDims; i++) {
Index left = eval_op_indices[i].first;
Index right = eval_op_indices[i].second;
Index size = eval_left_dims[left];
eigen_assert(size == eval_right_dims[right] && "Contraction axes must be same size");
if (i > 0 && right < eval_op_indices[i-1].second) {
m_rhs_inner_dim_reordered = true;
} if (right != i) {
m_rhs_inner_dim_contiguous = false;
}
}
// If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
numext::swap(m_dimensions[i], m_dimensions[j]);
}
}
// A set of parameters that will allow output kernel to get from output // tensor dimensions (i, j) into the original tensor dimensions. // TODO(ezhulenev): Add parameters required to infer output tensor index for // more complex contractions than 2x2 on internal dimension.
m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> #if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC #endif void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size;
this->template evalGemmPartial<lhs_inner_dim_contiguous,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered,
Alignment, true>(buffer, 0, k, 1);
}
// Runs the contraction kernel over the inner-dimension slice
// [k_start, k_end) without invoking the user-provided output kernel
// (callers presumably apply it after combining partial results -- note the
// use_output_kernel=false template argument).
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
          bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
    Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
  this->template evalGemmPartial<lhs_inner_dim_contiguous,
                                 rhs_inner_dim_contiguous,
                                 rhs_inner_dim_reordered, Alignment,
                                 /*use_output_kernel*/ false>(
      buffer, k_start, k_end, num_threads);
}
// Core blocked GEMM loop (Goto-style blocking) over the inner-dimension
// slice [k_start, k_end). Packs panels of Lhs and Rhs, invokes the
// contraction kernel per block, and optionally applies the output kernel
// once each [i2, j2] output block is complete.
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
          bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start,
                                       Index k_end, int num_threads) const {
  eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
  // columns in slice on left side, rows on right side
  const Index k_slice = k_end - k_start;
  // rows in left side
  const Index m = this->m_i_size;
  // columns in right side
  const Index n = this->m_j_size;

  // define data mappers for Lhs and Rhs
  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;

  // Sizes of the blocks to load in cache. See the Goto paper for details.
  internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
                                      Index, internal::ShardByCol>
      blocking(k_slice, m, n, num_threads);
  const Index kc = blocking.kc();
  const Index mc = numext::mini(m, blocking.mc());
  const Index nc = numext::mini(n, blocking.nc());

  // NOTE(review): the declarations of the data mappers (lhs/rhs/output), the
  // packed buffers (blockA/blockB) and the contraction `kernel` appear to be
  // elided from this chunk -- verify against the full file.

  // If a contraction kernel does not support beta, explicitly initialize
  // output buffer with zeroes.
  if (!TensorContractionKernel::HasBeta) {
    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
  }

  for (Index i2 = 0; i2 < m; i2 += mc) {
    const Index actual_mc = numext::mini(i2 + mc, m) - i2;
    for (Index k2 = k_start; k2 < k_end; k2 += kc) {
      // make sure we don't overshoot right edge of left matrix, then pack
      // vertical panel
      const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
      kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);

      // If kernel supports beta, there is no need to initialize output
      // buffer with zeroes.
      const Scalar alpha = Scalar(1);
      const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
                              ? Scalar(0)
                              : Scalar(1);

      // series of horizontal blocks
      for (Index j2 = 0; j2 < n; j2 += nc) {
        // make sure we don't overshoot right edge of right matrix, then pack
        // block
        const Index actual_nc = numext::mini(j2 + nc, n) - j2;
        kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
                       actual_nc);

        // call gebp (matrix kernel)
        // The parameters here are copied from Eigen's GEMM implementation
        const OutputMapper output_mapper = output.getSubMapper(i2, j2);
        kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
                      actual_nc, alpha, beta);

        // We are done with this [i2, j2] output block.
        if (use_output_kernel && k2 + kc >= k_end) {
          m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
                          actual_mc, actual_nc);
        }
      }
    }
  }
// Most of the code is assuming that both input tensors are ColMajor. If the // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: // If we want to compute A * B = C, where A is LHS and B is RHS, the code // will pretend B is LHS and A is RHS. typedeftypename internal::conditional< static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedeftypename internal::conditional< static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
// NOTE(review): the following trailing text is a German-language website
// disclaimer accidentally appended to this source chunk; it is not part of
// Eigen and should be removed. Translation: "The information on this web
// page was compiled carefully and to the best of our knowledge. However,
// neither completeness, nor correctness, nor quality of the provided
// information is guaranteed. Remark: the colored syntax highlighting and
// the measurement are still experimental."