// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialization of the reduction operation. A two-phase
 *  reduction approach is used since the GPU does not have global
 *  synchronization for global memory among different work-groups/thread
 *  blocks. To solve the problem, we need to create two kernels to reduce
 *  the data: in the first kernel each work-group/thread-block reduces its
 *  data locally and saves the partial result into global memory. In the
 *  second phase (global reduction) a single work-group/thread-block
 *  reduces the intermediate data into one single element.
 *  Here is an NVIDIA presentation explaining the optimized two-phase
 *  reduction algorithm on GPU:
 *  https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/
// SYCL kernel functor for the work-group tree reduction step: each work-item
// reads one element, then the work-group halves the active range each
// iteration, combining pairs through `op.reduce` in local (scratch) memory.
//
// NOTE(review): this chunk looks extraction-garbled — it is missing a space in
// `voidoperator`, mixes two naming families (`aInPtr`/`outAcc` from the
// second-step full reducer vs `scratchPtr`/`pLocalThreadId`/`rLocalThreadId`
// from the partial reducer), several statements sit behind `//` on collapsed
// lines, and the function's closing brace is cut off. Verify against the
// upstream Eigen TensorReductionSycl.h before relying on it.
voidoperator()(cl::sycl::nd_item<1> itemID) { // Our empirical research shows that the best performance will be achieved // when there is only one element per thread to reduce in the second step. // in this step the second step reduction time is almost negligible. // Hence, in the second step of reduction the input size is fixed to the // local size, thus, there is only one element read per thread. The // algorithm must be changed if the number of reduce per thread in the // second step is greater than 1. Otherwise, the result will be wrong. const Index localid = itemID.get_local_id(0); auto aInPtr = aI.get_pointer() + localid; auto aOutPtr = outAcc.get_pointer();
// NOTE(review): `scratchptr` (lowercase) is declared here but `scratchPtr`
// (camelCase) is what is used below — presumably the same local-memory
// accessor pointer; confirm against upstream.
CoeffReturnType *scratchptr = scratch.get_pointer();
// Seed the accumulator with this work-item's single input element.
CoeffReturnType accumulator = *aInPtr;
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */ auto out_scratch_ptr =
// Address into the padded local-memory tile: `BC` appears to be a padding
// term added to the P-dimension stride (commonly done to avoid bank
// conflicts) — TODO confirm from the PannelParameters definition.
scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
// Barrier: all work-items must finish writing local memory before any reads.
itemID.barrier(cl::sycl::access::fence_space::local_space); if (rt == reduction_dim::inner_most) {
accumulator = *out_scratch_ptr;
} // The Local LocalThreadSizeR is always power of 2
// Tree reduction over the R dimension: the stride halves each iteration, and
// only work-items below `offset` combine their element with the one `offset`
// rows away. Requires LocalThreadSizeR to be a power of two (asserted at the
// call site).
EIGEN_UNROLL_LOOP for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { if (rLocalThreadId < offset) {
op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); // The result has already been divided for mean reducer in the // previous reduction so no need to divide furthermore
// Publish this work-item's running result so the partner thread in the next
// iteration can read it.
*out_scratch_ptr = op.finalize(accumulator);
} /* All threads collectively read from global memory into local. * The barrier ensures all threads' IO is resolved before * execution continues (strictly speaking, all threads within * a single work-group - there is no co-ordination between
* work-groups, only work-items). */
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
// Host-side launch-configuration helper for the partial (first-phase)
// reduction: derives the local/global ND-ranges from the number of preserved
// and reduced coefficients and the device's compute-unit count.
//
// NOTE(review): extraction artifacts here too — `staticbool` is missing a
// space and the function body is cut off before its closing brace; verify
// against upstream Eigen TensorReductionSycl.h.
staticbool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
// Round the preserved dimension up to a whole number of P-tiles so every
// work-group is fully populated.
Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
// getPowerOfTwo makes sure local range is power of 2 and <= // maxSyclThreadPerBlock this will help us to avoid extra check on the // kernel
// Compile-time guarantee that the work-group size (P * R) is a power of two,
// which the in-kernel tree reduction relies on.
static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
(PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), "The Local thread size must be a power of 2 for the reduction " "operation");
EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; // In this step, we force the code not to be more than 2-step reduction: // Our empirical research shows that if each thread reduces at least 64 // elemnts individually, we get better performance. However, this can change // on different platforms. In this step we force the code not to be // morthan step reduction: Our empirical research shows that for inner_most // dim reducer, it is better to have 8 group in a reduce dimension for sizes // > 1024 to achieve the best performance. const Index reductionPerThread = 64;
// `cu` is the compute-unit count rounded (up, per the `true` flag — confirm
// against SyclDevice::getPowerOfTwo) to a power of two.
Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
// Split the reduce dimension across at most `localRange` groups, and only
// when there is enough work (> reductionPerThread elements per thread);
// otherwise a single group reduces the whole dimension (no second phase
// needed for that row).
Index rGroups = (cu + pNumGroups - 1) / pNumGroups; const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; const Index globalRange = pNumGroups * rNumGroups * localRange;
// SYCL-device specialization of FullReducer: reduces an entire tensor
// expression to a single value, dispatching a vectorized (packet) path when
// the evaluator supports it.
//
// NOTE(review): extraction artifacts — `typedeftypename`/`staticvoid` are
// missing spaces and the `run` body is cut off mid-function; verify against
// upstream Eigen TensorReductionSycl.h.
template <typename Self, typename Op, bool Vectorizable> struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> { typedeftypename Self::CoeffReturnType CoeffReturnType; typedeftypename Self::EvaluatorPointerType EvaluatorPointerType; static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1; staticvoid run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { typedeftypename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
// The in-kernel tree reduction requires a power-of-two work-group size;
// enforce it at compile time on the configured thread-block dimensions.
static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), "The Local thread size must be a power of 2 for the reduction " "operation");
EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
// NOTE(review): the comment says "at least 512" but the constant is 2048 —
// the comment and value have drifted apart; confirm the intended figure.
typename Self::Index inputSize = self.impl().dimensions().TotalSize(); // In this step we force the code not to be more than 2-step reduction: // Our empirical research shows that if each thread reduces at least 512 // elemnts individually, we get better performance. const Index reductionPerThread = 2048; // const Index num_work_group =
// Number of first-phase work-groups: total elements divided by the work one
// group can absorb (reductionPerThread * local_range), rounded up, then
// snapped to a power of two and capped at local_range so the second phase
// can finish in a single work-group with one element per thread.
Index reductionGroup = dev.getPowerOfTwo(
(inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); const Index num_work_group = std::min(reductionGroup, local_range); // 1 // ? local_range // : 1); const Index global_range = num_work_group * local_range;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxhervorhebung und die Messung sind noch experimentell.