// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If another gpu thread has updated the content
// of the output address in the meantime, it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
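// Typical call site (sketch, mirroring the reduction kernels below): after a warp-level shuffle
// reduction, only the first lane of each warp folds its partial result into the shared output, e.g.
//   if ((threadIdx.x & (warpSize - 1)) == 0) {
//     atomicReduce(output, accum, reducer);
//   }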
// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}
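// Kernel that fills the first num_preserved_coeffs entries of the output with the given value.
// A grid-stride loop is used so that any launch configuration covers the whole array.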
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
                                                                  CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = val;
}
}
template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output,
                                                                  unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
*output = reducer.initialize();
}
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
atomicExchCustom(output, reducer.initialize());
__threadfence();
atomicExch(semaphore, 2u);
      } else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
val = atomicCAS(semaphore, 2u, 2u);
} while (val < 2u);
}
}
}
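  // At this point the output has been initialized exactly once: the semaphore goes from 0 (untouched)
  // to 1 (claimed by the first block) to 2 (output written and published via __threadfence()), and all
  // other blocks spin until they read 2.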
__syncthreads();
eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
reducer.reduce(val, &accum);
}
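  // Each thread has accumulated up to NumPerThread coefficients, strided by BlockSize, so consecutive
  // threads in the block read consecutive (coalesced) input addresses.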
#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // Use std::is_floating_point to determine the type of accum.
    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous"
    // error and list the float and int versions of __shfl_down as the candidate functions.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
}
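  // After the shuffle loop, lane 0 of each warp holds the reduction of that warp's accumulators: every
  // iteration folds in the value held by the lane `offset` positions away, halving `offset` until it
  // reaches 1.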
  if (num_blocks > 1) {
    // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure
    // that there won't be race conditions between multiple thread blocks.
LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_coeffs, scratch);
}
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support exotic types such as complex well,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16
  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
}
  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
__syncthreads();
}
  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;
    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
Type reduced_val = reducer.initialize();
      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
}
}
}
#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // Use std::is_floating_point to determine the type of reduced_val.
        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous"
        // error and list the float and int versions of __shfl_down as the candidate functions.
        if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(&(output[row]), reduced_val, reducer);
}
}
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs;
i += packet_width * num_threads) {
PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
*poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
output[i] = reducer.initialize();
}
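    // The loop above initializes the output a full packet at a time; the trailing check handles a
    // leftover scalar element.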
__syncthreads();
}
  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin =
packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks =
          device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_preserved_vals, output);
}
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support exotic types such as complex well,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16
  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 128) {
      return true;
}
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce,
                                                                   Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
__syncthreads();
}
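  // Work distribution: each logical index i maps to one preserved column (i % num_preserved_coeffs) and a
  // chunk of NumPerThread consecutive rows of that column; the chunk is reduced locally and then folded
  // into output[input_col] with atomicReduce, so several blocks may safely contribute to the same column.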
  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
reducer.reduce(val, &reduced_val);
}
atomicReduce(&(output[input_col]), reduced_val, reducer);
}
}
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support exotic types such as complex well,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats.
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
      // FIXME: leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
      // (in the cxx11_tensor_reduction_gpu test)
      //
      //   terminate called after throwing an instance of 'std::runtime_error'
      //   what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
      //
      // We don't know why this happens (or why it is a runtime error instead of a compile time error).
      //
      // This will be fixed by HIP PR#457.
      EIGEN_DEVICE_FUNC
#endif
      bool
      run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
}