// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// We fix the index along the scan axis to 0 and perform a // scan per remaining entry. The iteration is split into two nested // loops to avoid an integer division by keeping track of each idx1 and // idx2. for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
block_reducer(self, idx1, data);
}
}
};
#ifdef EIGEN_USE_THREADS
// Round a shard size up so that no cacheline of `data` items is split
// across two shards, which would cause false sharing among threads.
// The alignment target is currently twice the cache line size on Intel
// and ARM processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  // How many items fit in one aligned chunk (at least one).
  const Index per_line = numext::maxi<Index>(1, kBlockAlignment / item_size);
  return per_line * divup(block_size, per_line);
}
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  // Runs all scans for entries in [idx1, idx1 + stride) in parallel on the
  // device's thread pool: packet-wide chunks first, then a scalar tail for
  // the remaining stride % PacketSize entries.
  // NOTE: the original text had the fused tokens `voidoperator()` and
  // `constint`, which do not compile; they are restored to `void operator()`
  // and `const int` here.
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                       16 * PacketSize * self.size(), true, PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) {
            return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });
      num_scalars -= num_packets * PacketSize;
    }
    // Scalar tail: entries not covered by the packet loop above.
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  // Runs all scans for entries in [idx1, idx1 + stride) in parallel on the
  // device's thread pool, one scalar scan per entry (no vectorization).
  // NOTE: the original text had the fused token `voidoperator()`, which does
  // not compile; it is restored to `void operator()` here.
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
//
// Each thread owns one full scan line: thread `val` handles the line whose
// first element is at `offset`, stepping by `self.stride()` along the scan
// axis. Threads whose line would run past `total_size` do nothing.
// NOTE(review): the original text had host-side evaluator code (referencing
// m_impl/m_stride/op/Layout, none of which exist in kernel scope) spliced
// into this kernel after the __syncthreads(); that fragment belongs to the
// evaluator constructor elsewhere in the file and has been removed so the
// kernel is well-formed.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size,
                                                        typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version.
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  // Guard: only scan lines that lie entirely inside the buffer.
  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset.
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        // Exclusive scan: write the accumulator BEFORE folding in coeff(curr).
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        // Inclusive scan: fold in coeff(curr) first, then write.
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}
// NOTE(review): the following disclaimer text was accidentally pasted into
// this source file (bare prose would break compilation, so it is commented
// out here). English translation of the original German:
//   "The information on this website was carefully compiled to the best of
//    our knowledge. However, neither completeness, nor correctness, nor
//    quality of the provided information is guaranteed.
//    Note: the colored syntax highlighting and the measurement are still
//    experimental."