// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // // The conversion routines are Copyright (c) Fabian Giesen, 2016. // The original license follows: // // Copyright (c) Fabian Giesen, 2016 // All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new // type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay // in fp32 for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_H #define EIGEN_HALF_H
#include <sstream>
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
// When compiling with GPU support, the "__half_raw" base class as well as
// some other routines are defined in the GPU compiler header files
// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
// As a consequence, we get compile failures when compiling Eigen with
// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
// Eigen with GPU support
#pragma push_macro("EIGEN_CONSTEXPR")
#undef EIGEN_CONSTEXPR
#define EIGEN_CONSTEXPR
#endif
// We want to use the __half_raw struct from the HIP header file only during the device compile phase.
// This is required because of a quirk in the way TensorFlow GPU builds are done.
// When compiling TensorFlow source code with GPU support, files that
//  * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc
//  * do not contain GPU kernels (i.e. *.cc files) are compiled via gcc (typically)
//
// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that
//  * are defined in a file that gets compiled via hipcc AND
//  * have Eigen::half as a pass-by-value argument AND
//  * are called in a file that gets compiled via gcc
//
// In the scenario described above the caller and callee will see different versions
// of the Eigen::half base class __half_raw, and they will be compiled by different compilers
//
// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in
// the callee getting corrupted values for the Eigen::half argument.
//
// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
// this error, and hence the following convoluted #if condition
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
  // The element type for shared memory cannot have non-trivial constructors
  // and hence the following special casing (which skips the zero-initialization).
  // Note that this check gets done even in the host compilation phase, and
  // hence the need for this
  EIGEN_DEVICE_FUNC __half_raw() {}
#else
  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
#endif
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  // On ARM64 the raw storage is the native __fp16 type; reinterpret the
  // incoming bit pattern rather than converting numerically.
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw)
      : x(numext::bit_cast<__fp16>(raw)) {}
  __fp16 x;
#else
  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
  numext::uint16_t x;
#endif
};
#elif defined(EIGEN_HAS_HIP_FP16)
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
#if EIGEN_CUDA_SDK_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(SYCL_DEVICE_ONLY)
typedef cl::sycl::half __half_raw;
#endif
// Class definition. struct half : public half_impl::half_base {
// Writing this out as separate #if-else blocks to make the code easier to follow // The same applies to most #if-else blocks in this file #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) // Use the same base class for the following two scenarios // * when compiling without GPU support enabled // * during host compile phase when compiling with GPU support enabled typedef half_impl::__half_raw __half_raw; #elifdefined(EIGEN_HAS_HIP_FP16) // Nothing to do here // HIP fp16 header file has a definition for __half_raw #elifdefined(EIGEN_HAS_CUDA_FP16) // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within // #if defined(EIGEN_HAS_CUDA_FP16) is needed #ifdefined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 typedef half_impl::__half_raw __half_raw; #endif #endif
// Following the convention of numpy, converting between complex and // float will lead to loss of imag value. template<typename RealScalar> explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
: half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
EIGEN_DEVICE_FUNC operatorfloat() const { // NOLINT: Allow implicit conversion to float, because it is lossless. return half_impl::half_to_float(*this);
}
// If std::numeric_limits<T> is specialized, should also specialize // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and // std::numeric_limits<const volatile T> // https://stackoverflow.com/a/16519653/ template<> struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {}; template<> struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {}; template<> struct numeric_limits<constvolatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std
namespace Eigen {
namespace half_impl {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
     EIGEN_CUDA_ARCH >= 530) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU halfs are rather different from native CPU halfs.
// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
#define EIGEN_HAS_NATIVE_FP16
#endif
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
#if defined(EIGEN_HAS_NATIVE_FP16)
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  // CUDA 9+: go through the SDK's ::__half type to pick the right overload.
  return __hadd(::__half(a), ::__half(b));
#else
  return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
  return __hmul(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
  return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
  return __hdiv(a, b);
#else
  // __hdiv is not available before CUDA 9.0; fall back to fp32 division.
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
  return __hneg(a);
}
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
  a = a + b;
  return a;
}
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
  a = a * b;
  return a;
}
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
  a = a - b;
  return a;
}
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
  a = a / b;
  return a;
}
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
  return __heq(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
  return __hne(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
  return __hlt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
  return __hle(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
  return __hgt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
  return __hge(a, b);
}
#endif
#ifdefined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(vaddh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { return half(vmulh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { return half(vsubh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { return half(vdivh_f16(a.x, b.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { return half(vnegh_f16(a.x));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
a = half(vaddh_f16(a.x, b.x)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
a = half(vmulh_f16(a.x, b.x)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
a = half(vsubh_f16(a.x, b.x)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
a = half(vdivh_f16(a.x, b.x)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator == (const half& a, const half& b) { return vceqh_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator != (const half& a, const half& b) { return !vceqh_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator < (const half& a, const half& b) { return vclth_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator <= (const half& a, const half& b) { return vcleh_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator > (const half& a, const half& b) { return vcgth_f16(a.x, b.x);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator >= (const half& a, const half& b) { return vcgeh_f16(a.x, b.x);
} // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. #elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) // We need to provide emulated *host-side* FP16 operators for clang. #pragma push_macro("EIGEN_DEVICE_FUNC") #undef EIGEN_DEVICE_FUNC #ifdefined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) #define EIGEN_DEVICE_FUNC __host__ #else// both host and device need emulated ops. #define EIGEN_DEVICE_FUNC __host__ __device__ #endif #endif
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion // to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(float(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { return half(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { return half(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { return half(float(a) / float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
half result;
result.x = a.x ^ 0x8000; return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
a = half(float(a) + float(b)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
a = half(float(a) * float(b)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
a = half(float(a) - float(b)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
a = half(float(a) / float(b)); return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator == (const half& a, const half& b) { return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator != (const half& a, const half& b) { return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator < (const half& a, const half& b) { returnfloat(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator <= (const half& a, const half& b) { returnfloat(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator > (const half& a, const half& b) { returnfloat(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC booloperator >= (const half& a, const half& b) { returnfloat(a) >= float(b);
}
#ifdefined(__clang__) && defined(__CUDA__) #pragma pop_macro("EIGEN_DEVICE_FUNC") #endif #endif// Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
  const float numerator = static_cast<float>(a);
  const float denominator = static_cast<float>(b);
  return half(numerator / denominator);
}
// Prefix increment: advance `a` by one and return the updated value.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) {
  const half one(1);
  a += one;
  return a;
}
// Prefix decrement: reduce `a` by one and return the updated value.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) {
  const half one(1);
  a -= one;
  return a;
}
// Conversion routines, including fallbacks for the host or older CUDA. // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly.
// Reinterpret a raw 16-bit pattern as a __half_raw (no numeric conversion).
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
  // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
  // in the hip_fp16 header file, and that will trigger a compile error
  // On the other hand, having anything but a return statement also triggers a compile error
  // because this is constexpr function.
  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
  // of this catch22 by having separate bodies for GPU / non GPU
#if defined(EIGEN_HAS_GPU_FP16)
  __half_raw h;
  h.x = x;
  return h;
#else
  return __half_raw(x);
#endif
}
// Extract the raw 16-bit pattern from a __half_raw (no numeric conversion).
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
  // HIP/CUDA/Default have a member 'x' of type uint16_t.
  // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
  // For SYCL, cl::sycl::half is _Float16, so cast directly.
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  return numext::bit_cast<numext::uint16_t>(h.x);
#elif defined(SYCL_DEVICE_ONLY)
  return numext::bit_cast<numext::uint16_t>(h);
#else
  return h.x;
#endif
}
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
// NOTE(review): this is the interior of float_to_half_rtne; `f`, `o`,
// `f16max`, `f32infty` and `denorm_magic` are declared earlier in the
// function, outside this chunk.
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
  o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
} else { // (De)normalized number or zero
  if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
    // use a magic value to align our 10 mantissa bits at the bottom of
    // the float. as long as FP addition is round-to-nearest-even this
    // just works.
    f.f += denorm_magic.f;
    // and one integer subtract of the bias later, we have our final float!
    o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
  } else {
    unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
    // update exponent, rounding bias part 1
    // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
    // without arithmetic overflow.
    f.u += 0xc8000fffU;
    // rounding bias part 2
    f.u += mant_odd;
    // take the bits!
    o.x = static_cast<numext::uint16_t>(f.u >> 13);
  }
}
// Add the missing shfl* intrinsics. // The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. // CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) // // HIP and CUDA prior to SDK 9.0 define // __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float // CUDA since 9.0 deprecates those and instead defines // __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, // with native support for __half and __nv_bfloat16 // // Note that the following are __device__ - only functions. #if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
|| defined(EIGEN_HIPCC)
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) { const __half h = var; returnstatic_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
}
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsignedint delta, int width=warpSize) { const __half h = var; returnstatic_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
}
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsignedint delta, int width=warpSize) { const __half h = var; returnstatic_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
}
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) { const __half h = var; returnstatic_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
}
#else // HIP or CUDA SDK < 9.0

// Legacy (mask-less) shuffles: route the fp16 bit pattern through the int
// overloads, since these SDKs have no __half overloads.
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
}

__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
}

#endif // HIP vs CUDA
#endif // __shfl*
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
    || defined(EIGEN_HIPCC)
// Read-only-cache load of an Eigen::half, implemented on top of the
// uint16_t overload of __ldg plus a raw bit reinterpretation.
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(
      __ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
}
#endif // __ldg
// NOTE(review): the following German disclaimer is residue from a code-viewer
// webpage, not part of the Eigen source. Commented out so the header remains
// compilable; it should simply be deleted.
// Die Informationen auf dieser Webseite wurden
// nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
// noch Qualität der bereit gestellten Informationen zugesichert.
// Bemerkung:
// Die farbliche Syntaxdarstellung ist noch experimentell.