/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Following the convention of numpy, converting between complex and // float will lead to loss of imag value. template<typename RealScalar> explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
// Implicit conversion to float; exact, since every bfloat16 value is
// representable as a float (bfloat16 is a truncated float32).
EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
  return bfloat16_impl::bfloat16_to_float(*this);
}
};
} // namespace Eigen
// If std::numeric_limits<T> is specialized, should also specialize // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and // std::numeric_limits<const volatile T> // https://stackoverflow.com/a/16519653/ template<> struct numeric_limits<const Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {}; template<> struct numeric_limits<volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {}; template<> struct numeric_limits<constvolatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
} // namespace std
namespace Eigen {
namespace bfloat16_impl {
// We need to distinguish 'clang as the CUDA compiler' from 'clang as the host compiler,
// invoked by NVCC' (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for bfloat16 floats

#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
// We need to provide emulated *host-side* BF16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16)
#define EIGEN_DEVICE_FUNC __host__
#else  // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
#endif
#endif
// Definitions for CPUs, mostly working through conversion
// to/from fp32.
// Binary arithmetic operators. All arithmetic is performed by widening both
// operands to float and rounding the float result back to bfloat16.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
  return bfloat16(float(a) + float(b));
}
// Mixed bfloat16/int overloads avoid ambiguity with the implicit float conversion.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const int& b) {
  return bfloat16(float(a) + static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const int& a, const bfloat16& b) {
  return bfloat16(static_cast<float>(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
  return bfloat16(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
  return bfloat16(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
  return bfloat16(float(a) / float(b));
}
// Unary negation: a pure sign-bit flip on the raw bit pattern, so it is
// exact for every value (including NaN, Inf and zero).
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a) {
  bfloat16 negated = a;
  negated.value ^= 0x8000;
  return negated;
}
// Compound assignment operators, implemented as a round trip through float
// (widen, operate, round back) like the binary operators above.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator+=(bfloat16& a, const bfloat16& b) {
  a = bfloat16(float(a) + float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator*=(bfloat16& a, const bfloat16& b) {
  a = bfloat16(float(a) * float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator-=(bfloat16& a, const bfloat16& b) {
  a = bfloat16(float(a) - float(b));
  return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator/=(bfloat16& a, const bfloat16& b) {
  a = bfloat16(float(a) / float(b));
  return a;
}
// Pre-increment: bump by one and yield the updated value.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) {
  a += bfloat16(1);
  return a;
}
// Pre-decrement: drop by one and yield the updated value.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) {
  a -= bfloat16(1);
  return a;
}
// Post-increment: yield the value held before the bump.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) {
  const bfloat16 before = a;
  a += bfloat16(1);
  return before;
}
// Post-decrement: yield the value held before the drop.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
  const bfloat16 before = a;
  a -= bfloat16(1);
  return before;
}
// Comparison operators, performed on the exact float representations.
// equal_strict/not_equal_strict avoid compiler warnings about floating-point
// equality comparison while keeping IEEE semantics (NaN compares unequal).
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const bfloat16& a, const bfloat16& b) {
  return numext::equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const bfloat16& a, const bfloat16& b) {
  return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const bfloat16& a, const bfloat16& b) {
  return float(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const bfloat16& a, const bfloat16& b) {
  return float(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const bfloat16& a, const bfloat16& b) {
  return float(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const bfloat16& a, const bfloat16& b) {
  return float(a) >= float(b);
}
#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) #pragma pop_macro("EIGEN_DEVICE_FUNC") #endif #endif// Emulate support for bfloat16 floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to bfloat16.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, Index b) {
  const float numerator = static_cast<float>(a);
  const float denominator = static_cast<float>(b);
  return bfloat16(numerator / denominator);
}
// float_to_bfloat16_rtne template specialization that does not make any
// assumption about the value of its function argument (ff).
template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
  // Nothing to do here
#else
  __bfloat16_raw output;
  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) {
    // If the value is a NaN, squash it to a qNaN with msb of fraction set,
    // this makes sure after truncation we don't end up with an inf.
    //
    // qNaN magic: All exponent bits set + most significant bit of fraction
    // set.
    output.value = std::signbit(ff) ? 0xFFC0 : 0x7FC0;
  } else {
    // Fast rounding algorithm that rounds a half value to nearest even. This
    // reduces expected error when we convert a large number of floats.
    //
    // View the float32 as  S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT : sign S,
    // 8 exponent bits E, then the 23 fraction bits split into the first
    // 6 bits F, the would-be bfloat16 lsb L, the rounding bit R and
    // 15 sticky bits T. The algorithm adds L to 0x7fff (15 ones) to form
    // the rounding bias, adds the bias to the input, then truncates the
    // low 16 bits. Case analysis:
    //
    //   1. L=0, R=0:            below half; bias 0x7fff cannot carry past R,
    //      so the result truncates (round down). This also covers +/-Inf,
    //      whose fraction bits (including L, R, T) are all 0.
    //   2. L=1, R=0:            below half; bias 0x8000 only sets R, L is
    //      unchanged (round down).
    //   3. L=0, R=1, all T=0:   exactly half with an even result; bias
    //      0x7fff sets the sticky bits without carrying, so L stays 0
    //      (round down, keeping the result even).
    //   4. L=1, R=1:            exactly half with an odd result; bias 0x8000
    //      carries out of R, clearing L and propagating into F — and, for
    //      the max normal value, into the exponent, yielding Inf. All-ones
    //      exponents cannot occur here: NaN is handled above and Inf falls
    //      into case 1 (round up to the next even value).
    //   5. L=0, R=1, some T=1:  above half; the carry chain from the sticky
    //      bits clears R and sets L (round up).
    //
    // Consequently the max denormal rounds up to the min normal value and
    // the max normal value rounds up to Inf.

    // At this point, ff must be either a normal float, or +/-infinity.
    output = float_to_bfloat16_rtne<true>(ff);
  }
  return output;
#endif
}
// float_to_bfloat16_rtne template specialization that assumes that its function // argument (ff) is either a normal floating point number, or +/-infinity, or // zero. Used to improve the runtime performance of conversion from an integer // type to bfloat16. template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) { #if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) // Nothing to do here #else
numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
__bfloat16_raw output;
/* NOTE(review): the following German website-disclaimer text is unrelated to
   this source file and appears to have been pasted in accidentally during
   extraction; it is commented out here so the translation unit stays
   well-formed. Original text preserved verbatim below:

Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.
*/