Quelle TensorRandom.h Sprache: C

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

namespace Eigen {
namespace internal {

namespace {

EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#if defined(EIGEN_GPU_COMPILE_PHASE)
  // We don't support 3d kernels since we currently only use 1 and
  // 2d kernels.
  gpu_assert(threadIdx.z == 0);
  return blockIdx.x * blockDim.x + threadIdx.x
         + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
#else
  // Rely on Eigen's random implementation.
  return random<uint64_t>();
#endif
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
  // TODO: Unify with the implementation in the non blocking thread pool.
  uint64_t current = *state;
  // Update the internal state
  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
  // Generate the random output (using the PCG-XSH-RS scheme)
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
  seed = seed ? seed : get_random_seed();
  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}

}  // namespace

template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
  unsigned rnd = PCG_XSH_RS_generator(state, stream);
  return static_cast<T>(rnd);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
  // Generate 10 random bits for the mantissa, merge with exponent.
  unsigned rnd = PCG_XSH_RS_generator(state, stream);
  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
  // Return the final result
  return result - Eigen::half(1.0f);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {

  // Generate 7 random bits for the mantissa, merge with exponent.
  unsigned rnd = PCG_XSH_RS_generator(state, stream);
  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
  // Return the final result
  return result - Eigen::bfloat16(1.0f);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
  typedef union {
    uint32_t raw;
    float fp;
  } internal;
  internal result;
  // Generate 23 random bits for the mantissa mantissa
  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
  result.raw = rnd & 0x7fffffu;
  // Set the exponent
  result.raw |= (static_cast<uint32_t>(127) << 23);
  // Return the final result
  return result.fp - 1.0f;
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
  typedef union {
    uint64_t raw;
    double dp;
  } internal;
  internal result;
  result.raw = 0;
  // Generate 52 random bits for the mantissa
  // First generate the upper 20 bits
  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
  // The generate the lower 32 bits
  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
  // Set the exponent
  result.raw |= (static_cast<uint64_t>(1023) << 52);
  // Return the final result
  return result.dp - 1.0;
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
                             RandomToTypeUniform<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
                              RandomToTypeUniform<double>(state, stream));
}

template <typename T> class UniformRandomGenerator {
public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
    #ifdef EIGEN_USE_SYCL
    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
    // Therefor, we need two step to initializate the m_state.
    // IN SYCL, the constructor of the functor is s called on the CPU
    // and we get the clock seed here from the CPU. However, This seed is
    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
    // and only  available on the Operator() function (which is called on the GPU).
    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
    // similar to CUDA Therefore, the thread Id injection is not available at this stage.
    //However when the operator() is called the thread ID will be avilable. So inside the opeator,
    // we add the thrreadID, BlockId,... (which is equivalent of i)
    //to the seed and construct the unique m_state per thead similar to cuda.
    m_exec_once =false;
   #endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      const UniformRandomGenerator& other) {
    m_state = other.m_state;
    #ifdef EIGEN_USE_SYCL
     m_exec_once =other.m_exec_once;
    #endif
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    #ifdef EIGEN_USE_SYCL
      if(!m_exec_once) {
      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
       m_state += (i * 6364136223846793005ULL);
       m_exec_once =true;
      }
    #endif
    T result = RandomToTypeUniform<T>(&m_state, i);
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
      #ifdef EIGEN_USE_SYCL
      if(!m_exec_once) {
      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
       m_state += (i * 6364136223846793005ULL);
       m_exec_once =true;
      }
    #endif
    EIGEN_UNROLL_LOOP
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeUniform<T>(&m_state, i);
    }
    return internal::pload<Packet>(values);
  }

private:
  mutable uint64_t m_state;
  #ifdef EIGEN_USE_SYCL
  mutable bool m_exec_once;
  #endif
};

template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
  enum {
    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
    Cost = 12 * NumTraits<Scalar>::AddCost *
           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
  };
};

template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
  // Use the ratio of uniform method to generate numbers following a normal
  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
  // details.
  T u, v, q;
  do {
    u = RandomToTypeUniform<T>(state, stream);
    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
    const T x = u - T(0.449871);
    const T y = numext::abs(v) + T(0.386595);
    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
  } while (q > T(0.27597) &&
           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));

  return v/u;
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
                             RandomToTypeNormal<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
                              RandomToTypeNormal<double>(state, stream));
}

template <typename T> class NormalRandomGenerator {
public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
    #ifdef EIGEN_USE_SYCL
    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
    // Therefor, we need two steps to initializate the m_state.
    // IN SYCL, the constructor of the functor is s called on the CPU
    // and we get the clock seed here from the CPU. However, This seed is
    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
    // and only  available on the Operator() function (which is called on the GPU).
    // Therefore, the thread Id injection is not available at this stage. However when the operator()
    //is called the thread ID will be avilable. So inside the opeator,
    // we add the thrreadID, BlockId,... (which is equivalent of i)
    //to the seed and construct the unique m_state per thead similar to cuda.
    m_exec_once =false;
   #endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
      const NormalRandomGenerator& other) {
    m_state = other.m_state;
#ifdef EIGEN_USE_SYCL
    m_exec_once=other.m_exec_once;
#endif
  }

template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    #ifdef EIGEN_USE_SYCL
    if(!m_exec_once) {
      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
      m_state += (i * 6364136223846793005ULL);
      m_exec_once =true;
    }
    #endif
    T result = RandomToTypeNormal<T>(&m_state, i);
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    #ifdef EIGEN_USE_SYCL
    if(!m_exec_once) {
      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
      m_state += (i * 6364136223846793005ULL);
      m_exec_once =true;
    }
    #endif
    EIGEN_UNROLL_LOOP
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeNormal<T>(&m_state, i);
    }
    return internal::pload<Packet>(values);
  }

private:
  mutable uint64_t m_state;
   #ifdef EIGEN_USE_SYCL
  mutable bool m_exec_once;
  #endif
};

template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // On average, we need to generate about 3 random numbers
    // 15 mul, 8 add, 1.5 logs
    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
           15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

Messung V0.5

¤ Dauer der Verarbeitung: 0.11 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.