// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr> // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com> // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com> // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com> // Copyright (C) 2010 Thomas Capricelli <orzel@freehackers.org> // Copyright (C) 2013 Pavel Holoborodko <pavel@holoborodko.com> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/***************************************************************************** *** Platform checks for aligned malloc functions ***
*****************************************************************************/
#ifndef EIGEN_MEMORY_H #define EIGEN_MEMORY_H
#ifndef EIGEN_MALLOC_ALREADY_ALIGNED

// Try to determine automatically if malloc is already aligned.

// On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see:
//   http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html
// This is true at least since glibc 2.8.
// This leaves the question how to detect 64-bit. According to this document,
//   http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
// NOTE(review): the matching #endif for EIGEN_MALLOC_ALREADY_ALIGNED lies outside
// the visible region of this file.
#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
 && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
#else
  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
#endif
/** \internal
  * Signals an out-of-memory condition.
  * With EIGEN_EXCEPTIONS enabled this throws std::bad_alloc; otherwise it
  * requests an intentionally impossible allocation so the runtime aborts.
  */
EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc()
{
  #ifdef EIGEN_EXCEPTIONS
    throw std::bad_alloc();
  #else
    std::size_t huge = static_cast<std::size_t>(-1);
    #if defined(EIGEN_HIPCC)
    //
    // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
    // and as a consequence the code in the #else block triggers the hipcc warning :
    // "no overloaded function has restriction specifiers that are compatible with the ambient context"
    //
    // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
    // the same on "operator new"
    // Reverting code back to the old version in this #if block for the hipcc compiler
    //
    new int[huge];
    #else
    void* unused = ::operator new(huge);
    EIGEN_UNUSED_VARIABLE(unused);
    #endif
  #endif
}
/***************************************************************************** *** Implementation of handmade aligned functions ***
*****************************************************************************/
/* ----- Hand made implementations of aligned malloc/free and realloc ----- */
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
*/
EIGEN_DEVICE_FUNC inlinevoid* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
{
eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2");
/***************************************************************************** *** Implementation of portable aligned versions of malloc/free/realloc ***
*****************************************************************************/
#ifdef EIGEN_NO_MALLOC
EIGEN_DEVICE_FUNC inlinevoid check_that_malloc_is_allowed()
{
eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
} #elifdefined EIGEN_RUNTIME_NO_MALLOC
EIGEN_DEVICE_FUNC inlinebool is_malloc_allowed_impl(bool update, bool new_value = false)
{ staticbool value = true; if (update == 1)
value = new_value; return value;
}
EIGEN_DEVICE_FUNC inlinebool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
EIGEN_DEVICE_FUNC inlinebool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
EIGEN_DEVICE_FUNC inlinevoid check_that_malloc_is_allowed()
{
eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
} #else
EIGEN_DEVICE_FUNC inlinevoid check_that_malloc_is_allowed()
{} #endif
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
  * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
  */
EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
{
  check_that_malloc_is_allowed();

  void *result;
  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
    // The system allocator already provides the required alignment.
    result = std::malloc(size);
    #if EIGEN_DEFAULT_ALIGN_BYTES==16
    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
    #endif
  #else
    result = handmade_aligned_malloc(size);
  #endif

  // A null result for a non-zero request is an allocation failure.
  if(!result && size)
    throw_std_bad_alloc();

  return result;
}
/** * \internal * \brief Reallocates an aligned block of memory. * \throws std::bad_alloc on allocation failure
*/ inlinevoid* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
{
EIGEN_UNUSED_VARIABLE(old_size)
void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
result = std::realloc(ptr,new_size); #else
result = handmade_aligned_realloc(ptr,new_size,old_size); #endif
if (!result && new_size)
throw_std_bad_alloc();
return result;
}
/***************************************************************************** *** Implementation of conditionally aligned functions ***
*****************************************************************************/
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
  * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
  */
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size)
{
  // Primary template: aligned allocation. The Align==false specialization
  // (plain malloc) is defined elsewhere in this file.
  return aligned_malloc(size);
}
/***************************************************************************** *** Construction/destruction of array elements ***
*****************************************************************************/
/** \internal Destructs the elements of an array.
  * The \a size parameters tells on how many objects to call the destructor of T.
  * A null \a ptr is tolerated and treated as a no-op.
  */
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size)
{
  // always destruct an array starting from the end.
  if(ptr)
    while(size) ptr[--size].~T();
}
/** \internal Constructs the elements of an array.
  * The \a size parameter tells on how many objects to call the constructor of T.
  * If a constructor throws, the elements constructed so far are destructed
  * (in reverse order, via destruct_elements_of_array) before rethrowing.
  */
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
{
  // i is declared outside the try block so the catch handler knows how many
  // elements were successfully constructed before the failure.
  std::size_t i;
  EIGEN_TRY
  {
    for (i = 0; i < size; ++i) ::new (ptr + i) T;
    return ptr;
  }
  EIGEN_CATCH(...)
  {
    // roll back the i elements already constructed, then propagate
    destruct_elements_of_array(ptr, i);
    EIGEN_THROW;
  }
  // unreachable; keeps compilers without exception support happy
  return NULL;
}
/***************************************************************************** *** Implementation of aligned new/delete-like functions ***
*****************************************************************************/
/** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
  * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
  * The default constructor of T is called.
  */
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
{
  // guard against sizeof(T)*size overflowing std::size_t
  check_size_for_overflow<T>(size);
  T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
  EIGEN_TRY
  {
    return construct_elements_of_array(result, size);
  }
  EIGEN_CATCH(...)
  {
    // a constructor threw: release the raw buffer before propagating
    aligned_free(result);
    EIGEN_THROW;
  }
  return result;
}
/** \internal Deletes objects constructed with aligned_new
  * The \a size parameters tells on how many objects to call the destructor of T.
  */
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
{
  // destruct first, then release the aligned buffer
  destruct_elements_of_array<T>(ptr, size);
  Eigen::internal::aligned_free(ptr);
}
/** \internal Deletes objects constructed with conditional_aligned_new
  * The \a size parameters tells on how many objects to call the destructor of T.
  */
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size)
{
  // destruct first, then release through the matching conditional deallocator
  destruct_elements_of_array<T>(ptr, size);
  conditional_aligned_free<Align>(ptr);
}
/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
  *
  * \tparam Alignment requested alignment in Bytes.
  * \param array the address of the start of the array
  * \param size the size of the array
  *
  * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
  * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
  * packet size for the given scalar type is 1, then everything is considered well-aligned.
  *
  * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
  * example with Scalar=double on certain 32-bit platforms, see bug #79.
  *
  * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
  * \sa first_default_aligned()
  */
template<int Alignment, typename Scalar, typename Index>
EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
{
  const Index ScalarSize = sizeof(Scalar);
  const Index AlignmentSize = Alignment / ScalarSize;
  const Index AlignmentMask = AlignmentSize-1;

  if(AlignmentSize<=1)
  {
    // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
    // so that all elements of the array have the same alignment.
    return 0;
  }
  else if( (UIntPtr(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
  {
    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
    // Consequently, no element of the array is well aligned.
    return size;
  }
  else
  {
    // Distance (in scalars) from the current position to the next aligned one,
    // computed modulo AlignmentSize; clamped to size if it lies past the end.
    Index first = (AlignmentSize - (Index((UIntPtr(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
    return (first < size) ? first : size;
  }
}
/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
  * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
template<typename Scalar, typename Index>
EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size)
{
  // Delegate to first_aligned using the alignment required by the default packet type for Scalar.
  typedef typename packet_traits<Scalar>::type DefaultPacketType;
  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
}
/** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
  */
template<typename Index>
inline Index first_multiple(Index size, Index base)
{
  // ceil(size / base), computed with integer arithmetic only
  const Index num_chunks = (size + base - 1) / base;
  return num_chunks * base;
}
// std::copy is much slower than memcpy, so let's introduce a smart_copy which // use memcpy on trivial types, i.e., on types that does not require an initialization ctor. template<typename T, bool UseMemcpy> struct smart_copy_helper;
// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. template<typename T, bool UseMemmove> struct smart_memmove_helper;
/***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) ***
*****************************************************************************/
// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
// to the appropriate stack allocation function
#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
  #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
    #define EIGEN_ALLOCA alloca
  #elif EIGEN_COMP_MSVC
    #define EIGEN_ALLOCA _alloca
  #endif
#endif

// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
// the compiler still emits bad code because stack allocation checks use "<=".
// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
// is fixed.
#if defined(__clang__) && defined(__thumb__)
  #undef EIGEN_ALLOCA
#endif
// This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
// at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
template<typename T> class aligned_stack_memory_handler : noncopyable
{
  public:
    /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
     * Note that \a ptr can be 0 regardless of the other parameters.
     * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type T (see NumTraits<T>::RequireInitialization).
     * In this case, the buffer elements will also be destructed when this handler will be destructed.
     * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
     **/
    EIGEN_DEVICE_FUNC
    aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
      : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
    {
      // construct elements only for types that require initialization
      if(NumTraits<T>::RequireInitialization && m_ptr)
        Eigen::internal::construct_elements_of_array(m_ptr, size);
    }
    EIGEN_DEVICE_FUNC
    ~aligned_stack_memory_handler()
    {
      // destruct (if constructed above) and optionally free the buffer
      if(NumTraits<T>::RequireInitialization && m_ptr)
        Eigen::internal::destruct_elements_of_array<T>(m_ptr, m_size);
      if(m_deallocate)
        Eigen::internal::aligned_free(m_ptr);
    }
  protected:
    T* m_ptr;            // handled buffer (may be null)
    std::size_t m_size;  // number of elements of type T in the buffer
    bool m_deallocate;   // whether the destructor must free m_ptr
};
/** \internal * * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: * \code * { * ei_declare_aligned_stack_constructed_variable(float,data,size,0); * // use data[0] to data[size-1] * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. * * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to * \code * typename internal::nested_eval<XPRT_T,N>::type NAME(XPR); * \endcode * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. * This is accomplished through alloca if this later is supported and if the required number of bytes * is below EIGEN_STACK_ALLOCATION_LIMIT.
*/ #ifdef EIGEN_ALLOCA
#if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((internal::UIntPtr(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) #else #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE) #endif
/** \class aligned_allocator * \ingroup Core_Module * * \brief STL compatible allocator to use with types requiring a non standrad alignment. * * The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. * By default, it will thus provide at least 16 bytes alignment and more in following cases: * - 32 bytes alignment if AVX is enabled. * - 64 bytes alignment if AVX512 is enabled. * * This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented * \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code * // Matrix4f requires 16 bytes alignment: * std::map< int, Matrix4f, std::less<int>, * aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; * \endcode * * \sa \blank \ref TopicStlContainers.
*/ template<class T> class aligned_allocator : public std::allocator<T>
{ public: typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; typedef T* pointer; typedefconst T* const_pointer; typedef T& reference; typedefconst T& const_reference; typedef T value_type;
#if !defined(EIGEN_NO_CPUID)
#  if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
#    if defined(__PIC__) && EIGEN_ARCH_i386
       // Case for x86 with PIC: ebx is reserved for the GOT pointer, so it is
       // saved/restored around cpuid via xchg with a scratch register.
#      define EIGEN_CPUID(abcd,func,id) \
         __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
#    elif defined(__PIC__) && EIGEN_ARCH_x86_64
       // Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with the default small code model.
       // However, we cannot detect which code model is used, and the xchg overhead is negligible anyway.
#      define EIGEN_CPUID(abcd,func,id) \
         __asm__ __volatile__ ("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id));
#    else
       // Case for x86_64 or x86 w/o PIC
#      define EIGEN_CPUID(abcd,func,id) \
         __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) );
#    endif
#  elif EIGEN_COMP_MSVC
#    if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64
#      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
#    endif
#  endif
#endif
/** \internal
  * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively.
  * When no CPUID facility is available, all three outputs are set to -1 (unknown). */
inline void queryCacheSizes(int& l1, int& l2, int& l3)
{
  #ifdef EIGEN_CPUID
  int abcd[4];
  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574};   // "AMDisbetter!"

  // identify the CPU vendor
  EIGEN_CPUID(abcd,0x0,0);
  int max_std_funcs = abcd[0];
  if(cpuid_is_vendor(abcd,GenuineIntel))
    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
    queryCacheSizes_amd(l1,l2,l3);
  else
    // by default let's use Intel's API
    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);

  // here is the list of other vendors:
  //   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
  //   ||cpuid_is_vendor(abcd,"CyrixInstead")
  //   ||cpuid_is_vendor(abcd,"CentaurHauls")
  //   ||cpuid_is_vendor(abcd,"GenuineTMx86")
  //   ||cpuid_is_vendor(abcd,"TransmetaCPU")
  //   ||cpuid_is_vendor(abcd,"RiseRiseRise")
  //   ||cpuid_is_vendor(abcd,"Geode by NSC")
  //   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
  //   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
  //   ||cpuid_is_vendor(abcd,"NexGenDriven")
  #else
  l1 = l2 = l3 = -1;
  #endif
}
/** \internal
* \returns the size in Bytes of the L1 data cache */ inlineint queryL1CacheSize()
{ int l1(-1), l2, l3;
queryCacheSizes(l1,l2,l3); return l1;
}
/** \internal
* \returns the size in Bytes of the L2 or L3 cache if this later is present */ inlineint queryTopLevelCacheSize()
{ int l1, l2(-1), l3(-1);
queryCacheSizes(l1,l2,l3); return (std::max)(l2,l3);
}
} // end namespace internal
} // end namespace Eigen
#endif// EIGEN_MEMORY_H
// NOTE(review): removed non-source website footer text (German processing-time and
// disclaimer boilerplate) that was accidentally appended after the closing #endif
// during extraction; it was not part of the original header and is not valid C++.