// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2020, Arm Limited and Contributors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Load a packet replicating each scalar twice: {a0, a0, a1, a1, ...}.
// Implemented as a gather with indices {0, 0, 1, 1, 2, 2, ...}.
template <>
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}
// Load a packet replicating each scalar four times: {a0, a0, a0, a0, a1, ...}.
// Two zips turn {0,1,2,...} into {0,0,0,0,1,1,1,1,...} gather indices.
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}
// Aligned store of a full int32 packet to memory.
template <>
EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  const svbool_t all_lanes = svptrue_b32();
  svst1_s32(all_lanes, to, from);
}
// Unaligned store of a full int32 packet (SVE svst1 has no alignment requirement).
template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  const svbool_t all_lanes = svptrue_b32();
  svst1_s32(all_lanes, to, from);
}
// Scatter the packet to memory with a constant element stride.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
{
  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
  svint32_t indices = svindex_s32(0, stride);
  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
}
// Extract the first (lowest-indexed) element of the packet.
template <>
EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
{
  // svlasta returns the first element if all predicate bits are 0
  return svlasta_s32(svpfalse_b(), a);
}
// In-place transpose of an N x packet-size block of int32 packets:
// scatter each packet as a column (stride N) into a temporary buffer,
// then reload the buffer's contiguous rows back into the packets.
template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel)
{
  int buffer[packet_traits<numext::int32_t>::size * N] = {0};
  int i = 0;

  // Column scatter indices: {0, N, 2*N, ...}.
  PacketXi stride_index = svindex_s32(0, N);

  for (i = 0; i < N; i++) {
    svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
  }
  for (i = 0; i < N; i++) {
    kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
  }
}
// Element-wise float multiplication over all lanes.
template <>
EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t all_lanes = svptrue_b32();
  return svmul_f32_z(all_lanes, a, b);
}
// Element-wise float division over all lanes.
template <>
EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t all_lanes = svptrue_b32();
  return svdiv_f32_z(all_lanes, a, b);
}
// Fused multiply-add: returns a * b + c (svmla takes the addend first).
template <>
EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
{
  const svbool_t all_lanes = svptrue_b32();
  return svmla_f32_z(all_lanes, c, a, b);
}
// Element-wise float minimum over all lanes.
template <>
EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t all_lanes = svptrue_b32();
  return svmin_f32_z(all_lanes, a, b);
}
// NaN-propagating minimum; the default pmin (svmin) already has the
// desired behavior for this policy, so forward to it.
template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return pmin<PacketXf>(a, b);
}
// Number-propagating minimum: svminnm returns the numeric operand when
// exactly one input is NaN.
template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t all_lanes = svptrue_b32();
  return svminnm_f32_z(all_lanes, a, b);
}
// Element-wise float maximum over all lanes.
template <>
EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t all_lanes = svptrue_b32();
  return svmax_f32_z(all_lanes, a, b);
}
// NaN-propagating maximum; the default pmax (svmax) already has the
// desired behavior for this policy, so forward to it.
template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return pmax<PacketXf>(a, b);
}
// Number-propagating maximum: svmaxnm returns the numeric operand when
// exactly one input is NaN.
template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t all_lanes = svptrue_b32();
  return svmaxnm_f32_z(all_lanes, a, b);
}
// Float comparisons in SVE return svbool (predicate). Use svdup to set active
// lanes to 1 (0xffffffffu) and inactive lanes to 0.
template <>
EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  // BUGFIX: use svcmple (a <= b), not svcmplt (a < b) — pcmp_le must be
  // true for equal elements as well.
  return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
}
// Lane-wise a < b: the predicate result is expanded to all-ones (true)
// or all-zeros (false) lanes, reinterpreted as float.
template <>
EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t lt = svcmplt_f32(svptrue_b32(), a, b);
  return svreinterpret_f32_u32(svdup_n_u32_z(lt, 0xffffffffu));
}
// Lane-wise a == b: the predicate result is expanded to all-ones (true)
// or all-zeros (false) lanes, reinterpreted as float.
template <>
EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t eq = svcmpeq_f32(svptrue_b32(), a, b);
  return svreinterpret_f32_u32(svdup_n_u32_z(eq, 0xffffffffu));
}
// Do a predicate inverse (svnot_b_z) on the predicate resulted from the
// greater/equal comparison (svcmpge_f32). Then fill a float vector with the
// active elements: !(a >= b) is true when a < b or either operand is NaN.
template <>
EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svbool_t ge = svcmpge_f32(svptrue_b32(), a, b);
  const svbool_t lt_or_nan = svnot_b_z(svptrue_b32(), ge);
  return svreinterpret_f32_u32(svdup_n_u32_z(lt_or_nan, 0xffffffffu));
}
// Logical Operations are not supported for float, so reinterpret casts
// to u32 are used, the op is done on the bit patterns, and the result is
// cast back to float.
template <>
EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svuint32_t bits_a = svreinterpret_u32_f32(a);
  const svuint32_t bits_b = svreinterpret_u32_f32(b);
  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), bits_a, bits_b));
}
// Bitwise OR of the float bit patterns (via u32 reinterpret casts).
template <>
EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svuint32_t bits_a = svreinterpret_u32_f32(a);
  const svuint32_t bits_b = svreinterpret_u32_f32(b);
  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), bits_a, bits_b));
}
// Bitwise XOR of the float bit patterns (via u32 reinterpret casts).
template <>
EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  const svuint32_t bits_a = svreinterpret_u32_f32(a);
  const svuint32_t bits_b = svreinterpret_u32_f32(b);
  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), bits_a, bits_b));
}
// Load a packet replicating each scalar twice: {a0, a0, a1, a1, ...}.
// Implemented as a gather with indices {0, 0, 1, 1, 2, 2, ...}.
template <>
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}
// Load a packet replicating each scalar four times: {a0, a0, a0, a0, a1, ...}.
// Two zips turn {0,1,2,...} into {0,0,0,0,1,1,1,1,...} gather indices.
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}
// Aligned store of a full float packet to memory.
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  const svbool_t all_lanes = svptrue_b32();
  svst1_f32(all_lanes, to, from);
}
// Unaligned store of a full float packet (SVE svst1 has no alignment requirement).
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  const svbool_t all_lanes = svptrue_b32();
  svst1_f32(all_lanes, to, from);
}
// Scatter the packet to memory with a constant element stride.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
{
  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
  svint32_t indices = svindex_s32(0, stride);
  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
}
// Extract the first (lowest-indexed) element of the packet.
template <>
EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
{
  // svlasta returns the first element if all predicate bits are 0
  return svlasta_f32(svpfalse_b(), a);
}
// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
// all vector extensions and the generic version.
template <>
EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
{
  // Delegate to Eigen's generic bit-manipulation implementation.
  return pfrexp_generic(a, exponent);
}
// Other reduction functions:
// mul
// Only works for SVE Vls multiple of 128
//
// Log2-style tree reduction: multiply the vector by its reverse, then keep
// folding the upper half onto the lower half until the product of all lanes
// lands in element 0.
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
{
  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
  // Multiply the vector by its reverse
  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
  svfloat32_t half_prod;

  // Extract the high half of the vector. Depending on the VL more reductions need to be done
  if (EIGEN_ARM64_SVE_VL >= 2048) {
    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 1024) {
    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 512) {
    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 256) {
    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  // Last reduction
  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);

  // The reduction is done to the first element.
  return pfirst<PacketXf>(prod);
}
// NOTE(review): The German text below is website-extraction residue (a
// generic "information provided without warranty" disclaimer plus a note
// that the syntax highlighting is experimental). It is not part of the
// Eigen source; it is commented out here so the translation unit remains
// valid C++ — confirm against upstream and delete.
// Die Informationen auf dieser Webseite wurden
// nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
// noch Qualität der bereit gestellten Informationen zugesichert.
// Bemerkung:
// Die farbliche Syntaxdarstellung ist noch experimentell.