/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <complex>
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"
namespace xsimd
{
namespace kernel
{ usingnamespace types;
// abs: element-wise absolute value for integral batches.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
    // Unsigned values are their own absolute value.
    if (!std::is_signed<T>::value)
    {
        return self;
    }
    // Signed lanes: pick the AVX2 intrinsic matching the element width;
    // there is no 64-bit pabs in AVX2, so that case falls back to AVX.
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        return _mm256_abs_epi8(self);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        return _mm256_abs_epi16(self);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return _mm256_abs_epi32(self);
    }
    else
    {
        return abs(self, avx {});
    }
}
// avgr: element-wise rounded unsigned average, (a + b + 1) >> 1.
template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    // AVX2 only provides the rounded average for 8- and 16-bit lanes.
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        return _mm256_avg_epu8(self, other);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        return _mm256_avg_epu16(self, other);
    }
    else
    {
        // Wider lanes: defer to the generic implementation.
        return avgr(self, other, generic {});
    }
}
// avg: element-wise truncating unsigned average, (a + b) >> 1.
template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        // avgr rounds up; subtract the correction term (the low bit of
        // a ^ b, isolated by the shift pair) to truncate instead.
        auto adj = ((self ^ other) << 7) >> 7;
        return avgr(self, other, A {}) - adj;
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        // Same trick for 16-bit lanes.
        auto adj = ((self ^ other) << 15) >> 15;
        return avgr(self, other, A {}) - adj;
    }
    else
    {
        return avg(self, other, generic {});
    }
}
// bitwise_and: element width is irrelevant for bitwise ops, so a single
// 256-bit AND handles every integral lane size.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_and_si256(self, other);
}
// Same operation on boolean masks.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_and_si256(self, other);
}
// bitwise_andnot: computes self & ~other.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    // Operands are swapped because _mm256_andnot_si256(a, b) = (~a) & b.
    return _mm256_andnot_si256(other, self);
}
// Same operation on boolean masks.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_andnot_si256(other, self);
}
// bitwise_not: flip every bit by XOR-ing with an all-ones register.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
    return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
}
// Same operation on boolean masks.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
{
    return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
}
// bitwise_lshift: per-element (variable) left shift counts.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    // AVX2 variable shifts exist only for 32- and 64-bit lanes
    // (vpsllvd / vpsllvq); other widths fall back to AVX.
    XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return _mm256_sllv_epi32(self, other);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        return _mm256_sllv_epi64(self, other);
    }
    else
    {
        return bitwise_lshift(self, other, avx {});
    }
}
// bitwise_or: one 256-bit OR covers every integral lane width.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_or_si256(self, other);
}
// Same operation on boolean masks.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_or_si256(self, other);
}
// bitwise_xor: one 256-bit XOR covers every integral lane width.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_xor_si256(self, other);
}
// Mask overload. Fixed to return batch_bool (it previously returned
// batch<T, A>), matching the batch_bool overloads of bitwise_and,
// bitwise_or and bitwise_not above.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
    return _mm256_xor_si256(self, other);
}
// gather template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
kernel::requires_arch<avx2>) noexcept
{ // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_epi32(reinterpret_cast<constint*>(src), index, sizeof(T));
}
// gather: 64-bit integral elements indexed by 64-bit integral indices.
template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                kernel::requires_arch<avx2>) noexcept
{
    // scatter for this one is AVX512F+AVX512VL
    // Fixed garbled cast type: "constlonglongint" -> "const long long int".
    return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
}
// gather: float elements indexed by 32-bit integral indices.
template <class A, class U,
          detail::enable_sized_integral_t<U, 4> = 0>
XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
                                    batch<U, A> const& index,
                                    kernel::requires_arch<avx2>) noexcept
{
    // scatter for this one is AVX512F+AVX512VL
    return _mm256_i32gather_ps(src, index, sizeof(float));
}
// gather: double elements indexed by 64-bit integral indices.
template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
                                     batch<U, A> const& index,
                                     requires_arch<avx2>) noexcept
{
    // scatter for this one is AVX512F+AVX512VL
    return _mm256_i64gather_pd(src, index, sizeof(double));
}
// gather: handmade conversions template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, doubleconst* src,
batch<V, A> const& index,
requires_arch<avx2>) noexcept
{ const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
}
// Gather doubles through 32-bit indices and convert the result to int32.
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                      batch<V, A> const& index,
                                      requires_arch<avx2>) noexcept
{
    // Same two-half gather as the float variant, converting each group of
    // 4 doubles to 4 int32 lanes before merging the halves.
    const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
    const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
    return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
}
// rotate_left
template <size_t N, class A>
XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
{
    // NOTE(review): _mm256_alignr_epi8 concatenates and shifts each 128-bit
    // lane independently; with both operands equal this byte-rotates each
    // lane by N rather than the full 256-bit register. Confirm this matches
    // the intended rotate semantics for 256-bit batches.
    return _mm256_alignr_epi8(self, self, N);
}
// Signed variant: reuse the unsigned implementation through bit casts.
template <size_t N, class A>
XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
{
    return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
}
// select: per-element blend, picking true_br where cond is set.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
    // _mm256_blendv_epi8 selects bytes by the top bit of each mask byte.
    // Since batch_bool lanes are all-ones or all-zeroes, the same byte-wise
    // blend is correct for every integral lane width.
    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
    {
        return _mm256_blendv_epi8(false_br, true_br, cond);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
    {
        return _mm256_blendv_epi8(false_br, true_br, cond);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return _mm256_blendv_epi8(false_br, true_br, cond);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        return _mm256_blendv_epi8(false_br, true_br, cond);
    }
    else
    {
        return select(cond, true_br, false_br, avx {});
    }
}
// Compile-time mask variant: the blend mask is an immediate.
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
    constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
    // FIXME: for some reason mask here is not considered as an immediate,
    // but it's okay for _mm256_blend_epi32
    // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
    XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return _mm256_blend_epi32(false_br, true_br, mask);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        // 64-bit lanes: widen each mask bit to two bits so a 32-bit blend
        // covers both halves of every 64-bit element.
        constexpr int imask = detail::interleave(mask);
        return _mm256_blend_epi32(false_br, true_br, imask);
    }
    else
    {
        // Other widths: fall back to the runtime-mask select.
        return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
    }
}
// slide_left: shift the whole 256-bit batch left by N bytes, filling with
// zeroes (BitCount = N * 8 bits).
template <size_t N, class A, class T>
XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
    constexpr unsigned BitCount = N * 8;
    if (BitCount == 0)
    {
        // No shift requested.
        return x;
    }
    if (BitCount >= 256)
    {
        // Everything shifted out: result is all zeroes.
        return batch<T, A>(T(0));
    }
    if (BitCount > 128)
    {
        // Shift within lanes, then move the low lane into the high lane
        // (imm 0x28 zeroes the low lane of the result).
        constexpr unsigned M = (BitCount - 128) / 8;
        auto y = _mm256_bslli_epi128(x, M);
        return _mm256_permute2x128_si256(y, y, 0x28);
    }
    if (BitCount == 128)
    {
        // Pure lane move: low lane -> high lane, low lane zeroed.
        return _mm256_permute2x128_si256(x, x, 0x28);
    }
    // shifting by [0, 128[ bits
    // Combine the intra-lane left shift with the bytes that cross the
    // 128-bit lane boundary (right-shifted remainder moved up one lane).
    constexpr unsigned M = BitCount / 8;
    auto y = _mm256_bslli_epi128(x, M);
    auto z = _mm256_bsrli_epi128(x, 16 - M);
    auto w = _mm256_permute2x128_si256(z, z, 0x28);
    return _mm256_or_si256(y, w);
}
// slide_right: shift the whole 256-bit batch right by N bytes, filling with
// zeroes (BitCount = N * 8 bits). Mirror of slide_left.
template <size_t N, class A, class T>
XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
    constexpr unsigned BitCount = N * 8;
    if (BitCount == 0)
    {
        // No shift requested.
        return x;
    }
    if (BitCount >= 256)
    {
        // Everything shifted out: result is all zeroes.
        return batch<T, A>(T(0));
    }
    if (BitCount > 128)
    {
        // Shift within lanes, then move the high lane into the low lane
        // (imm 0x81 zeroes the high lane of the result).
        constexpr unsigned M = (BitCount - 128) / 8;
        auto y = _mm256_bsrli_epi128(x, M);
        return _mm256_permute2x128_si256(y, y, 0x81);
    }
    if (BitCount == 128)
    {
        // Pure lane move: high lane -> low lane, high lane zeroed.
        return _mm256_permute2x128_si256(x, x, 0x81);
    }
    // shifting by [0, 128[ bits
    // Combine the intra-lane right shift with the bytes that cross the
    // 128-bit lane boundary (left-shifted remainder moved down one lane).
    constexpr unsigned M = BitCount / 8;
    auto y = _mm256_bsrli_epi128(x, M);
    auto z = _mm256_bslli_epi128(x, 16 - M);
    auto w = _mm256_permute2x128_si256(z, z, 0x81);
    return _mm256_or_si256(y, w);
}
/* NOTE(review): the following trailer is website boilerplate that was
   accidentally appended to this header during extraction; it is not part
   of the source and is preserved here only as a comment.

   Die Informationen auf dieser Webseite wurden nach bestem Wissen
   sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit,
   noch Richtigkeit, noch Qualität der bereitgestellten Informationen
   zugesichert.
   Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
   experimentell. */