// xsimd_neon.hpp — xsimd NEON (ARM) architecture implementation.
// Path: third_party/xsimd/include/xsimd/arch/ (from Mozilla Firefox 136.0.1).
// Language: C++ header (not C).

 
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/


#ifndef XSIMD_NEON_HPP
#define XSIMD_NEON_HPP

#include <algorithm>
#include <complex>
#include <tuple>
#include <type_traits>

#include "../types/xsimd_neon_register.hpp"
#include "../types/xsimd_utils.hpp"

// Wrap intrinsics so we can pass them as function pointers
// - OP: intrinsics name prefix, e.g., vorrq
// - RT: type traits to deduce intrinsics return types
//
// Defines wrap::OP##_u8 / _u16 / _u32, each forwarding to the global NEON
// intrinsic of the same name, so the wrapper's address can be taken and
// stored in a dispatch tuple (see detail::neon_dispatcher_base).
// 64-bit unsigned lanes are deliberately omitted; callers use this variant
// for operations that have no 64-bit form.
#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT)                                     \
    namespace wrap                                                                \
    {                                                                             \
        XSIMD_INLINE RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept  \
        {                                                                         \
            return ::OP##_u8(a, b);                                               \
        }                                                                         \
        XSIMD_INLINE RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
        {                                                                         \
            return ::OP##_u16(a, b);                                              \
        }                                                                         \
        XSIMD_INLINE RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
        {                                                                         \
            return ::OP##_u32(a, b);                                              \
        }                                                                         \
    }

// Extends WRAP_BINARY_UINT_EXCLUDING_64 with the signed counterparts
// (wrap::OP##_s8 / _s16 / _s32); still no 64-bit lanes.
#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                   \
    WRAP_BINARY_UINT_EXCLUDING_64(OP, RT)                                      \
    namespace wrap                                                             \
    {                                                                          \
        XSIMD_INLINE RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept  \
        {                                                                      \
            return ::OP##_s8(a, b);                                            \
        }                                                                      \
        XSIMD_INLINE RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
        {                                                                      \
            return ::OP##_s16(a, b);                                           \
        }                                                                      \
        XSIMD_INLINE RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
        {                                                                      \
            return ::OP##_s32(a, b);                                           \
        }                                                                      \
    }

// Full integer coverage: all signed/unsigned 8/16/32-bit wrappers plus the
// 64-bit wrap::OP##_u64 / _s64 forms.
#define WRAP_BINARY_INT(OP, RT)                                                   \
    WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                          \
    namespace wrap                                                                \
    {                                                                             \
        XSIMD_INLINE RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
        {                                                                         \
            return ::OP##_u64(a, b);                                              \
        }                                                                         \
        XSIMD_INLINE RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept    \
        {                                                                         \
            return ::OP##_s64(a, b);                                              \
        }                                                                         \
    }

// Single-precision float wrapper: wrap::OP##_f32 forwarding to the global
// NEON intrinsic.
#define WRAP_BINARY_FLOAT(OP, RT)                                                    \
    namespace wrap                                                                   \
    {                                                                                \
        XSIMD_INLINE RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
        {                                                                            \
            return ::OP##_f32(a, b);                                                 \
        }                                                                            \
    }

// Unary counterpart of the binary wrappers: wrap::OP##_{u,s}{8,16,32}
// functions forwarding one-argument NEON intrinsics.  Unlike the binary
// macros, the return type is always the argument type (no RT trait).
#define WRAP_UNARY_INT_EXCLUDING_64(OP)                         \
    namespace wrap                                              \
    {                                                           \
        XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept  \
        {                                                       \
            return ::OP##_u8(a);                                \
        }                                                       \
        XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept    \
        {                                                       \
            return ::OP##_s8(a);                                \
        }                                                       \
        XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \
        {                                                       \
            return ::OP##_u16(a);                               \
        }                                                       \
        XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept   \
        {                                                       \
            return ::OP##_s16(a);                               \
        }                                                       \
        XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \
        {                                                       \
            return ::OP##_u32(a);                               \
        }                                                       \
        XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept   \
        {                                                       \
            return ::OP##_s32(a);                               \
        }                                                       \
    }

// Full unary integer coverage: adds the 64-bit wrap::OP##_u64 / _s64 forms.
#define WRAP_UNARY_INT(OP)                                      \
    WRAP_UNARY_INT_EXCLUDING_64(OP)                             \
    namespace wrap                                              \
    {                                                           \
        XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \
        {                                                       \
            return ::OP##_u64(a);                               \
        }                                                       \
        XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept   \
        {                                                       \
            return ::OP##_s64(a);                               \
        }                                                       \
    }

// Unary single-precision float wrapper: wrap::OP##_f32.
#define WRAP_UNARY_FLOAT(OP)                                      \
    namespace wrap                                                \
    {                                                             \
        XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \
        {                                                         \
            return ::OP##_f32(a);                                 \
        }                                                         \
    }

// Dummy identity caster to ease coding.
// These no-op overloads complete the vreinterpretq_X_Y family for the case
// X == Y, so generic/macro-generated code can "reinterpret" a register to
// its own type without special-casing.
XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }

namespace xsimd
{
    // Forward declaration of the compile-time boolean mask type (defined in
    // the xsimd core headers); referenced by kernels below.
    template <typename T, class A, bool... Values>
    struct batch_bool_constant;

    namespace kernel
    {
        using namespace types;

        namespace detail
        {
            template <template <classclass return_type, class... T>
            struct neon_dispatcher_base
            {
                struct unary
                {
                    using container_type = std::tuple<return_type<T> (*)(T)...>;
                    const container_type m_func;

                    template <class U>
                    return_type<U> apply(U rhs) const noexcept
                    {
                        using func_type = return_type<U> (*)(U);
                        auto func = xsimd::detail::get<func_type>(m_func);
                        return func(rhs);
                    }
                };

                struct binary
                {
                    using container_type = std::tuple<return_type<T> (*)(T, T)...>;
                    const container_type m_func;

                    template <class U>
                    return_type<U> apply(U lhs, U rhs) const noexcept
                    {
                        using func_type = return_type<U> (*)(U, U);
                        auto func = xsimd::detail::get<func_type>(m_func);
                        return func(lhs, rhs);
                    }
                };
            };

            /***************************
             *  arithmetic dispatchers *
             ***************************/

            // Arithmetic intrinsics return the same vector type they take.
            template <class T>
            using identity_return_type = T;

            // Dispatcher whose operations preserve the argument vector type.
            template <class... T>
            struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
            {
            };

            // Every NEON q-register type: all integer widths plus float.
            using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
                                                         uint16x8_t, int16x8_t,
                                                         uint32x4_t, int32x4_t,
                                                         uint64x2_t, int64x2_t,
                                                         float32x4_t>;

            // As above, minus the 64-bit integer vector types.
            using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
                                                                    uint16x8_t, int16x8_t,
                                                                    uint32x4_t, int32x4_t,
                                                                    float32x4_t>;

            // As above, minus 64-bit integers and float.
            using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
                                                                       uint16x8_t, int16x8_t,
                                                                       uint32x4_t, int32x4_t>;

            /**************************
             * comparison dispatchers *
             **************************/

            // Comparison intrinsics return an unsigned mask vector with the
            // same lane width as their operands; this trait maps operand type
            // to mask type (e.g. int16x8_t -> uint16x8_t, float32x4_t ->
            // uint32x4_t), one explicit specialization per vector type.
            template <class T>
            struct comp_return_type_impl;

            template <>
            struct comp_return_type_impl<uint8x16_t>
            {
                using type = uint8x16_t;
            };

            template <>
            struct comp_return_type_impl<int8x16_t>
            {
                using type = uint8x16_t;
            };

            template <>
            struct comp_return_type_impl<uint16x8_t>
            {
                using type = uint16x8_t;
            };

            template <>
            struct comp_return_type_impl<int16x8_t>
            {
                using type = uint16x8_t;
            };

            template <>
            struct comp_return_type_impl<uint32x4_t>
            {
                using type = uint32x4_t;
            };

            template <>
            struct comp_return_type_impl<int32x4_t>
            {
                using type = uint32x4_t;
            };

            template <>
            struct comp_return_type_impl<uint64x2_t>
            {
                using type = uint64x2_t;
            };

            template <>
            struct comp_return_type_impl<int64x2_t>
            {
                using type = uint64x2_t;
            };

            template <>
            struct comp_return_type_impl<float32x4_t>
            {
                using type = uint32x4_t;
            };

            // Convenience alias over the trait above.
            template <class T>
            using comp_return_type = typename comp_return_type_impl<T>::type;

            // Dispatcher whose operations return the unsigned mask type.
            template <class... T>
            struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
            {
            };

            // Comparison dispatcher covering all types except the 64-bit
            // integer vectors.
            using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
                                                                              uint16x8_t, int16x8_t,
                                                                              uint32x4_t, int32x4_t,
                                                                              float32x4_t>;

            /**************************************
             * enabling / disabling metafunctions *
             **************************************/

            // SFINAE helper: enabled for any integral type or float.
            template <class T>
            using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value,
                                                               int>::type;

            // SFINAE helper: enabled for integral types whose size is not
            // 8 bytes, and for float.
            template <class T>
            using exclude_int64_neon_t
                = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type;
        }

        /*************
         * broadcast *
         *************/

        // Fill every lane of the batch with `val`.  One overload per lane
        // size/signedness, selected by SFINAE on T, each mapping to the
        // matching vdupq_n_* intrinsic.

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u8(uint8_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s8(int8_t(val));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u16(uint16_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s16(int16_t(val));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u32(uint32_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s32(int32_t(val));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_u64(uint64_t(val));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
        {
            return vdupq_n_s64(int64_t(val));
        }

        // Float needs no explicit conversion of `val`.
        template <class A>
        XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
        {
            return vdupq_n_f32(val);
        }

        /*******
         * set *
         *******/

        // Build a batch from individual scalar values (one per lane).

        // Integer batches: brace-initialize the underlying NEON vector type
        // directly from the argument pack.
        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
        {
            return xsimd::types::detail::neon_vector_type<T> { args... };
        }

        // Integer boolean batches: each truthy argument becomes an
        // all-ones lane (-1 cast to the unsigned lane type), each falsy one
        // an all-zeros lane.
        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            using unsigned_type = as_unsigned_integer_t<T>;
            return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
        }

        template <class A>
        XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
        {
            return float32x4_t { f0, f1, f2, f3 };
        }

        // Complex batches are stored as two separate registers (real, imag).
        template <class A>
        XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
                                                       std::complex<float> c0, std::complex<float> c1,
                                                       std::complex<float> c2, std::complex<float> c3) noexcept
        {
            return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
                                                 float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
        }

        // Float boolean batches: same all-ones/all-zeros lane encoding as the
        // integer case, using the unsigned type of the same width as float.
        template <class A, class... Args>
        XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
        {
            using register_type = typename batch_bool<float, A>::register_type;
            using unsigned_type = as_unsigned_integer_t<float>;
            return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
        }

        /*************
         * from_bool *
         *************/

        // Convert a boolean mask (all-ones / all-zeros lanes) to an
        // arithmetic batch of 1s and 0s by AND-ing with a splat of 1.
        // Signed variants first reinterpret the unsigned mask register to the
        // signed type; the float variant masks the bit pattern of 1.0f.

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u8(arg, vdupq_n_u8(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u16(arg, vdupq_n_u16(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u32(arg, vdupq_n_u32(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_u64(arg, vdupq_n_u64(1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
        }

        // Float: AND the mask with the bit pattern of 1.0f, then reinterpret
        // back to a float register.
        template <class A>
        XSIMD_INLINE batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
        {
            return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
        }

        /********
         * load *
         ********/

        // It is not possible to use a call to A::alignment() here, so use an
        // immediate instead.
        // xsimd_aligned_load communicates the 16-byte alignment guarantee to
        // the compiler where the toolchain can express it:
        //  - GCC/Clang: __builtin_assume_aligned on the pointer;
        //  - MSVC: the *_ex intrinsic variant with a 128-bit alignment hint;
        //  - otherwise: fall back to a plain (cast-only) load.
#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#elif defined(_MSC_VER)
#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif

        // One load_aligned overload per lane size/signedness, each mapping to
        // the matching vld1q_* intrinsic via the macro above.
        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_s8, int8_t*, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_s16, int16_t*, src);
        }
        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_s32, int32_t*, src);
        }
        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_s64, int64_t*, src);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
        {
            return xsimd_aligned_load(vld1q_f32, float*, src);
        }

// Helper macro is load-only; keep it out of the rest of the file.
#undef xsimd_aligned_load

        // Unaligned loads: plain vld1q_* with no alignment assumption, one
        // overload per lane size/signedness.
        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u8((uint8_t*)src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s8((int8_t*)src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u16((uint16_t*)src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s16((int16_t*)src);
        }
        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u32((uint32_t*)src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s32((int32_t*)src);
        }
        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_u64((uint64_t*)src);
        }
        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
        {
            return vld1q_s64((int64_t*)src);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
        {
            return vld1q_f32(src);
        }

        /*********
         * store *
         *********/

        // Store the batch to memory with vst1q_*, one overload per lane
        // size/signedness.  No aligned-store hint is used here (unlike the
        // load path above).
        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u8((uint8_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s8((int8_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u16((uint16_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s16((int16_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u32((uint32_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s32((int32_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_u64((uint64_t*)dst, src);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_s64((int64_t*)dst, src);
        }

        template <class A>
        XSIMD_INLINE void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
        {
            vst1q_f32(dst, src);
        }

        // Unaligned store simply reuses the aligned implementation above,
        // which makes no alignment assumption (plain vst1q_*).
        template <class A, class T>
        XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
        {
            store_aligned<A>(dst, src, A {});
        }

        /****************
         * load_complex *
         ****************/


        template <class A>
        XSIMD_INLINE batch<std::complex<float>, A> load_complex_aligned(std::complex<floatconstmem, convert<std::complex<float>>, requires_arch<neon>) noexcept
        {
            using real_batch = batch<float, A>;
            const float* buf = reinterpret_cast<const float*>(mem);
            float32x4x2_t tmp = vld2q_f32(buf);
            real_batch real = tmp.val[0],
                       imag = tmp.val[1];
            return batch<std::complex<float>, A> { real, imag };
        }

        template <class A>
        XSIMD_INLINE batch<std::complex<float>, A> load_complex_unaligned(std::complex<floatconst* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
        {
            return load_complex_aligned<A>(mem, cvt, A {});
        }

        /*****************
         * store_complex *
         *****************/

        // Store a complex batch back to interleaved std::complex<float>
        // memory: vst2q_f32 interleaves val[0] (reals) and val[1]
        // (imaginaries), the inverse of vld2q_f32 above.
        template <class A>
        XSIMD_INLINE void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
        {
            float32x4x2_t tmp;
            tmp.val[0] = src.real();
            tmp.val[1] = src.imag();
            float* buf = reinterpret_cast<float*>(dst);
            vst2q_f32(buf, tmp);
        }

        // Unaligned complex store reuses the aligned implementation.
        template <class A>
        XSIMD_INLINE void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
        {
            store_complex_aligned(dst, src, A {});
        }

        /*******
         * neg *
         *******/

        // Lane-wise negation.  Unsigned types are negated by reinterpreting
        // to the signed type, applying vnegq_s*, and reinterpreting back
        // (two's-complement negation is bit-identical either way).  64-bit
        // lanes fall back to scalar negation — no vnegq_*64 is used here.
        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_s8(rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_s16(rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_s32(rhs);
        }

        // 64-bit lanes: element-wise scalar fallback.
        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
        }

        template <class A>
        XSIMD_INLINE batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vnegq_f32(rhs);
        }

        /*******
         * add *
         *******/


        WRAP_BINARY_INT(vaddq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
                                wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
                                wrap::vaddq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * avg *
         *******/


        WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)

        template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
        XSIMD_INLINE batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
                std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /********
         * avgr *
         ********/


        WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)

        template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
        XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
                std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /********
         * sadd *
         ********/


        WRAP_BINARY_INT(vqaddq, detail::identity_return_type)

        // Saturating addition, dispatched to vqaddq on the integer register
        // type. NOTE: the float slot deliberately uses the plain
        // wrap::vaddq_f32 — there is no saturating float add, and float
        // addition already clamps to +/-inf.
        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
                                wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
                                wrap::vaddq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * sub *
         *******/


        WRAP_BINARY_INT(vsubq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
                                wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
                                wrap::vsubq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /********
         * ssub *
         ********/


        WRAP_BINARY_INT(vqsubq, detail::identity_return_type)

        // Saturating subtraction, dispatched to vqsubq on the integer
        // register type. NOTE: the float slot deliberately uses the plain
        // wrap::vsubq_f32 — there is no saturating float subtract, and
        // float subtraction already clamps to +/-inf.
        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
                                wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
                                wrap::vsubq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * mul *
         *******/


        WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
                                wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        /*******
         * div *
         *******/


#if defined(XSIMD_FAST_INTEGER_DIVISION)
        // Opt-in fast 32-bit integer division: round-trip through float32.
        // vcvtq_{s,u}32_f32 truncates toward zero like C integer division,
        // but float32 only has a 24-bit mantissa, so quotients involving
        // magnitudes above 2^24 may be inexact — hence the
        // XSIMD_FAST_INTEGER_DIVISION guard.
        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
        }

        // Unsigned counterpart of the fast division above.
        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
        }
#endif

        // Float division. This NEON target has no vector divide instruction,
        // so the quotient is computed as lhs * (1 / rhs) with a refined
        // reciprocal.
        template <class A>
        XSIMD_INLINE batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
        {
            // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
            // get an initial estimate of 1/b.
            float32x4_t rcp = reciprocal(rhs);

            // use a couple Newton-Raphson steps to refine the estimate.  Depending on your
            // application's accuracy requirements, you may be able to get away with only
            // one refinement (instead of the two used here).  Be sure to test!
            // vrecpsq_f32(b, x) computes (2 - b * x), the NR correction factor.
            rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
            rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);

            // and finally, compute a / b = a * (1 / b)
            return vmulq_f32(lhs, rcp);
        }

        /******
         * eq *
         ******/


        WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
                                wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
            const dispatcher_type dispatcher = {
                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
        }

        /*************
         * fast_cast *
         *************/


        namespace detail
        {
            template <class A>
            XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&,&nbsp;requires_arch<neon>) noexcept
            {
                return vcvtq_f32_s32(self);
            }

            template <class A>
            XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_f32_u32(self);
            }

            template <class A>
            XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_s32_f32(self);
            }

            template <class A>
            XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
            {
                return vcvtq_u32_f32(self);
            }

        }

        /******
         * lt *
         ******/


        WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
                                wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
        }

        /******
         * le *
         ******/


        WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
                                wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
        }

        /******
         * gt *
         ******/


        WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
                                wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
        }

        /******
         * ge *
         ******/


        WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
        WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
                                wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
        }

        /*******************
         * batch_bool_cast *
         *******************/


        template <class A, class T_out, class T_in>
        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T_out, A>::register_type;
            return register_type(self);
        }

        /***************
         * bitwise_and *
         ***************/


        WRAP_BINARY_INT(vandq, detail::identity_return_type)

        namespace detail
        {
            XSIMD_INLINE float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            V bitwise_and_neon(V const& lhs, V const& rhs)
            {
                const neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
                                    wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
                                    bitwise_and_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
        }

        /**************
         * bitwise_or *
         **************/


        WRAP_BINARY_INT(vorrq, detail::identity_return_type)

        namespace detail
        {
            XSIMD_INLINE float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
            {
                const neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
                                    wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
                                    bitwise_or_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
        }

        /***************
         * bitwise_xor *
         ***************/


        WRAP_BINARY_INT(veorq, detail::identity_return_type)

        namespace detail
        {
            XSIMD_INLINE float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }

            template <class V>
            XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
            {
                const neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
                                    wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
                                    bitwise_xor_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
        }

        /*******
         * neq *
         *******/


        // Lane-wise "not equal" on boolean masks: mask lanes are all-ones /
        // all-zeros bit patterns, so XOR yields all-ones exactly where the
        // two masks differ.
        template <class A, class T>
        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return bitwise_xor(lhs, rhs, A {});
        }

        /***************
         * bitwise_not *
         ***************/


        WRAP_UNARY_INT_EXCLUDING_64(vmvnq)

        namespace detail
        {
            // There is no vmvnq for 64-bit lanes, so 64-bit values are
            // complemented through a 32-bit reinterpretation (bitwise NOT is
            // lane-width agnostic).
            XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
            {
                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
            }

            XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
            {
                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
            }

            // Float lanes: complement the bit pattern through u32.
            XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
            {
                return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
            }

            // Register-level NOT used by both batch and batch_bool wrappers.
            // Note the tuple order must match the dispatcher's expected
            // (u8, s8, u16, s16, u32, s32, u64, s64, f32) slot order.
            template <class V>
            XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
            {
                const neon_dispatcher::unary dispatcher = {
                    std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
                                    wrap::vmvnq_u32, wrap::vmvnq_s32,
                                    bitwise_not_u64, bitwise_not_s64,
                                    bitwise_not_f32)
                };
                return dispatcher.apply(arg);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_not_neon(register_type(arg));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_not_neon(register_type(arg));
        }

        /******************
         * bitwise_andnot *
         ******************/


        WRAP_BINARY_INT(vbicq, detail::identity_return_type)

        namespace detail
        {
            // vbicq (bit clear) computes lhs & ~rhs — note the operand
            // order: the *second* operand is the one complemented.
            XSIMD_INLINE float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
            {
                return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
            }

            // Register-level AND-NOT used by both batch and batch_bool
            // wrappers.
            template <class V>
            XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
            {
                const detail::neon_dispatcher::binary dispatcher = {
                    std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
                                    wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
                                    bitwise_andnot_f32)
                };
                return dispatcher.apply(lhs, rhs);
            }
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_neon_type_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
        }

        /*******
         * min *
         *******/


        WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
                                wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
        }

        /*******
         * max *
         *******/


        WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
        WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::binary dispatcher = {
                std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
                                wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
            };
            return dispatcher.apply(register_type(lhs), register_type(rhs));
        }

        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
            return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
        }

        /*******
         * abs *
         *******/


        namespace wrap
        {
            // Wrapped so the intrinsics can be passed as function pointers
            // to the dispatch table in abs() below (see the file-header
            // comment on the WRAP_* macros).
            XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
            XSIMD_INLINE int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
            XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
        }
        WRAP_UNARY_FLOAT(vabsq)

        namespace detail
        {
            // abs() is the identity for unsigned integers; these no-op
            // helpers fill the unsigned slots of the dispatch table in
            // abs() below.
            XSIMD_INLINE uint8x16_t abs_u8(uint8x16_t arg) noexcept
            {
                return arg;
            }

            XSIMD_INLINE uint16x8_t abs_u16(uint16x8_t arg) noexcept
            {
                return arg;
            }

            XSIMD_INLINE uint32x4_t abs_u32(uint32x4_t arg) noexcept
            {
                return arg;
            }
        }

        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::excluding_int64_dispatcher::unary dispatcher = {
                std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
                                detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
            };
            return dispatcher.apply(register_type(arg));
        }

        /********
         * rsqrt *
         ********/


        // Reciprocal square root. vrsqrteq_f32 is only the hardware
        // *estimate* instruction — no Newton-Raphson refinement is applied
        // here, so precision is limited (sqrt() below shows the refined
        // variant).
        template <class A>
        XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
        {
            return vrsqrteq_f32(arg);
        }

        /********
         * sqrt *
         ********/


        // sqrt(x) computed as x * rsqrt(x), with Newton-Raphson refinement.
        template <class A>
        XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
        {
            // Hardware estimate of 1/sqrt(arg).
            batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
            // One Newton-Raphson step (vrsqrtsq_f32 yields the correction
            // factor (3 - a * b) / 2).
            sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
            // sqrt(x) = x * (1/sqrt(x)), with a second refinement folded in.
            batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
            // The rsqrt estimate of 0 is infinite, which would propagate
            // 0 * inf = NaN into sqrt_approx, so force sqrt(0) == 0.
            batch<float, A> zero(0.f);
            return select(arg == zero, zero, sqrt_approx);
        }

        /********************
         * Fused operations *
         ********************/


#ifdef __ARM_FEATURE_FMA
        // Fused multiply-add: x * y + z in a single rounding step.
        // vfmaq_f32(acc, a, b) computes acc + a * b, hence z goes first.
        template <class A>
        XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
        {
            return vfmaq_f32(z, x, y);
        }

        // Fused multiply-subtract: x * y - z, expressed as a fused
        // multiply-add onto the negated accumulator.
        template <class A>
        XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
        {
            return vfmaq_f32(-z, x, y);
        }
#endif

        /*********
         * haddp *
         *********/


        // Horizontal add of four rows: returns
        // (sum(row[0]), sum(row[1]), sum(row[2]), sum(row[3])).
        // vpadd_f32(a, b) adds *adjacent* lane pairs: (a0+a1, b0+b1).
        template <class A>
        XSIMD_INLINE batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
        {
            // row = (a,b,c,d)
            float32x2_t tmp1, tmp2, tmp3;
            // tmp1 = (a0 + a1, a2 + a3)
            tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
            // tmp2 = (b0 + b1, b2 + b3)
            tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
            // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3)
            tmp1 = vpadd_f32(tmp1, tmp2);
            // tmp2 = (c0 + c1, c2 + c3)
            tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
            // tmp3 = (d0 + d1, d2 + d3)
            tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
            // tmp2 = (c0+c1+c2+c3, d0+d1+d2+d3)
            tmp2 = vpadd_f32(tmp2, tmp3);
            // result = (sum a, sum b, sum c, sum d)
            return vcombine_f32(tmp1, tmp2);
        }

        /**************
         * reciprocal *
         **************/


        // Fast reciprocal: returns the raw vrecpeq_f32 hardware estimate
        // (a low-precision approximation per the ARM architecture reference);
        // no Newton-Raphson refinement is applied here.
        template <class A>
        XSIMD_INLINE batch<float, A>
        reciprocal(const batch<float, A>& x,
                   kernel::requires_arch<neon>) noexcept
        {
            return vrecpeq_f32(x);
        }

        /**********
         * insert *
         **********/


        // Insert `val` into lane I of an unsigned 8-bit batch (other lanes unchanged).
        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u8(val, self, I);
        }

        // Insert `val` into lane I of a signed 8-bit batch (other lanes unchanged).
        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s8(val, self, I);
        }

        // Insert `val` into lane I of an unsigned 16-bit batch (other lanes unchanged).
        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u16(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<int16_t, A> insert(batch<int16_t, A> const& self, int16_t val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s16(val, self, I);
        }

        // Insert `val` into lane I of an unsigned 32-bit batch (other lanes unchanged).
        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_u32(val, self, I);
        }

        // Insert `val` into lane I of a signed 32-bit batch (other lanes unchanged).
        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
        {
            return vsetq_lane_s32(val, self, I);
        }

        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=96 H=98 G=96

¤ Dauer der Verarbeitung: 0.66 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.