/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2018-2020 Evan Nemerson <evan@nemerson.com>
* 2020 Michael R. Crusoe <crusoe@debian.org>
*/
#include "sse.h"
#if !defined (SIMDE_X86_AVX_H)
#define SIMDE_X86_AVX_H
#include "sse4.2.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
typedef union {
#if defined (SIMDE_VECTOR_SUBSCRIPT)
SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#if defined (SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
SIMDE_ALIGN_TO_32 int8_t i8[32];
SIMDE_ALIGN_TO_32 int16_t i16[16];
SIMDE_ALIGN_TO_32 int32_t i32[8];
SIMDE_ALIGN_TO_32 int64_t i64[4];
SIMDE_ALIGN_TO_32 uint8_t u8[32];
SIMDE_ALIGN_TO_32 uint16_t u16[16];
SIMDE_ALIGN_TO_32 uint32_t u32[8];
SIMDE_ALIGN_TO_32 uint64_t u64[4];
SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof (int_fast32_t)];
SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof (uint_fast32_t)];
#if defined (SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128[2];
SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32[8];
SIMDE_ALIGN_TO_32 simde_float64 f64[4];
#endif
SIMDE_ALIGN_TO_32 simde__m128_private m128_private[2];
SIMDE_ALIGN_TO_32 simde__m128 m128[2];
#if defined (SIMDE_X86_AVX_NATIVE)
SIMDE_ALIGN_TO_32 __m256 n;
#elif defined (SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char ) altivec_u8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short ) altivec_u16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int ) altivec_u32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char ) altivec_i8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short ) altivec_i16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int ) altivec_i32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float ) altivec_f32[2];
#if defined (SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long ) altivec_u64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long ) altivec_i64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double ) altivec_f64[2];
#endif
#endif
} simde__m256_private;
typedef union {
#if defined (SIMDE_VECTOR_SUBSCRIPT)
SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#if defined (SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
SIMDE_ALIGN_TO_32 int8_t i8[32];
SIMDE_ALIGN_TO_32 int16_t i16[16];
SIMDE_ALIGN_TO_32 int32_t i32[8];
SIMDE_ALIGN_TO_32 int64_t i64[4];
SIMDE_ALIGN_TO_32 uint8_t u8[32];
SIMDE_ALIGN_TO_32 uint16_t u16[16];
SIMDE_ALIGN_TO_32 uint32_t u32[8];
SIMDE_ALIGN_TO_32 uint64_t u64[4];
#if defined (SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128[2];
SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32[8];
SIMDE_ALIGN_TO_32 simde_float64 f64[4];
SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof (int_fast32_t)];
SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof (uint_fast32_t)];
#endif
SIMDE_ALIGN_TO_32 simde__m128d_private m128d_private[2];
SIMDE_ALIGN_TO_32 simde__m128d m128d[2];
#if defined (SIMDE_X86_AVX_NATIVE)
SIMDE_ALIGN_TO_32 __m256d n;
#elif defined (SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char ) altivec_u8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short ) altivec_u16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int ) altivec_u32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char ) altivec_i8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short ) altivec_i16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int ) altivec_i32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float ) altivec_f32[2];
#if defined (SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long ) altivec_u64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long ) altivec_i64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double ) altivec_f64[2];
#endif
#endif
} simde__m256d_private;
typedef union {
#if defined (SIMDE_VECTOR_SUBSCRIPT)
SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#if defined (SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
SIMDE_ALIGN_TO_32 int8_t i8[32];
SIMDE_ALIGN_TO_32 int16_t i16[16];
SIMDE_ALIGN_TO_32 int32_t i32[8];
SIMDE_ALIGN_TO_32 int64_t i64[4];
SIMDE_ALIGN_TO_32 uint8_t u8[32];
SIMDE_ALIGN_TO_32 uint16_t u16[16];
SIMDE_ALIGN_TO_32 uint32_t u32[8];
SIMDE_ALIGN_TO_32 uint64_t u64[4];
SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof (int_fast32_t)];
SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof (uint_fast32_t)];
#if defined (SIMDE_HAVE_INT128_)
SIMDE_ALIGN_TO_32 simde_int128 i128[2];
SIMDE_ALIGN_TO_32 simde_uint128 u128[2];
#endif
SIMDE_ALIGN_TO_32 simde_float32 f32[8];
SIMDE_ALIGN_TO_32 simde_float64 f64[4];
#endif
SIMDE_ALIGN_TO_32 simde__m128i_private m128i_private[2];
SIMDE_ALIGN_TO_32 simde__m128i m128i[2];
#if defined (SIMDE_X86_AVX_NATIVE)
SIMDE_ALIGN_TO_32 __m256i n;
#elif defined (SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char ) altivec_u8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short ) altivec_u16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int ) altivec_u32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char ) altivec_i8[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short ) altivec_i16[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int ) altivec_i32[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float ) altivec_f32[2];
#if defined (SIMDE_POWER_ALTIVEC_P7_NATIVE)
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long ) altivec_u64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long ) altivec_i64[2];
SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double ) altivec_f64[2];
#endif
#endif
} simde__m256i_private;
#if defined (SIMDE_X86_AVX_NATIVE)
typedef __m256 simde__m256;
typedef __m256i simde__m256i;
typedef __m256d simde__m256d;
#elif defined (SIMDE_VECTOR_SUBSCRIPT)
typedef simde_float32 simde__m256 SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
typedef int_fast32_t simde__m256i SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
typedef simde_float64 simde__m256d SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS;
#else
typedef simde__m256_private simde__m256;
typedef simde__m256i_private simde__m256i;
typedef simde__m256d_private simde__m256d;
#endif
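/* The public types above use one of three representations: the native
 * AVX types when they are available, GCC-style vector extensions when
 * the compiler supports subscripted vectors, and the private unions as
 * a portable fallback. All three are 32 bytes wide, which the static
 * assertions below verify. */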
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#if !defined (HEDLEY_INTEL_VERSION) && !defined (_AVXINTRIN_H_INCLUDED) && !defined (__AVXINTRIN_H) && !defined (_CMP_EQ_OQ)
typedef simde__m256 __m256;
typedef simde__m256i __m256i;
typedef simde__m256d __m256d;
#else
#undef __m256
#define __m256 simde__m256
#undef __m256i
#define __m256i simde__m256i
#undef __m256d
#define __m256d simde__m256d
#endif
#endif
HEDLEY_STATIC_ASSERT(32 == sizeof (simde__m256), "simde__m256 size incorrect" );
HEDLEY_STATIC_ASSERT(32 == sizeof (simde__m256_private), "simde__m256_private size incorrect" );
HEDLEY_STATIC_ASSERT(32 == sizeof (simde__m256i), "simde__m256i size incorrect" );
HEDLEY_STATIC_ASSERT(32 == sizeof (simde__m256i_private), "simde__m256i_private size incorrect" );
HEDLEY_STATIC_ASSERT(32 == sizeof (simde__m256d), "simde__m256d size incorrect" );
HEDLEY_STATIC_ASSERT(32 == sizeof (simde__m256d_private), "simde__m256d_private size incorrect" );
#if defined (SIMDE_CHECK_ALIGNMENT) && defined (SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256) == 32, "simde__m256 is not 32-byte aligned" );
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256_private) == 32, "simde__m256_private is not 32-byte aligned" );
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i) == 32, "simde__m256i is not 32-byte aligned" );
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i_private) == 32, "simde__m256i_private is not 32-byte aligned" );
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d) == 32, "simde__m256d is not 32-byte aligned" );
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d_private) == 32, "simde__m256d_private is not 32-byte aligned" );
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde__m256_from_private(simde__m256_private v) {
simde__m256 r;
simde_memcpy(&r, &v, sizeof (r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256_private
simde__m256_to_private(simde__m256 v) {
simde__m256_private r;
simde_memcpy(&r, &v, sizeof (r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde__m256i_from_private(simde__m256i_private v) {
simde__m256i r;
simde_memcpy(&r, &v, sizeof (r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i_private
simde__m256i_to_private(simde__m256i v) {
simde__m256i_private r;
simde_memcpy(&r, &v, sizeof (r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde__m256d_from_private(simde__m256d_private v) {
simde__m256d r;
simde_memcpy(&r, &v, sizeof (r));
return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d_private
simde__m256d_to_private(simde__m256d v) {
simde__m256d_private r;
simde_memcpy(&r, &v, sizeof (r));
return r;
}
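/* The *_from_private/*_to_private helpers convert between the public
 * and private representations via simde_memcpy, which is well-defined
 * for any two equally sized trivially copyable types and which
 * compilers typically optimize away entirely. */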
#define SIMDE_CMP_EQ_OQ 0
#define SIMDE_CMP_LT_OS 1
#define SIMDE_CMP_LE_OS 2
#define SIMDE_CMP_UNORD_Q 3
#define SIMDE_CMP_NEQ_UQ 4
#define SIMDE_CMP_NLT_US 5
#define SIMDE_CMP_NLE_US 6
#define SIMDE_CMP_ORD_Q 7
#define SIMDE_CMP_EQ_UQ 8
#define SIMDE_CMP_NGE_US 9
#define SIMDE_CMP_NGT_US 10
#define SIMDE_CMP_FALSE_OQ 11
#define SIMDE_CMP_NEQ_OQ 12
#define SIMDE_CMP_GE_OS 13
#define SIMDE_CMP_GT_OS 14
#define SIMDE_CMP_TRUE_UQ 15
#define SIMDE_CMP_EQ_OS 16
#define SIMDE_CMP_LT_OQ 17
#define SIMDE_CMP_LE_OQ 18
#define SIMDE_CMP_UNORD_S 19
#define SIMDE_CMP_NEQ_US 20
#define SIMDE_CMP_NLT_UQ 21
#define SIMDE_CMP_NLE_UQ 22
#define SIMDE_CMP_ORD_S 23
#define SIMDE_CMP_EQ_US 24
#define SIMDE_CMP_NGE_UQ 25
#define SIMDE_CMP_NGT_UQ 26
#define SIMDE_CMP_FALSE_OS 27
#define SIMDE_CMP_NEQ_OS 28
#define SIMDE_CMP_GE_OQ 29
#define SIMDE_CMP_GT_OQ 30
#define SIMDE_CMP_TRUE_US 31
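/* The comparison predicates above use the AVX encoding: "O" (ordered)
 * predicates are false when either operand is NaN while "U" (unordered)
 * predicates are true, and "Q" (quiet) predicates do not raise an
 * invalid-operation exception on quiet NaNs while "S" (signaling)
 * predicates do. Illustrative sketch (simde_mm256_cmp_ps is defined
 * later in this header, not in this section):
 *
 *   simde__m256 x = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0));
 *   simde__m256 y = simde_mm256_set1_ps(SIMDE_FLOAT32_C(2.0));
 *   // every lane would be all-zeros, since 1.0 == 2.0 is false:
 *   // simde__m256 r = simde_mm256_cmp_ps(x, y, SIMDE_CMP_EQ_OQ);
 */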
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) && !defined (_CMP_EQ_OQ)
#define _CMP_EQ_OQ SIMDE_CMP_EQ_OQ
#define _CMP_LT_OS SIMDE_CMP_LT_OS
#define _CMP_LE_OS SIMDE_CMP_LE_OS
#define _CMP_UNORD_Q SIMDE_CMP_UNORD_Q
#define _CMP_NEQ_UQ SIMDE_CMP_NEQ_UQ
#define _CMP_NLT_US SIMDE_CMP_NLT_US
#define _CMP_NLE_US SIMDE_CMP_NLE_US
#define _CMP_ORD_Q SIMDE_CMP_ORD_Q
#define _CMP_EQ_UQ SIMDE_CMP_EQ_UQ
#define _CMP_NGE_US SIMDE_CMP_NGE_US
#define _CMP_NGT_US SIMDE_CMP_NGT_US
#define _CMP_FALSE_OQ SIMDE_CMP_FALSE_OQ
#define _CMP_NEQ_OQ SIMDE_CMP_NEQ_OQ
#define _CMP_GE_OS SIMDE_CMP_GE_OS
#define _CMP_GT_OS SIMDE_CMP_GT_OS
#define _CMP_TRUE_UQ SIMDE_CMP_TRUE_UQ
#define _CMP_EQ_OS SIMDE_CMP_EQ_OS
#define _CMP_LT_OQ SIMDE_CMP_LT_OQ
#define _CMP_LE_OQ SIMDE_CMP_LE_OQ
#define _CMP_UNORD_S SIMDE_CMP_UNORD_S
#define _CMP_NEQ_US SIMDE_CMP_NEQ_US
#define _CMP_NLT_UQ SIMDE_CMP_NLT_UQ
#define _CMP_NLE_UQ SIMDE_CMP_NLE_UQ
#define _CMP_ORD_S SIMDE_CMP_ORD_S
#define _CMP_EQ_US SIMDE_CMP_EQ_US
#define _CMP_NGE_UQ SIMDE_CMP_NGE_UQ
#define _CMP_NGT_UQ SIMDE_CMP_NGT_UQ
#define _CMP_FALSE_OS SIMDE_CMP_FALSE_OS
#define _CMP_NEQ_OS SIMDE_CMP_NEQ_OS
#define _CMP_GE_OQ SIMDE_CMP_GE_OQ
#define _CMP_GT_OQ SIMDE_CMP_GT_OQ
#define _CMP_TRUE_US SIMDE_CMP_TRUE_US
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_castps_pd (simde__m256 a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castps_pd(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castps_pd
#define _mm256_castps_pd(a) simde_mm256_castps_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_castps_si256 (simde__m256 a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castps_si256(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castps_si256
#define _mm256_castps_si256(a) simde_mm256_castps_si256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_castsi256_pd (simde__m256i a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castsi256_pd(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castsi256_pd
#define _mm256_castsi256_pd(a) simde_mm256_castsi256_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_castsi256_ps (simde__m256i a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castsi256_ps(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castsi256_ps
#define _mm256_castsi256_ps(a) simde_mm256_castsi256_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_castpd_ps (simde__m256d a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castpd_ps(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd_ps
#define _mm256_castpd_ps(a) simde_mm256_castpd_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_castpd_si256 (simde__m256d a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castpd_si256(a);
#else
return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd_si256
#define _mm256_castpd_si256(a) simde_mm256_castpd_si256(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_setzero_si256 (void ) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_setzero_si256();
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_setzero_si128();
r_.m128i[1] = simde_mm_setzero_si128();
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = 0;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setzero_si256
#define _mm256_setzero_si256() simde_mm256_setzero_si256()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_setzero_ps (void ) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_setzero_ps();
#else
return simde_mm256_castsi256_ps(simde_mm256_setzero_si256());
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setzero_ps
#define _mm256_setzero_ps() simde_mm256_setzero_ps()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_setzero_pd (void ) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_setzero_pd();
#else
return simde_mm256_castsi256_pd(simde_mm256_setzero_si256());
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_setzero_pd
#define _mm256_setzero_pd() simde_mm256_setzero_pd()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_not_ps(simde__m256 a) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
#if defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = ~a_.i32;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128[0] = simde_x_mm_not_ps(a_.m128[0]);
r_.m128[1] = simde_x_mm_not_ps(a_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = ~(a_.i32[i]);
}
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_select_ps(simde__m256 a, simde__m256 b, simde__m256 mask) {
/* This function is for when you want to blend two vectors together
 * according to a mask. It is similar to _mm256_blendv_ps, except that
 * it is unspecified whether the blend is based on the highest bit in
 * each lane (like blendv) or on plain bitwise operations. This allows
 * us to implement the function efficiently everywhere.
 *
 * Basically, you promise that every lane in the mask is either 0 or
 * ~0. */
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_ps(a, b, mask);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b),
mask_ = simde__m256_to_private(mask);
#if defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128[0] = simde_x_mm_select_ps(a_.m128[0], b_.m128[0], mask_.m128[0]);
r_.m128[1] = simde_x_mm_select_ps(a_.m128[1], b_.m128[1], mask_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
}
#endif
return simde__m256_from_private(r_);
#endif
}
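/* Usage sketch (illustrative, not part of the API contract): the mask
 * must have every lane all-zeros or all-ones, e.g. a comparison result.
 * Lanes come from b where the mask is set and from a elsewhere:
 *
 *   simde__m256 m = simde_x_mm256_setone_ps();          // all bits set
 *   simde__m256 r = simde_x_mm256_select_ps(a, b, m);   // r == b
 */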
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_not_pd(simde__m256d a) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
#if defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = ~a_.i64;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128d[0] = simde_x_mm_not_pd(a_.m128d[0]);
r_.m128d[1] = simde_x_mm_not_pd(a_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = ~(a_.i64[i]);
}
#endif
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_select_pd(simde__m256d a, simde__m256d b, simde__m256d mask) {
/* This function is for when you want to blend two vectors together
 * according to a mask. It is similar to _mm256_blendv_pd, except that
 * it is unspecified whether the blend is based on the highest bit in
 * each lane (like blendv) or on plain bitwise operations. This allows
 * us to implement the function efficiently everywhere.
 *
 * Basically, you promise that every lane in the mask is either 0 or
 * ~0. */
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_pd(a, b, mask);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b),
mask_ = simde__m256d_to_private(mask);
#if defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
r_.m128d[0] = simde_x_mm_select_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]);
r_.m128d[1] = simde_x_mm_select_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
}
#endif
return simde__m256d_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_setone_si256 (void ) {
simde__m256i_private r_;
#if defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
__typeof__(r_.i32f) rv = { 0, };
r_.i32f = ~rv;
#elif defined (SIMDE_X86_AVX2_NATIVE)
__m256i t = _mm256_setzero_si256();
r_.n = _mm256_cmpeq_epi32(t, t);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_setone_ps (void ) {
return simde_mm256_castsi256_ps(simde_x_mm256_setone_si256());
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_setone_pd (void ) {
return simde_mm256_castsi256_pd(simde_x_mm256_setone_si256());
}
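/* Note that the setone helpers produce a vector with every bit set;
 * reinterpreted as floating point the lanes are NaN, so these values
 * are useful as "all true" masks rather than as numbers. */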
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi8 (int8_t e31, int8_t e30, int8_t e29, int8_t e28,
int8_t e27, int8_t e26, int8_t e25, int8_t e24,
int8_t e23, int8_t e22, int8_t e21, int8_t e20,
int8_t e19, int8_t e18, int8_t e17, int8_t e16,
int8_t e15, int8_t e14, int8_t e13, int8_t e12,
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
int8_t e3, int8_t e2, int8_t e1, int8_t e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16,
e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi8(
e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
r_.m128i[1] = simde_mm_set_epi8(
e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16);
#else
r_.i8[ 0] = e0;
r_.i8[ 1] = e1;
r_.i8[ 2] = e2;
r_.i8[ 3] = e3;
r_.i8[ 4] = e4;
r_.i8[ 5] = e5;
r_.i8[ 6] = e6;
r_.i8[ 7] = e7;
r_.i8[ 8] = e8;
r_.i8[ 9] = e9;
r_.i8[10] = e10;
r_.i8[11] = e11;
r_.i8[12] = e12;
r_.i8[13] = e13;
r_.i8[14] = e14;
r_.i8[15] = e15;
r_.i8[16] = e16;
r_.i8[17] = e17;
r_.i8[18] = e18;
r_.i8[19] = e19;
r_.i8[20] = e20;
r_.i8[21] = e21;
r_.i8[22] = e22;
r_.i8[23] = e23;
r_.i8[24] = e24;
r_.i8[25] = e25;
r_.i8[26] = e26;
r_.i8[27] = e27;
r_.i8[28] = e28;
r_.i8[29] = e29;
r_.i8[30] = e30;
r_.i8[31] = e31;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi8
#define _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi16 (int16_t e15, int16_t e14, int16_t e13, int16_t e12,
int16_t e11, int16_t e10, int16_t e9, int16_t e8,
int16_t e7, int16_t e6, int16_t e5, int16_t e4,
int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi16( e7, e6, e5, e4, e3, e2, e1, e0);
r_.m128i[1] = simde_mm_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8);
#else
r_.i16[ 0] = e0;
r_.i16[ 1] = e1;
r_.i16[ 2] = e2;
r_.i16[ 3] = e3;
r_.i16[ 4] = e4;
r_.i16[ 5] = e5;
r_.i16[ 6] = e6;
r_.i16[ 7] = e7;
r_.i16[ 8] = e8;
r_.i16[ 9] = e9;
r_.i16[10] = e10;
r_.i16[11] = e11;
r_.i16[12] = e12;
r_.i16[13] = e13;
r_.i16[14] = e14;
r_.i16[15] = e15;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi16
#define _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi32 (int32_t e7, int32_t e6, int32_t e5, int32_t e4,
int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi32(e3, e2, e1, e0);
r_.m128i[1] = simde_mm_set_epi32(e7, e6, e5, e4);
#else
r_.i32[ 0] = e0;
r_.i32[ 1] = e1;
r_.i32[ 2] = e2;
r_.i32[ 3] = e3;
r_.i32[ 4] = e4;
r_.i32[ 5] = e5;
r_.i32[ 6] = e6;
r_.i32[ 7] = e7;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi32
#define _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi64x(e3, e2, e1, e0);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi64x(e1, e0);
r_.m128i[1] = simde_mm_set_epi64x(e3, e2);
#else
r_.i64[0] = e0;
r_.i64[1] = e1;
r_.i64[2] = e2;
r_.i64[3] = e3;
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_epi64x
#define _mm256_set_epi64x(e3, e2, e1, e0) simde_mm256_set_epi64x(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu8 (uint8_t e31, uint8_t e30, uint8_t e29, uint8_t e28,
uint8_t e27, uint8_t e26, uint8_t e25, uint8_t e24,
uint8_t e23, uint8_t e22, uint8_t e21, uint8_t e20,
uint8_t e19, uint8_t e18, uint8_t e17, uint8_t e16,
uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8,
uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) {
simde__m256i_private r_;
r_.u8[ 0] = e0;
r_.u8[ 1] = e1;
r_.u8[ 2] = e2;
r_.u8[ 3] = e3;
r_.u8[ 4] = e4;
r_.u8[ 5] = e5;
r_.u8[ 6] = e6;
r_.u8[ 7] = e7;
r_.u8[ 8] = e8;
r_.u8[ 9] = e9;
r_.u8[10] = e10;
r_.u8[11] = e11;
r_.u8[12] = e12;
r_.u8[13] = e13;
r_.u8[14] = e14;
r_.u8[15] = e15;
r_.u8[16] = e16;
r_.u8[17] = e17;
r_.u8[18] = e18;
r_.u8[19] = e19;
r_.u8[20] = e20;
r_.u8[21] = e21;
r_.u8[22] = e22;
r_.u8[23] = e23;
r_.u8[24] = e24;
r_.u8[25] = e25;
r_.u8[26] = e26;
r_.u8[27] = e27;
r_.u8[28] = e28;
r_.u8[29] = e29;
r_.u8[30] = e30;
r_.u8[31] = e31;
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu16 (uint16_t e15, uint16_t e14, uint16_t e13, uint16_t e12,
uint16_t e11, uint16_t e10, uint16_t e9, uint16_t e8,
uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
simde__m256i_private r_;
r_.u16[ 0] = e0;
r_.u16[ 1] = e1;
r_.u16[ 2] = e2;
r_.u16[ 3] = e3;
r_.u16[ 4] = e4;
r_.u16[ 5] = e5;
r_.u16[ 6] = e6;
r_.u16[ 7] = e7;
r_.u16[ 8] = e8;
r_.u16[ 9] = e9;
r_.u16[10] = e10;
r_.u16[11] = e11;
r_.u16[12] = e12;
r_.u16[13] = e13;
r_.u16[14] = e14;
r_.u16[15] = e15;
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu32 (uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4,
uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4),
HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0));
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0));
r_.m128i[1] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4));
#else
r_.u32[ 0] = e0;
r_.u32[ 1] = e1;
r_.u32[ 2] = e2;
r_.u32[ 3] = e3;
r_.u32[ 4] = e4;
r_.u32[ 5] = e5;
r_.u32[ 6] = e6;
r_.u32[ 7] = e7;
#endif
return simde__m256i_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_set_epu64x (uint64_t e3, uint64_t e2, uint64_t e1, uint64_t e0) {
simde__m256i_private r_;
r_.u64[0] = e0;
r_.u64[1] = e1;
r_.u64[2] = e2;
r_.u64[3] = e3;
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4,
simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
#else
simde__m256_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0);
r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4);
#else
r_.f32[0] = e0;
r_.f32[1] = e1;
r_.f32[2] = e2;
r_.f32[3] = e3;
r_.f32[4] = e4;
r_.f32[5] = e5;
r_.f32[6] = e6;
r_.f32[7] = e7;
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_ps
#define _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) \
simde_mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set_pd(e3, e2, e1, e0);
#else
simde__m256d_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_set_pd(e1, e0);
r_.m128d[1] = simde_mm_set_pd(e3, e2);
#else
r_.f64[0] = e0;
r_.f64[1] = e1;
r_.f64[2] = e2;
r_.f64[3] = e3;
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_pd
#define _mm256_set_pd(e3, e2, e1, e0) \
simde_mm256_set_pd(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_set_m128 (simde__m128 e1, simde__m128 e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_ps(_mm256_castps128_ps256(e0), e1, 1);
#else
simde__m256_private r_;
simde__m128_private
e1_ = simde__m128_to_private(e1),
e0_ = simde__m128_to_private(e0);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128_private[0] = e0_;
r_.m128_private[1] = e1_;
#elif defined (SIMDE_HAVE_INT128_)
r_.i128[0] = e0_.i128[0];
r_.i128[1] = e1_.i128[0];
#else
r_.i64[0] = e0_.i64[0];
r_.i64[1] = e0_.i64[1];
r_.i64[2] = e1_.i64[0];
r_.i64[3] = e1_.i64[1];
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_m128
#define _mm256_set_m128(e1, e0) simde_mm256_set_m128(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_set_m128d (simde__m128d e1, simde__m128d e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_pd(_mm256_castpd128_pd256(e0), e1, 1);
#else
simde__m256d_private r_;
simde__m128d_private
e1_ = simde__m128d_to_private(e1),
e0_ = simde__m128d_to_private(e0);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d_private[0] = e0_;
r_.m128d_private[1] = e1_;
#else
r_.i64[0] = e0_.i64[0];
r_.i64[1] = e0_.i64[1];
r_.i64[2] = e1_.i64[0];
r_.i64[3] = e1_.i64[1];
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_m128d
#define _mm256_set_m128d(e1, e0) simde_mm256_set_m128d(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set_m128i (simde__m128i e1, simde__m128i e0) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_insertf128_si256(_mm256_castsi128_si256(e0), e1, 1);
#else
simde__m256i_private r_;
simde__m128i_private
e1_ = simde__m128i_to_private(e1),
e0_ = simde__m128i_to_private(e0);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i_private[0] = e0_;
r_.m128i_private[1] = e1_;
#else
r_.i64[0] = e0_.i64[0];
r_.i64[1] = e0_.i64[1];
r_.i64[2] = e1_.i64[0];
r_.i64[3] = e1_.i64[1];
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set_m128i
#define _mm256_set_m128i(e1, e0) simde_mm256_set_m128i(e1, e0)
#endif
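/* Usage sketch (illustrative): e0 forms the low 128-bit half of the
 * result and e1 the high half, mirroring _mm256_set_m128i:
 *
 *   simde__m128i lo = simde_mm_set1_epi32(1);
 *   simde__m128i hi = simde_mm_set1_epi32(2);
 *   simde__m256i v  = simde_mm256_set_m128i(hi, lo);
 */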
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi8 (int8_t a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi8(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi8(a);
r_.m128i[1] = simde_mm_set1_epi8(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi8
#define _mm256_set1_epi8(a) simde_mm256_set1_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi16 (int16_t a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi16(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi16(a);
r_.m128i[1] = simde_mm_set1_epi16(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi16
#define _mm256_set1_epi16(a) simde_mm256_set1_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi32 (int32_t a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi32(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi32(a);
r_.m128i[1] = simde_mm_set1_epi32(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi32
#define _mm256_set1_epi32(a) simde_mm256_set1_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_set1_epi64x (int64_t a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set1_epi64x(a);
#else
simde__m256i_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_set1_epi64x(a);
r_.m128i[1] = simde_mm_set1_epi64x(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a;
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_epi64x
#define _mm256_set1_epi64x(a) simde_mm256_set1_epi64x(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_set1_ps (simde_float32 a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set1_ps(a);
#else
simde__m256_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_set1_ps(a);
r_.m128[1] = simde_mm_set1_ps(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i++) {
r_.f32[i] = a;
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_ps
#define _mm256_set1_ps(a) simde_mm256_set1_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_set1_pd (simde_float64 a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_set1_pd(a);
#else
simde__m256d_private r_;
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_set1_pd(a);
r_.m128d[1] = simde_mm_set1_pd(a);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f64) / sizeof (r_.f64[0])) ; i++) {
r_.f64[i] = a;
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_set1_pd
#define _mm256_set1_pd(a) simde_mm256_set1_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveeven_epi16 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30);
#else
const size_t halfway_point = (sizeof (r_.i16) / sizeof (r_.i16[0])) / 2;
const size_t quarter_point = (sizeof (r_.i16) / sizeof (r_.i16[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i16[i] = a_.i16[2 * i];
r_.i16[i + quarter_point] = b_.i16[2 * i];
r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i];
r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i];
}
#endif
return simde__m256i_from_private(r_);
}
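/* Within each 128-bit half the result holds the even-indexed elements
 * of a followed by the even-indexed elements of b, which is what both
 * the shuffle indices and the scalar loop above encode. */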
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveodd_epi16 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31);
#else
const size_t halfway_point = (sizeof (r_.i16) / sizeof (r_.i16[0])) / 2;
const size_t quarter_point = (sizeof (r_.i16) / sizeof (r_.i16[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i16[i] = a_.i16[2 * i + 1];
r_.i16[i + quarter_point] = b_.i16[2 * i + 1];
r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i + 1];
r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i + 1];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveeven_epi32 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 2, 8, 10, 4, 6, 12, 14);
#else
const size_t halfway_point = (sizeof (r_.i32) / sizeof (r_.i32[0])) / 2;
const size_t quarter_point = (sizeof (r_.i32) / sizeof (r_.i32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i32[i] = a_.i32[2 * i];
r_.i32[i + quarter_point] = b_.i32[2 * i];
r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i];
r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_deinterleaveodd_epi32 (simde__m256i a, simde__m256i b) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 1, 3, 9, 11, 5, 7, 13, 15);
#else
const size_t halfway_point = (sizeof (r_.i32) / sizeof (r_.i32[0])) / 2;
const size_t quarter_point = (sizeof (r_.i32) / sizeof (r_.i32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.i32[i] = a_.i32[2 * i + 1];
r_.i32[i + quarter_point] = b_.i32[2 * i + 1];
r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i + 1];
r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i + 1];
}
#endif
return simde__m256i_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_deinterleaveeven_ps (simde__m256 a, simde__m256 b) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_x_mm_deinterleaveeven_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_x_mm_deinterleaveeven_ps(a_.m128[1], b_.m128[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 2, 8, 10, 4, 6, 12, 14);
#else
const size_t halfway_point = (sizeof (r_.f32) / sizeof (r_.f32[0])) / 2;
const size_t quarter_point = (sizeof (r_.f32) / sizeof (r_.f32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f32[i] = a_.f32[2 * i];
r_.f32[i + quarter_point] = b_.f32[2 * i];
r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i];
r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i];
}
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_deinterleaveodd_ps (simde__m256 a, simde__m256 b) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_x_mm_deinterleaveodd_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_x_mm_deinterleaveodd_ps(a_.m128[1], b_.m128[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 1, 3, 9, 11, 5, 7, 13, 15);
#else
const size_t halfway_point = (sizeof (r_.f32) / sizeof (r_.f32[0])) / 2;
const size_t quarter_point = (sizeof (r_.f32) / sizeof (r_.f32[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f32[i] = a_.f32[2 * i + 1];
r_.f32[i + quarter_point] = b_.f32[2 * i + 1];
r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i + 1];
r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i + 1];
}
#endif
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_deinterleaveeven_pd (simde__m256d a, simde__m256d b) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_x_mm_deinterleaveeven_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_x_mm_deinterleaveeven_pd(a_.m128d[1], b_.m128d[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6);
#else
const size_t halfway_point = (sizeof (r_.f64) / sizeof (r_.f64[0])) / 2;
const size_t quarter_point = (sizeof (r_.f64) / sizeof (r_.f64[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f64[i] = a_.f64[2 * i];
r_.f64[i + quarter_point] = b_.f64[2 * i];
r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i];
r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i];
}
#endif
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_deinterleaveodd_pd (simde__m256d a, simde__m256d b) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_x_mm_deinterleaveodd_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_x_mm_deinterleaveodd_pd(a_.m128d[1], b_.m128d[1]);
#elif defined (SIMDE_SHUFFLE_VECTOR_)
r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7);
#else
const size_t halfway_point = (sizeof (r_.f64) / sizeof (r_.f64[0])) / 2;
const size_t quarter_point = (sizeof (r_.f64) / sizeof (r_.f64[0])) / 4;
for (size_t i = 0 ; i < quarter_point ; i++) {
r_.f64[i] = a_.f64[2 * i + 1];
r_.f64[i + quarter_point] = b_.f64[2 * i + 1];
r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i + 1];
r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i + 1];
}
#endif
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_x_mm256_abs_ps(simde__m256 a) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i++) {
r_.f32[i] = simde_math_fabsf(a_.f32[i]);
}
return simde__m256_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_x_mm256_abs_pd(simde__m256d a) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f64) / sizeof (r_.f64[0])) ; i++) {
r_.f64[i] = simde_math_fabs(a_.f64[i]);
}
return simde__m256d_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_add_ps (simde__m256 a, simde__m256 b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_add_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_add_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_add_ps(a_.m128[1], b_.m128[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f32 = a_.f32 + b_.f32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[i] + b_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_ps
#define _mm256_add_ps(a, b) simde_mm256_add_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_hadd_ps (simde__m256 a, simde__m256 b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_hadd_ps(a, b);
#else
return simde_mm256_add_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b));
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadd_ps
#define _mm256_hadd_ps(a, b) simde_mm256_hadd_ps(a, b)
#endif
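/* Expressed element-wise, _mm256_hadd_ps sums adjacent pairs within
 * each 128-bit lane:
 *
 *   r = { a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 }
 *
 * which is exactly the sum of the deinterleaved even and odd elements
 * used in the fallback above. */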
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_add_pd (simde__m256d a, simde__m256d b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_add_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_add_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_add_pd(a_.m128d[1], b_.m128d[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.f64 = a_.f64 + b_.f64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f64) / sizeof (r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[i] + b_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_pd
#define _mm256_add_pd(a, b) simde_mm256_add_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_hadd_pd (simde__m256d a, simde__m256d b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_hadd_pd(a, b);
#else
return simde_mm256_add_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b));
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadd_pd
#define _mm256_hadd_pd(a, b) simde_mm256_hadd_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_addsub_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_addsub_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_addsub_ps(a_.m128[1], b_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i += 2) {
r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ];
r_.f32[i + 1] = a_.f32[i + 1] + b_.f32[i + 1];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_addsub_ps
#define _mm256_addsub_ps(a, b) simde_mm256_addsub_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_addsub_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_addsub_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_addsub_pd(a_.m128d[1], b_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f64) / sizeof (r_.f64[0])) ; i += 2) {
r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ];
r_.f64[i + 1] = a_.f64[i + 1] + b_.f64[i + 1];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_addsub_pd
#define _mm256_addsub_pd(a, b) simde_mm256_addsub_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_and_ps (simde__m256 a, simde__m256 b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_and_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_and_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_and_ps(a_.m128[1], b_.m128[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_and_ps
#define _mm256_and_ps(a, b) simde_mm256_and_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_and_pd (simde__m256d a, simde__m256d b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_and_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_and_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_and_pd(a_.m128d[1], b_.m128d[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_and_pd
#define _mm256_and_pd(a, b) simde_mm256_and_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_andnot_ps (simde__m256 a, simde__m256 b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_andnot_ps(a, b);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_andnot_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_andnot_ps(a_.m128[1], b_.m128[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = ~a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_andnot_ps
#define _mm256_andnot_ps(a, b) simde_mm256_andnot_ps(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_andnot_pd (simde__m256d a, simde__m256d b) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_andnot_pd(a, b);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_andnot_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_andnot_pd(a_.m128d[1], b_.m128d[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = ~a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_andnot_pd
#define _mm256_andnot_pd(a, b) simde_mm256_andnot_pd(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_blend_ps (simde__m256 a, simde__m256 b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i++) {
r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
}
return simde__m256_from_private(r_);
}
#if defined (SIMDE_X86_AVX_NATIVE)
# define simde_mm256_blend_ps(a, b, imm8) _mm256_blend_ps(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_ps(a, b, imm8) \
simde_mm256_set_m128( \
simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8) >> 4), \
simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8) & 0x0F))
#endif
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blend_ps
#define _mm256_blend_ps(a, b, imm8) simde_mm256_blend_ps(a, b, imm8)
#endif
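/* In the 128-bit fallback above, the immediate is split between the
 * halves: bits 0-3 of imm8 control the low simde__m128 and bits 4-7
 * the high one, consistent with bit i of imm8 selecting element i of
 * b in the portable loop. */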
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_blend_pd (simde__m256d a, simde__m256d b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f64) / sizeof (r_.f64[0])) ; i++) {
r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
}
return simde__m256d_from_private(r_);
}
#if defined (SIMDE_X86_AVX_NATIVE)
# define simde_mm256_blend_pd(a, b, imm8) _mm256_blend_pd(a, b, imm8)
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_pd(a, b, imm8) \
simde_mm256_set_m128d( \
simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8) >> 2), \
simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8) & 3))
#endif
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blend_pd
#define _mm256_blend_pd(a, b, imm8) simde_mm256_blend_pd(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_blendv_ps (simde__m256 a, simde__m256 b, simde__m256 mask) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_ps(a, b, mask);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b),
mask_ = simde__m256_to_private(mask);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_blendv_ps(a_.m128[0], b_.m128[0], mask_.m128[0]);
r_.m128[1] = simde_mm_blendv_ps(a_.m128[1], b_.m128[1], mask_.m128[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u32) / sizeof (r_.u32[0])) ; i++) {
r_.f32[i] = (mask_.u32[i] & (UINT32_C(1) << 31)) ? b_.f32[i] : a_.f32[i];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blendv_ps
#define _mm256_blendv_ps(a, b, imm8) simde_mm256_blendv_ps(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_blendv_pd (simde__m256d a, simde__m256d b, simde__m256d mask) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_blendv_pd(a, b, mask);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b),
mask_ = simde__m256d_to_private(mask);
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_blendv_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]);
r_.m128d[1] = simde_mm_blendv_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u64) / sizeof (r_.u64[0])) ; i++) {
r_.f64[i] = (mask_.u64[i] & (UINT64_C(1) << 63)) ? b_.f64[i] : a_.f64[i];
}
#endif
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_blendv_pd
#define _mm256_blendv_pd(a, b, imm8) simde_mm256_blendv_pd(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_broadcast_pd (simde__m128d const * mem_addr) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_pd(mem_addr);
#else
simde__m256d_private r_;
simde__m128d tmp = simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const *, mem_addr));
r_.m128d[0] = tmp;
r_.m128d[1] = tmp;
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_pd
#define _mm256_broadcast_pd(mem_addr) simde_mm256_broadcast_pd(mem_addr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_broadcast_ps (simde__m128 const * mem_addr) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_ps(mem_addr);
#else
simde__m256_private r_;
simde__m128 tmp = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(simde_float32 const *, mem_addr));
r_.m128[0] = tmp;
r_.m128[1] = tmp;
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_ps
#define _mm256_broadcast_ps(mem_addr) simde_mm256_broadcast_ps(HEDLEY_REINTERPRET_CAST(simde__m128 const *, mem_addr))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_broadcast_sd (simde_float64 const * a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_sd(a);
#else
return simde_mm256_set1_pd(*a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_sd
#define _mm256_broadcast_sd(mem_addr) simde_mm256_broadcast_sd(HEDLEY_REINTERPRET_CAST(double const *, mem_addr))
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_broadcast_ss (simde_float32 const * a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm_broadcast_ss(a);
#elif defined (SIMDE_WASM_SIMD128_NATIVE)
return simde__m128_from_wasm_v128(wasm_v128_load32_splat(a));
#else
return simde_mm_set1_ps(*a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcast_ss
#define _mm_broadcast_ss(mem_addr) simde_mm_broadcast_ss(mem_addr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_broadcast_ss (simde_float32 const * a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_broadcast_ss(a);
#else
return simde_mm256_set1_ps(*a);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcast_ss
#define _mm256_broadcast_ss(mem_addr) simde_mm256_broadcast_ss(mem_addr)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_castpd128_pd256 (simde__m128d a) {
#if defined (SIMDE_X86_AVX_NATIVE)
return _mm256_castpd128_pd256(a);
#else
simde__m256d_private r_;
simde__m128d_private a_ = simde__m128d_to_private(a);
r_.m128d_private[0] = a_;
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_castpd128_pd256
#define _mm256_castpd128_pd256(a) simde_mm256_castpd128_pd256(a)
#endif