/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2018-2020 Evan Nemerson <evan@nemerson.com>
* 2019-2020 Michael R. Crusoe <crusoe@debian.org>
* 2020 Himanshi Mathur <himanshi18037@iiitd.ac.in>
* 2020 Hidayat Khan <huk2209@gmail.com>
*/
#if !defined (SIMDE_X86_AVX2_H)
#define SIMDE_X86_AVX2_H
#include "avx.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
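/* Every function in this file follows the same pattern: when the
 * compiler exposes the real intrinsic (SIMDE_X86_AVX2_NATIVE) it is
 * called directly; otherwise the operation is emulated, either by
 * delegating to the two 128-bit halves when the natural vector size
 * is at most 128 bits, through compiler vector extensions where
 * available, or with a plain scalar loop.  When
 * SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES is defined, the standard
 * _mm256_* names are remapped onto these simde_mm256_* functions. */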
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi8 (simde__m256i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_abs_epi8(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]);
r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_abs_epi8
#define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi16 (simde__m256i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_abs_epi16(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]);
r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_abs_epi16
#define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi32(simde__m256i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_abs_epi32(a);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]);
r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof (r_.i32) / sizeof (r_.i32[0])); i++) {
r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_abs_epi32
#define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a)
#endif
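/* Element-wise (vertical) addition.  As with the native intrinsics,
 * results wrap around on overflow.  A minimal usage sketch:
 *
 *   simde__m256i sum = simde_mm256_add_epi8(a, b);
 *   // lane i of sum holds a[i] + b[i], modulo 2^8
 */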
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_add_epi8(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i8 = a_.i8 + b_.i8;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = a_.i8[i] + b_.i8[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_epi8
#define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_add_epi16(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i16 = a_.i16 + b_.i16;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = a_.i16[i] + b_.i16[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_epi16
#define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
#endif
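/* The hadd/hadds implementations below express a horizontal add as a
 * vertical one: _mm256_hadd_epi16 sums adjacent pairs, and for lanes
 * [a0 a1 a2 a3 ...] the even elements [a0 a2 ...] plus the odd
 * elements [a1 a3 ...] give exactly those pairwise sums
 * [a0+a1 a2+a3 ...], so deinterleave-even + deinterleave-odd followed
 * by an ordinary add is equivalent. */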
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadd_epi16 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_hadd_epi16(a, b);
#else
return simde_mm256_add_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadd_epi16
#define _mm256_hadd_epi16(a, b) simde_mm256_hadd_epi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_add_epi32(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = a_.i32 + b_.i32;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[i] + b_.i32[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_epi32
#define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadd_epi32 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_hadd_epi32(a, b);
#else
return simde_mm256_add_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadd_epi32
#define _mm256_hadd_epi32(a, b) simde_mm256_hadd_epi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_add_epi64(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined (SIMDE_BUG_CLANG_BAD_VI64_OPS)
r_.i64 = a_.i64 + b_.i64;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[i] + b_.i64[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_add_epi64
#define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b)
#endif
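/* Like the native instruction, alignr works on each 128-bit lane
 * independently: within a lane, b and a are treated as a 32-byte
 * concatenation (b low, a high) shifted right by `count' bytes, with
 * counts above 31 producing zero. */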
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count)
SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
if (HEDLEY_UNLIKELY(count > 31))
return simde_mm256_setzero_si256();
for (size_t h = 0 ; h < (sizeof (r_.m128i) / sizeof (r_.m128i[0])) ; h++) {
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.m128i_private[h].i8) / sizeof (r_.m128i_private[h].i8[0])) ; i++) {
const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
if (srcpos > 31) {
r_.m128i_private[h].i8[i] = 0;
} else if (srcpos > 15) {
r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15];
} else {
r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos];
}
}
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE) && !defined (SIMDE_BUG_PGI_30106)
# define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_alignr_epi8(a, b, count) \
simde_mm256_set_m128i( \
simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_alignr_epi8
#define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
#endif
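/* Bitwise operations are element-size agnostic, so the fallbacks work
 * on whichever lane width is convenient.  Note the andnot operand
 * order: the result is (~a) & b. */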
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_and_si256 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_and_si256(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32f = a_.i32f & b_.i32f;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[i] & b_.i64[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_and_si256
#define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_andnot_si256(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32f) / sizeof (r_.i32f[0])) ; i++) {
r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_andnot_si256
#define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b)
#endif
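/* Saturating addition: instead of wrapping, results are clamped to the
 * range of the element type.  For example, simde_mm256_adds_epi8
 * yields 127 for 100 + 100 and -128 for (-100) + (-100). */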
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_adds_epi8(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_adds_epi8
#define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_adds_epi16(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_adds_epi16
#define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadds_epi16 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_hadds_epi16(a, b);
#else
return simde_mm256_adds_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_hadds_epi16
#define _mm256_hadds_epi16(a, b) simde_mm256_hadds_epi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_adds_epu8(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u8) / sizeof (r_.u8[0])) ; i++) {
r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_adds_epu8
#define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_adds_epu16(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u16) / sizeof (r_.u16[0])) ; i++) {
r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_adds_epu16
#define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b)
#endif
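/* Rounded unsigned average, matching _mm256_avg_epu8/_mm256_avg_epu16:
 * (a + b + 1) >> 1.  Integer promotion widens the operands first, so
 * the intermediate sum cannot overflow the element type. */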
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_avg_epu8(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u8) / sizeof (r_.u8[0])) ; i++) {
r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_avg_epu8
#define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_avg_epu16(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u16) / sizeof (r_.u16[0])) ; i++) {
r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_avg_epu16
#define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b)
#endif
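/* Immediate blends: bit i of imm8 selects element i from b when set,
 * from a when clear.  _mm256_blend_epi16 is the odd one out in that it
 * reuses the same eight imm8 bits for both 128-bit lanes, hence the
 * `i % 8' below. */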
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
simde__m128i_private
r_,
a_ = simde__m128i_to_private(a),
b_ = simde__m128i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
# define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8)
#elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128)
# define simde_mm_blend_epi32(a, b, imm8) \
simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8)))
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_blend_epi32
#define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = ((imm8 >> (i % 8)) & 1) ? b_.i16[i] : a_.i16[i];
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE) && defined (SIMDE_BUG_CLANG_REV_234560)
# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8))
#elif defined (SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_epi16(a, b, imm8) \
simde_mm256_set_m128i( \
simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \
simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_blend_epi16
#define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_epi32(a, b, imm8) \
simde_mm256_set_m128i( \
simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \
simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8) & 0x0F))
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_blend_epi32
#define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_blendv_epi8(a, b, mask);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b),
mask_ = simde__m256i_to_private(mask);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]);
r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(mask_.i8) tmp = mask_.i8 >> 7;
r_.i8 = (tmp & b_.i8) | (~tmp & a_.i8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.u8) / sizeof (r_.u8[0])) ; i++) {
int8_t tmp = mask_.i8[i] >> 7;
r_.i8[i] = (tmp & b_.i8[i]) | (~tmp & a_.i8[i]);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blendv_epi8(a, b, mask) _mm256_blendv_epi8(a, b, mask)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_blendv_epi8
#define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask)
#endif
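/* The broadcast family replicates element 0 of the source into every
 * element of the destination.  A minimal usage sketch:
 *
 *   simde__m256i v =
 *     simde_mm256_broadcastb_epi8(simde_mm_cvtsi32_si128(42));
 *   // every one of the 32 bytes of v is 42
 */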
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastb_epi8 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm_broadcastb_epi8(a);
#else
simde__m128i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = a_.i8[0];
}
return simde__m128i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcastb_epi8
#define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastb_epi8 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_broadcastb_epi8(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = a_.i8[0];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastb_epi8
#define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastw_epi16 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm_broadcastw_epi16(a);
#else
simde__m128i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = a_.i16[0];
}
return simde__m128i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcastw_epi16
#define _mm_broadcastw_epi16(a) simde_mm_broadcastw_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastw_epi16 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_broadcastw_epi16(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = a_.i16[0];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastw_epi16
#define _mm256_broadcastw_epi16(a) simde_mm256_broadcastw_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastd_epi32 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm_broadcastd_epi32(a);
#else
simde__m128i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[0];
}
return simde__m128i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcastd_epi32
#define _mm_broadcastd_epi32(a) simde_mm_broadcastd_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastd_epi32 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_broadcastd_epi32(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.i32[0];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastd_epi32
#define _mm256_broadcastd_epi32(a) simde_mm256_broadcastd_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastq_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm_broadcastq_epi64(a);
#else
simde__m128i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[0];
}
return simde__m128i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcastq_epi64
#define _mm_broadcastq_epi64(a) simde_mm_broadcastq_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastq_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_broadcastq_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i64[0];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastq_epi64
#define _mm256_broadcastq_epi64(a) simde_mm256_broadcastq_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_broadcastss_ps (simde__m128 a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm_broadcastss_ps(a);
#elif defined (SIMDE_X86_SSE_NATIVE)
return simde_mm_shuffle_ps(a, a, 0);
#else
simde__m128_private r_;
simde__m128_private a_ = simde__m128_to_private(a);
#if defined (SIMDE_SHUFFLE_VECTOR_)
r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[0];
}
#endif
return simde__m128_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcastss_ps
#define _mm_broadcastss_ps(a) simde_mm_broadcastss_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_broadcastss_ps (simde__m128 a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_broadcastss_ps(a);
#else
simde__m256_private r_;
simde__m128_private a_ = simde__m128_to_private(a);
#if defined (SIMDE_X86_AVX_NATIVE)
__m128 tmp = _mm_permute_ps(a_.n, 0);
r_.n = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 1);
#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 0, 0, 0, 0, 0, 0, 0);
#elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128)
r_.m128[0] = r_.m128[1] = simde_mm_broadcastss_ps(simde__m128_from_private(a_));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f32) / sizeof (r_.f32[0])) ; i++) {
r_.f32[i] = a_.f32[0];
}
#endif
return simde__m256_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastss_ps
#define _mm256_broadcastss_ps(a) simde_mm256_broadcastss_ps(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_broadcastsd_pd (simde__m128d a) {
return simde_mm_movedup_pd(a);
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_broadcastsd_pd
#define _mm_broadcastsd_pd(a) simde_mm_broadcastsd_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_broadcastsd_pd (simde__m128d a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_broadcastsd_pd(a);
#else
simde__m256d_private r_;
simde__m128d_private a_ = simde__m128d_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.f64) / sizeof (r_.f64[0])) ; i++) {
r_.f64[i] = a_.f64[0];
}
return simde__m256d_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastsd_pd
#define _mm256_broadcastsd_pd(a) simde_mm256_broadcastsd_pd(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastsi128_si256 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE) && \
(!defined (HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0))
return _mm256_broadcastsi128_si256(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i_private[0] = a_;
r_.m128i_private[1] = a_;
#else
r_.i64[0] = a_.i64[0];
r_.i64[1] = a_.i64[1];
r_.i64[2] = a_.i64[0];
r_.i64[3] = a_.i64[1];
#endif
return simde__m256i_from_private(r_);
#endif
}
#define simde_mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_broadcastsi128_si256
#define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
#undef _mm_broadcastsi128_si256
#define _mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
#endif
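/* Byte shifts.  As with the native instructions these operate within
 * each 128-bit lane separately, shifting in zeros; an imm8 of 16 or
 * more clears a lane.  The bounds checks below keep every source index
 * inside the same half of the vector. */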
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_bslli_epi128 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
const int ssize = HEDLEY_STATIC_CAST(int, (sizeof (r_.i8) / sizeof (r_.i8[0])));
SIMDE_VECTORIZE
for (int i = 0 ; i < ssize ; i++) {
const int e = i - imm8;
if (i >= (ssize/2)) {
if (e >= (ssize/2) && e < ssize)
r_.i8[i] = a_.i8[e];
else
r_.i8[i] = 0;
}
else {
if (e >= 0 && e < (ssize/2))
r_.i8[i] = a_.i8[e];
else
r_.i8[i] = 0;
}
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE) && \
(!defined (HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
#define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_bslli_epi128
#define _mm256_bslli_epi128(a, imm8) simde_mm256_bslli_epi128(a, imm8)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
const int ssize = HEDLEY_STATIC_CAST(int, (sizeof (r_.i8) / sizeof (r_.i8[0])));
SIMDE_VECTORIZE
for (int i = 0 ; i < ssize ; i++) {
const int e = i + imm8;
if (i < (ssize/2)) {
if (e >= 0 && e < (ssize/2))
r_.i8[i] = a_.i8[e];
else
r_.i8[i] = 0;
}
else {
if (e >= (ssize/2) && e < ssize)
r_.i8[i] = a_.i8[e];
else
r_.i8[i] = 0;
}
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE) && \
(!defined (HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
#define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_bsrli_epi128
#define _mm256_bsrli_epi128(a, imm8) simde_mm256_bsrli_epi128(a, imm8)
#endif
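/* Comparisons return all-ones (~0) in each element where the predicate
 * holds and zero where it does not, so the result can be used directly
 * as a mask (e.g. with simde_mm256_blendv_epi8). */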
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpeq_epi8(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpeq_epi8
#define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpeq_epi16(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpeq_epi16
#define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpeq_epi32(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpeq_epi32
#define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpeq_epi64(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpeq_epi64
#define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpgt_epi8(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 > b_.i8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i8) / sizeof (r_.i8[0])) ; i++) {
r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpgt_epi8
#define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpgt_epi16(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i16 = a_.i16 > b_.i16;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpgt_epi16
#define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpgt_epi32(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 > b_.i32);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpgt_epi32
#define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cmpgt_epi64(a, b);
#else
simde__m256i_private
r_,
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
#if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]);
r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]);
#elif defined (SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cmpgt_epi64
#define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b)
#endif
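/* Widening conversions: cvtepi* sign-extend and cvtepu* zero-extend
 * the low elements of the 128-bit source into the wider lanes of the
 * 256-bit result.  SIMDE_CONVERT_VECTOR_ performs the whole conversion
 * in one step where the compiler supports it. */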
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepi8_epi16 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepi8_epi16(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = a_.i8[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi8_epi16
#define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepi8_epi32 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepi8_epi32(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.i8[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi8_epi32
#define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepi8_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepi8_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i8[i];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi8_epi64
#define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepi16_epi32 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepi16_epi32(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.i16[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi16_epi32
#define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepi16_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepi16_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i16[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi16_epi64
#define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepi32_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepi32_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.i32[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepi32_epi64
#define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepu8_epi16 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepu8_epi16(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i16) / sizeof (r_.i16[0])) ; i++) {
r_.i16[i] = a_.u8[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepu8_epi16
#define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepu8_epi32 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepu8_epi32(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.u8[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepu8_epi32
#define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepu8_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepu8_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.u8[i];
}
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepu8_epi64
#define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepu16_epi32 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepu16_epi32(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i32) / sizeof (r_.i32[0])) ; i++) {
r_.i32[i] = a_.u16[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepu16_epi32
#define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepu16_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepu16_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.u16[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepu16_epi64
#define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_cvtepu32_epi64 (simde__m128i a) {
#if defined (SIMDE_X86_AVX2_NATIVE)
return _mm256_cvtepu32_epi64(a);
#else
simde__m256i_private r_;
simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined (SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
r_.i64[i] = a_.u32[i];
}
#endif
return simde__m256i_from_private(r_);
#endif
}
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_cvtepu32_epi64
#define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a)
#endif
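/* Element extraction.  `index' selects the element to return; for the
 * native intrinsics it must be a compile-time constant within range. */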
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_extract_epi8 (simde__m256i a, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 31) {
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.i8[index];
}
#if defined (SIMDE_X86_AVX2_NATIVE) && \
(!defined (HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0))
#define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_extract_epi8
#define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm256_extract_epi16 (simde__m256i a, const int index)
SIMDE_REQUIRE_RANGE(index, 0, 15) {
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.i16[index];
}
#if defined (SIMDE_X86_AVX2_NATIVE) && \
(!defined (HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0))
#define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_extract_epi16
#define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_extracti128_si256 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
simde__m256i_private a_ = simde__m256i_to_private(a);
return a_.m128i[imm8];
}
#if defined (SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_extracti128_si256
#define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8)
#endif
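/* Gathers.  Each element is loaded from base_addr + vindex[i] * scale,
 * where scale must be 1, 2, 4, or 8.  simde_memcpy is used for the
 * loads to avoid alignment and strict-aliasing problems.  The masked
 * variants load only where the sign bit of the corresponding mask
 * element is set, copying the element from `src' otherwise.  A minimal
 * usage sketch (with a hypothetical table):
 *
 *   int32_t table[16];  // filled in elsewhere
 *   simde__m128i idx = simde_mm_set_epi32(12, 8, 4, 0);
 *   simde__m128i v = simde_mm_i32gather_epi32(table, idx, 4);
 *   // v now holds table[0], table[4], table[8], table[12]
 */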
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_i32gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex),
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i32) / sizeof (vindex_.i32[0])) ; i++) {
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src, sizeof (dst));
r_.i32[i] = dst;
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm_i32gather_epi32(base_addr, vindex, scale) _mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_i32gather_epi32
#define _mm_i32gather_epi32(base_addr, vindex, scale) simde_mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_i32gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex),
src_ = simde__m128i_to_private(src),
mask_ = simde__m128i_to_private(mask),
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i32) / sizeof (vindex_.i32[0])) ; i++) {
if ((mask_.i32[i] >> 31) & 1) {
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src1, sizeof (dst));
r_.i32[i] = dst;
}
else {
r_.i32[i] = src_.i32[i];
}
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, mask, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_i32gather_epi32
#define _mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, mask, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_i32gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m256i_private
vindex_ = simde__m256i_to_private(vindex),
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i32) / sizeof (vindex_.i32[0])) ; i++) {
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src, sizeof (dst));
r_.i32[i] = dst;
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm256_i32gather_epi32(base_addr, vindex, scale) _mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_i32gather_epi32
#define _mm256_i32gather_epi32(base_addr, vindex, scale) simde_mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_i32gather_epi32(simde__m256i src, const int32_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m256i_private
vindex_ = simde__m256i_to_private(vindex),
src_ = simde__m256i_to_private(src),
mask_ = simde__m256i_to_private(mask),
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i32) / sizeof (vindex_.i32[0])) ; i++) {
if ((mask_.i32[i] >> 31) & 1) {
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src1, sizeof (dst));
r_.i32[i] = dst;
}
else {
r_.i32[i] = src_.i32[i];
}
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, mask, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_i32gather_epi32
#define _mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, mask, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_i64gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex),
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i64) / sizeof (vindex_.i64[0])) ; i++) {
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src, sizeof (dst));
r_.i32[i] = dst;
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm_i64gather_epi32(base_addr, vindex, scale) _mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_i64gather_epi32
#define _mm_i64gather_epi32(base_addr, vindex, scale) simde_mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex),
src_ = simde__m128i_to_private(src),
mask_ = simde__m128i_to_private(mask),
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i64) / sizeof (vindex_.i64[0])) ; i++) {
if ((mask_.i32[i] >> 31) & 1) {
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src1, sizeof (dst));
r_.i32[i] = dst;
}
else {
r_.i32[i] = src_.i32[i];
}
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, mask, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_i64gather_epi32
#define _mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, mask, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_i64gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m256i_private
vindex_ = simde__m256i_to_private(vindex);
simde__m128i_private
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i64) / sizeof (vindex_.i64[0])) ; i++) {
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src, sizeof (dst));
r_.i32[i] = dst;
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm256_i64gather_epi32(base_addr, vindex, scale) _mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_i64gather_epi32
#define _mm256_i64gather_epi32(base_addr, vindex, scale) simde_mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm256_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m256i vindex, simde__m128i mask, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m256i_private
vindex_ = simde__m256i_to_private(vindex);
simde__m128i_private
src_ = simde__m128i_to_private(src),
mask_ = simde__m128i_to_private(mask),
r_ = simde__m128i_to_private(simde_mm_setzero_si128());
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i64) / sizeof (vindex_.i64[0])) ; i++) {
if ((mask_.i32[i] >> 31) & 1) {
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int32_t dst;
simde_memcpy(&dst, src1, sizeof (dst));
r_.i32[i] = dst;
}
else {
r_.i32[i] = src_.i32[i];
}
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#define simde_mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const *, int32_t const *, base_addr), vindex, mask, scale)
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_mask_i64gather_epi32
#define _mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const *, int const *, base_addr), vindex, mask, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex),
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int64_t dst;
simde_memcpy(&dst, src, sizeof (dst));
r_.i64[i] = dst;
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
#define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const *, base_addr), vindex, scale)
#else
#define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const *, base_addr), vindex, scale)
#endif
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_i32gather_epi64
#define _mm_i32gather_epi64(base_addr, vindex, scale) simde_mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const *, base_addr), vindex, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_i32gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex),
src_ = simde__m128i_to_private(src),
mask_ = simde__m128i_to_private(mask),
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (r_.i64) / sizeof (r_.i64[0])) ; i++) {
if ((mask_.i64[i] >> 63) & 1) {
const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int64_t dst;
simde_memcpy(&dst, src1, sizeof (dst));
r_.i64[i] = dst;
}
else {
r_.i64[i] = src_.i64[i];
}
}
return simde__m128i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
#define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const *, base_addr), vindex, mask, scale)
#else
#define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const *, base_addr), vindex, mask, scale)
#endif
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm_mask_i32gather_epi64
#define _mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const *, base_addr), vindex, mask, scale)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
SIMDE_REQUIRE_CONSTANT(scale)
HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8" ) {
simde__m128i_private
vindex_ = simde__m128i_to_private(vindex);
simde__m256i_private
r_;
const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof (vindex_.i32) / sizeof (vindex_.i32[0])) ; i++) {
const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t, vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t, scale));
int64_t dst;
simde_memcpy(&dst, src, sizeof (dst));
r_.i64[i] = dst;
}
return simde__m256i_from_private(r_);
}
#if defined (SIMDE_X86_AVX2_NATIVE)
#if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
#define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const *, base_addr), vindex, scale)
#else
#define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const *, base_addr), vindex, scale)
#endif
#endif
#if defined (SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_i32gather_epi64
#define _mm256_i32gather_epi64(base_addr, vindex, scale) simde_mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const *, base_addr), vindex, scale)
#endif