/* Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020-2021 Christopher Moore <moore@free.fr>
 *   2020      Evan Nemerson <evan@nemerson.com>
 */
/* In all the *gf2p8affine* intrinsics the argument b must be a compile-time constant so we must use macros and simde_x_mm* helper functions */
/* N.B. The _mm*gf2p8affineinv_epi64_epi8 and _mm*gf2p8mul_epi8 intrinsics are for a Field Generator Polynomial (FGP) (aka reduction polynomial) of 0x11B */
/* Only the _mm*gf2p8affine_epi64_epi8 intrinsics do not assume this specific FGP */
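/* For orientation, a minimal scalar sketch of the bit-matrix product the
 * vector fallbacks below compute, following Intel's published pseudocode
 * (output bit i of each byte is the parity of A's byte 7-i ANDed with x).
 * The helper name and the parity fold are ours for illustration; this is
 * not part of the SIMDe API. */
#if 0 /* illustration only */
static uint8_t
scalar_gf2p8matrix_multiply_byte (uint8_t x, uint64_t A) {
  uint8_t r = 0;
  for (int i = 0 ; i < 8 ; i++) {
    uint8_t p = (uint8_t) ((A >> (8 * (7 - i))) & x); /* row 7-i of A, masked by x */
    p ^= (uint8_t) (p >> 4);                          /* XOR-fold down to the parity bit */
    p ^= (uint8_t) (p >> 2);
    p ^= (uint8_t) (p >> 1);
    r |= (uint8_t) ((p & 1) << i);
  }
  return r;
}
#endif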
/* The field generator polynomial is 0x11B but we make the 0x100 bit implicit to fit inside 8 bits */
#define SIMDE_X86_GFNI_FGP 0x1B
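/* A minimal scalar sketch of multiplication in GF(2^8) under this FGP: the
 * classic shift-and-reduce ("Russian peasant") loop that the vector fallbacks
 * below unroll eight times. The helper name is ours; it is not part of SIMDe. */
#if 0 /* illustration only */
static uint8_t
scalar_gf2p8mul (uint8_t a, uint8_t b) {
  uint8_t r = 0;
  for (int i = 0 ; i < 8 ; i++) {
    if (b & 1)
      r ^= a;                    /* add (XOR) a wherever b has a set bit */
    uint8_t overflow = a & 0x80;
    a = (uint8_t) (a << 1);
    if (overflow)
      a ^= SIMDE_X86_GFNI_FGP;   /* fold the implicit 0x100 bit back in */
    b >>= 1;
  }
  return r;
}
/* sanity check: scalar_gf2p8mul(0x80, 0x02) == 0x1B, because 0x80 doubled is
 * 0x100, which reduces to 0x11B ^ 0x100 == 0x1B */
#endif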
/* Computing the inverse of a GF element is expensive so use this LUT for an FGP of 0x11B */
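/* A sketch of how such a LUT could be generated offline, assuming the
 * scalar_gf2p8mul helper sketched above: brute-force search for the
 * multiplicative inverse of every nonzero element (the GFNI instructions
 * define the inverse of 0 as 0). Not part of SIMDe; for illustration only. */
#if 0 /* illustration only */
static void
scalar_build_gf2p8inverse_lut (uint8_t lut[256]) {
  lut[0] = 0; /* by convention, matching the GFNI instructions */
  for (int x = 1 ; x < 256 ; x++) {
    for (int y = 1 ; y < 256 ; y++) {
      if (scalar_gf2p8mul((uint8_t) x, (uint8_t) y) == 1) {
        lut[x] = (uint8_t) y;
        break;
      }
    }
  }
}
#endif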
a = _mm_shuffle_epi8(A, _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8));
X = x;
r = zero;
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
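    /* one bit-plane per pass: movemask gathers the current top bit of every
       byte of a, and bytes of X whose top bit is set accumulate it into r */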
p = _mm_insert_epi16(zero, _mm_movemask_epi8(a), 0);
p = _mm_shuffle_epi8(p, byte_select);
p = _mm_and_si128(p, _mm_cmpgt_epi8(zero, X));
r = _mm_xor_si128(r, p);
a = _mm_add_epi8(a, a);
X = _mm_add_epi8(X, X);
}
  return r;
#elif defined(SIMDE_X86_SSE2_NATIVE)
  const __m128i zero = _mm_setzero_si128();
__m128i r, a, p, X;
a = _mm_shufflehi_epi16(A, (0 << 6) + (1 << 4) + (2 << 2) + (3 << 0));
a = _mm_shufflelo_epi16(a, (0 << 6) + (1 << 4) + (2 << 2) + (3 << 0));
a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8));
X = _mm_unpacklo_epi8(x, _mm_unpackhi_epi64(x, x));
r = zero;
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
p = _mm_set1_epi16(HEDLEY_STATIC_CAST(short, _mm_movemask_epi8(a)));
p = _mm_and_si128(p, _mm_cmpgt_epi8(zero, X));
r = _mm_xor_si128(r, p);
a = _mm_add_epi8(a, a);
X = _mm_add_epi8(X, X);
}
t = simde__m128i_to_neon_i8(A);
a = vqtbl1q_s8(t, vld1q_u8(byte_interleave));
t = simde__m128i_to_neon_i8(x);
X = vqtbl1q_s8(t, vld1q_u8(byte_interleave));
r = vdupq_n_s8(0);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
t = vshrq_n_s8(a, 7);
t = vandq_s8(t, mask);
t = vreinterpretq_s8_u16(vdupq_n_u16(vaddvq_u16(vreinterpretq_u16_s8(t))));
t = vandq_s8(t, vshrq_n_s8(X, 7));
r = veorq_s8(r, t);
a = vshlq_n_s8(a, 1);
X = vshlq_n_s8(X, 1);
}
X = simde__m128i_to_altivec_i8(x);
a = simde__m128i_to_altivec_u8(A);
X = vec_perm(X, X, byte_interleave);
  r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
    #if defined(SIMDE_BUG_CLANG_50932)
      p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                  vec_bperm(HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select));
    #else
      p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm_u128(a, bit_select));
    #endif
    p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                vec_splat(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), p), 3));
p &= X < zero;
r ^= p;
a += a;
X += X;
}
X = simde__m128i_to_altivec_i8(x);
a = simde__m128i_to_altivec_i8(A);
  r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
p = a < zero;
p &= mask;
    p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                vec_sum2(vec_sum4(p, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero)),
                                         HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), zero)));
p = vec_perm(p, p, byte_select);
p &= X < zero;
r ^= p;
a += a;
X += X;
}
X = simde__m128i_to_altivec_i8(x);
a = simde__m128i_to_altivec_u8(A);
X = vec_perm(X, X, byte_interleave);
  r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
    #if defined(SIMDE_BUG_CLANG_50932)
      p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                  vec_bperm(HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select));
    #else
      p = vec_bperm(a, bit_select);
    #endif
    p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                vec_splat(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), p), 4));
p = vec_and(p, vec_cmplt(X, zero));
r = vec_xor(r, p);
a = vec_add(a, a);
X = vec_add(X, X);
}
X = simde__m128i_to_altivec_i8(x);
a = simde__m128i_to_altivec_u8(A);
  r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
p = vec_sr(a, sevens);
    p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                vec_msum(p,
                                         mask,
                                         HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), zero)));
    p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),
                                vec_sum2s(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), p),
                                          HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), zero)));
p = vec_perm(p, p, byte_select);
p = vec_and(p, vec_cmplt(X, zero));
r = vec_xor(r, p);
a = vec_add(a, a);
X = vec_add(X, X);
}
  return simde__m128i_from_altivec_u8(r);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  const v128_t zero = wasm_i8x16_splat(0);
v128_t a, p, r, X;
X = simde__m128i_to_wasm_v128(x);
a = simde__m128i_to_wasm_v128(A);
a = wasm_i8x16_shuffle(a, a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
X = wasm_i8x16_shuffle(X, X, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
r = zero;
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
p = wasm_i16x8_splat(HEDLEY_STATIC_CAST(int16_t, wasm_i8x16_bitmask(a)));
p = wasm_v128_and(p, wasm_i8x16_lt(X, zero));
r = wasm_v128_xor(r, p);
a = wasm_i8x16_add(a, a);
X = wasm_i8x16_add(X, X);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_gf2p8matrix_multiply_epi64_epi8 (simde__m256i x, simde__m256i A) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    simde__m256i r, a, p;
    const simde__m256i byte_select = simde_x_mm256_set_epu64x(UINT64_C(0x0303030303030303), UINT64_C(0x0202020202020202),
                                                              UINT64_C(0x0101010101010101), UINT64_C(0x0000000000000000));
a = simde_mm256_shuffle_epi8(A, simde_mm256_broadcastsi128_si256(simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607))));
r = simde_mm256_setzero_si256();
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
p = simde_mm256_set1_epi32(simde_mm256_movemask_epi8(a));
p = simde_mm256_shuffle_epi8(p, byte_select);
p = simde_mm256_xor_si256(r, p);
r = simde_mm256_blendv_epi8(r, p, x);
a = simde_mm256_add_epi8(a, a);
x = simde_mm256_add_epi8(x, x);
}
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_x_mm_gf2p8matrix_multiply_epi64_epi8(x_.m128i[i], A_.m128i[i]);
}
  return simde__m256i_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_x_mm512_gf2p8matrix_multiply_epi64_epi8 (simde__m512i x, simde__m512i A) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    simde__m512i r, a, p;
    const simde__m512i byte_select = simde_x_mm512_set_epu64(UINT64_C(0x0707070707070707), UINT64_C(0x0606060606060606), UINT64_C(0x0505050505050505), UINT64_C(0x0404040404040404),
                                                             UINT64_C(0x0303030303030303), UINT64_C(0x0202020202020202), UINT64_C(0x0101010101010101), UINT64_C(0x0000000000000000));
a = simde_mm512_shuffle_epi8(A, simde_mm512_broadcast_i32x4(simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607))));
r = simde_mm512_setzero_si512();
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 0 ; i < 8 ; i++) {
p = simde_mm512_set1_epi64(HEDLEY_STATIC_CAST(int64_t, simde_mm512_movepi8_mask(a)));
p = simde_mm512_maskz_shuffle_epi8(simde_mm512_movepi8_mask(x), p, byte_select);
r = simde_mm512_xor_si512(r, p);
a = simde_mm512_add_epi8(a, a);
x = simde_mm512_add_epi8(x, x);
}
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
r_.m256i[i] = simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(x_.m256i[i], A_.m256i[i]);
}
  return simde__m512i_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_gf2p8inverse_epi8 (simde__m128i x) {
  #if defined(SIMDE_X86_SSE4_1_NATIVE)
    /* N.B. CM: this fallback may not be faster */
    simde__m128i r, u, t, test;
    const simde__m128i sixteens = simde_mm_set1_epi8(16);
    const simde__m128i masked_x = simde_mm_and_si128(x, simde_mm_set1_epi8(0x0F));
test = simde_mm_set1_epi8(INT8_MIN /* 0x80 */);
x = simde_mm_xor_si128(x, test);
r = simde_mm_shuffle_epi8(simde_x_gf2p8inverse_lut.m128i[0], masked_x);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 1 ; i < 16 ; i++) {
t = simde_mm_shuffle_epi8(simde_x_gf2p8inverse_lut.m128i[i], masked_x);
test = simde_mm_add_epi8(test, sixteens);
u = simde_mm_cmplt_epi8(x, test);
r = simde_mm_blendv_epi8(t, r, u);
}
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
r_.u8[i] = simde_x_gf2p8inverse_lut.u8[x_.u8[i]];
}
  return simde__m128i_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_gf2p8inverse_epi8 (simde__m256i x) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    /* N.B. CM: this fallback may not be faster */
    simde__m256i r, u, t, test;
    const simde__m256i sixteens = simde_mm256_set1_epi8(16);
    const simde__m256i masked_x = simde_mm256_and_si256(x, simde_mm256_set1_epi8(0x0F));
test = simde_mm256_set1_epi8(INT8_MIN /* 0x80 */);
x = simde_mm256_xor_si256(x, test);
r = simde_mm256_shuffle_epi8(simde_mm256_broadcastsi128_si256(simde_x_gf2p8inverse_lut.m128i[0]), masked_x);
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 1 ; i < 16 ; i++) {
t = simde_mm256_shuffle_epi8(simde_mm256_broadcastsi128_si256(simde_x_gf2p8inverse_lut.m128i[i]), masked_x);
test = simde_mm256_add_epi8(test, sixteens);
u = simde_mm256_cmpgt_epi8(test, x);
r = simde_mm256_blendv_epi8(t, r, u);
}
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
r_.m128i[i] = simde_x_mm_gf2p8inverse_epi8(x_.m128i[i]);
}
  return simde__m256i_from_private(r_);
#endif
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_x_mm512_gf2p8inverse_epi8 (simde__m512i x) {
  /* N.B. CM: TODO: later add VBMI version using just two _mm512_permutex2var_epi8 and friends */
  /* But except for Cannon Lake all processors with VBMI also have GFNI */
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    /* N.B. CM: this fallback may not be faster */
    simde__m512i r, test;
    const simde__m512i sixteens = simde_mm512_set1_epi8(16);
    const simde__m512i masked_x = simde_mm512_and_si512(x, simde_mm512_set1_epi8(0x0F));
r = simde_mm512_shuffle_epi8(simde_mm512_broadcast_i32x4(simde_x_gf2p8inverse_lut.m128i[0]), masked_x);
test = sixteens;
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (int i = 1 ; i < 16 ; i++) {
r = simde_mm512_mask_shuffle_epi8(r, simde_mm512_cmpge_epu8_mask(x, test), simde_mm512_broadcast_i32x4(simde_x_gf2p8inverse_lut.m128i[i]), masked_x);
test = simde_mm512_add_epi8(test, sixteens);
}
#if !defined(__INTEL_COMPILER)
    SIMDE_VECTORIZE
  #endif
  for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
r_.m256i[i] = simde_x_mm256_gf2p8inverse_epi8(x_.m256i[i]);
}
  return simde__m512i_from_private(r_);
#endif
}
#define simde_x_mm_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm_gf2p8matrix_multiply_epi64_epi8(simde_x_mm_gf2p8inverse_epi8(x), A)
#define simde_x_mm256_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(simde_x_mm256_gf2p8inverse_epi8(x), A)
#define simde_x_mm512_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm512_gf2p8matrix_multiply_epi64_epi8(simde_x_mm512_gf2p8inverse_epi8(x), A)
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_gf2p8affine_epi64_epi8 (simde__m128i x, simde__m128i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm_xor_si128(simde_x_mm_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE)
  #define simde_mm_gf2p8affine_epi64_epi8(x, A, b) _mm_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_gf2p8affine_epi64_epi8
  #define _mm_gf2p8affine_epi64_epi8(x, A, b) simde_mm_gf2p8affine_epi64_epi8(x, A, b)
#endif
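/* Usage sketch (ours, not from SIMDe): with the identity matrix, whose byte
 * 7-i holds the single bit 1 << i, and b == 0, the affine transform is a
 * no-op, which makes a convenient sanity check. */
#if 0 /* illustration only */
  simde__m128i identity = simde_mm_set1_epi64x(INT64_C(0x0102040810204080));
  simde__m128i y = simde_mm_gf2p8affine_epi64_epi8(x, identity, 0); /* y == x for any x */
#endif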
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_gf2p8affine_epi64_epi8 (simde__m256i x, simde__m256i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm256_xor_si256(simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm256_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX_NATIVE)
  #define simde_mm256_gf2p8affine_epi64_epi8(x, A, b) _mm256_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_gf2p8affine_epi64_epi8
  #define _mm256_gf2p8affine_epi64_epi8(x, A, b) simde_mm256_gf2p8affine_epi64_epi8(x, A, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_gf2p8affine_epi64_epi8 (simde__m512i x, simde__m512i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm512_xor_si512(simde_x_mm512_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm512_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_gf2p8affine_epi64_epi8(x, A, b) _mm512_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_gf2p8affine_epi64_epi8
  #define _mm512_gf2p8affine_epi64_epi8(x, A, b) simde_mm512_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_gf2p8affine_epi64_epi8
  #define _mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_gf2p8affine_epi64_epi8
  #define _mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_gf2p8affine_epi64_epi8
  #define _mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#else
  #define simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_gf2p8affine_epi64_epi8
  #define _mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#else
  #define simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_gf2p8affine_epi64_epi8
  #define _mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#else
  #define simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_gf2p8affine_epi64_epi8
  #define _mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_gf2p8affineinv_epi64_epi8 (simde__m128i x, simde__m128i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm_xor_si128(simde_x_mm_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE)
  #define simde_mm_gf2p8affineinv_epi64_epi8(x, A, b) _mm_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_gf2p8affineinv_epi64_epi8
  #define _mm_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
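/* Usage sketch (ours): with the same identity matrix as above and b == 0,
 * the inverse-affine form reduces to the per-byte multiplicative inverse in
 * GF(2^8) with FGP 0x11B (0 maps to 0). */
#if 0 /* illustration only */
  simde__m128i identity = simde_mm_set1_epi64x(INT64_C(0x0102040810204080));
  simde__m128i inv = simde_mm_gf2p8affineinv_epi64_epi8(x, identity, 0);
#endif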
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_gf2p8affineinv_epi64_epi8 (simde__m256i x, simde__m256i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm256_xor_si256(simde_x_mm256_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm256_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX_NATIVE)
  #define simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b) _mm256_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_gf2p8affineinv_epi64_epi8
  #define _mm256_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_gf2p8affineinv_epi64_epi8 (simde__m512i x, simde__m512i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm512_xor_si512(simde_x_mm512_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm512_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b) _mm512_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_gf2p8affineinv_epi64_epi8
  #define _mm512_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_gf2p8affineinv_epi64_epi8
  #define _mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_gf2p8affineinv_epi64_epi8
  #define _mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_gf2p8affineinv_epi64_epi8
  #define _mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#else
  #define simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_gf2p8affineinv_epi64_epi8
  #define _mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#else
  #define simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_gf2p8affineinv_epi64_epi8
  #define _mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#else
  #define simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_gf2p8affineinv_epi64_epi8
  #define _mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_gf2p8mul_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && (defined(SIMDE_X86_AVX512VL_NATIVE) || !defined(SIMDE_X86_AVX512F_NATIVE))
    return _mm_gf2p8mul_epi8(a, b);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    const poly8x16_t pa = vreinterpretq_p8_u8(simde__m128i_to_neon_u8(a));
    const poly8x16_t pb = vreinterpretq_p8_u8(simde__m128i_to_neon_u8(b));
    const uint8x16_t lo = vreinterpretq_u8_p16(vmull_p8(vget_low_p8(pa), vget_low_p8(pb)));
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16_t hi = vreinterpretq_u8_p16(vmull_high_p8(pa, pb));
    #else
      uint8x16_t hi = vreinterpretq_u8_p16(vmull_p8(vget_high_p8(pa), vget_high_p8(pb)));
    #endif
uint8x16x2_t hilo = vuzpq_u8(lo, hi);
uint8x16_t r = hilo.val[0];
    hi = hilo.val[1];
    const uint8x16_t idxHi = vshrq_n_u8(hi, 4);
    const uint8x16_t idxLo = vandq_u8(hi, vdupq_n_u8(0xF));
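    /* each 16-bit polynomial product can overflow the 8-bit field; idxHi and
       idxLo are the high and low nibbles of the overflow byte, sized to index
       16-entry reduction look-up tables */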
    const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = vec_splat_s8(0);
m = vec_splat_u8(0x01);
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) fgp = vec_splats(HEDLEY_STATIC_CAST(unsigned char, SIMDE_X86_GFNI_FGP));
    x = simde__m128i_to_altivec_u8(a);
    y = simde__m128i_to_altivec_u8(b);
    t = vec_and(y, m);
    t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpeq(t, m));
r = vec_and(x, t);
#if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
      t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmplt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), x), zero));
x = vec_add(x, x);
t = vec_and(fgp, t);
x = vec_xor(x, t);
m = vec_add(m, m);
t = vec_and(y, m);
      t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpeq(t, m));
t = vec_and(x, t);
r = vec_xor(r, t);
}
    return simde__m128i_from_altivec_u8(r);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    v128_t x, y, r, t, m;
    const v128_t fgp = wasm_i8x16_splat(SIMDE_X86_GFNI_FGP);

    x = simde__m128i_to_wasm_v128(a);
    y = simde__m128i_to_wasm_v128(b);
    m = wasm_i8x16_splat(0x01);
    t = wasm_v128_and(y, m);
t = wasm_i8x16_eq(t, m);
r = wasm_v128_and(x, t);
#if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
t = wasm_i8x16_shr(x, 7);
x = wasm_i8x16_add(x, x);
t = wasm_v128_and(fgp, t);
x = wasm_v128_xor(x, t);
m = wasm_i8x16_add(m, m);
t = wasm_v128_and(y, m);
t = wasm_i8x16_eq(t, m);
t = wasm_v128_and(x, t);
r = wasm_v128_xor(r, t);
}
t = simde_mm_and_si128(b, ones);
t = simde_mm_cmpeq_epi8(t, ones);
r = simde_mm_and_si128(a, t);
#if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
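      /* shift-and-reduce step: bytes of a with the sign bit set are about to
         overflow, so they are folded back with the FGP after doubling; each
         pass, the next bit of b selects the current a into r */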
t = simde_mm_cmpgt_epi8(zero, a);
t = simde_mm_and_si128(fgp, t);
a = simde_mm_add_epi8(a, a);
a = simde_mm_xor_si128(a, t);
b = simde_mm_srli_epi64(b, 1);
t = simde_mm_and_si128(b, ones);
t = simde_mm_cmpeq_epi8(t, ones);
t = simde_mm_and_si128(a, t);
r = simde_mm_xor_si128(r, t);
}
t = simde_mm256_and_si256(b, ones);
t = simde_mm256_cmpeq_epi8(t, ones);
r = simde_mm256_and_si256(a, t);
#if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
t = simde_mm256_cmpgt_epi8(zero, a);
t = simde_mm256_and_si256(fgp, t);
a = simde_mm256_add_epi8(a, a);
a = simde_mm256_xor_si256(a, t);
b = simde_mm256_srli_epi64(b, 1);
t = simde_mm256_and_si256(b, ones);
t = simde_mm256_cmpeq_epi8(t, ones);
t = simde_mm256_and_si256(a, t);
r = simde_mm256_xor_si256(r, t);
}
mb = simde_mm512_test_epi8_mask(b, s);
r = simde_mm512_maskz_mov_epi8(mb, a);
#if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
ma = simde_mm512_cmplt_epi8_mask(a, zero);
s = simde_mm512_add_epi8(s, s);
mb = simde_mm512_test_epi8_mask(b, s);
a = simde_mm512_add_epi8(a, a);
t = simde_mm512_maskz_mov_epi8(ma, fgp);
a = simde_mm512_xor_si512(a, t);
t = simde_mm512_maskz_mov_epi8(mb, a);
r = simde_mm512_xor_si512(r, t);
}