// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit vectors for VSX/Z14
// External include guard in highway.h - see comment there.
// Z14/Z15 targets use the s390x vector facility; everything else here is
// Altivec/VSX.
#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
#define HWY_S390X_HAVE_Z14 1
#else
#define HWY_S390X_HAVE_Z14 0
#endif

// <altivec.h>/<vecintrin.h> define `vector`, `pixel` and `bool` as macros,
// which would break ordinary C++ code. Save them, undefine them around the
// include, then restore.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if HWY_S390X_HAVE_Z14
#include <vecintrin.h>
#else
#include <altivec.h>
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include "hwy/ops/shared-inl.h"

// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__, and
// some GCC do the same for _ARCH_PWR10.
// This means we can only use POWER10-specific intrinsics in static dispatch
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
// On other compilers, the usual target check is sufficient.
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
    (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

// Z15 is z/Architecture arch level 13 (vector-enhancements facility 2).
#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
// Primary template, specialized below for every supported lane type.
template <typename T>
struct Raw128;
// Each Raw128 specialization defines the following typedefs:
// - type:
// the backing Altivec/VSX raw vector type of the Vec128<T, N> type
// - RawBoolVec:
// the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
// - RawT:
// the lane type for intrinsics, in particular vec_splat
// - AlignedRawVec:
// the 128-bit GCC/Clang vector type for aligned loads/stores
// - UnalignedRawVec:
// the 128-bit GCC/Clang vector type for unaligned loads/stores
// Defines one Raw128 specialization. Every line of the macro must end in a
// backslash continuation; the previous text split `__vector __bool` across
// two physical lines without one, which truncated the macro and split the
// `__bool` keyword.
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                  \
  struct Raw128<LANE_TYPE> {                                                   \
    using type = __vector RAW_VECT_LANE_TYPE;                                  \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
    using RawT = RAW_VECT_LANE_TYPE;                                           \
    typedef LANE_TYPE AlignedRawVec                                            \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };
// One specialization per lane type. NOLINT: the raw vector element types must
// be the exact builtin types expected by the Altivec intrinsics.
HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint64_t, unsigned long long, long long)  // NOLINT(runtime/int)
HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)  // NOLINT(runtime/int)
// Half-precision types reuse the uint16_t raw representation; ops BitCast to
// u16 as needed.
template <>
struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};

template <>
struct Raw128<float16_t> : public Raw128<uint16_t> {};

#undef HWY_VSX_RAW128
}
// namespace detail
// Wrapper over the raw Altivec/VSX vector; N is the number of lanes in use
// (<= 16 / sizeof(T)).
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};
// Aliases for partial vectors of 8/4/2 bytes.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
// Mask lanes are all-ones (FF..FF) or all-zero.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::RawBoolVec raw;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM
};
// Recover the descriptor (tag) type from a vector or mask type.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

// Lane type of a vector.
template <class V>
using TFromV = typename V::PrivateT;
// ------------------------------ Zero

// Returns an all-zero vector/part.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
  // argument in order to select the correct overload. We instead cast the
  // return vector type; see also the comment in BitCast.
  using RawOut = typename detail::Raw128<T>::type;
  return Vec128<T, HWY_MAX_LANES_D(D)>{
      reinterpret_cast<RawOut>(vec_splats(0))};
}
// Vector type corresponding to a descriptor D.
template <class D>
using VFromD = decltype(Zero(D()));
// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"
// ------------------------------ BitCast

// Reinterprets the bits of `v` as lanes of TFromD<D>; no data movement.
template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D /*d*/,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  using RawTo = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<RawTo>(v.raw)};
}
// ------------------------------ ResizeBitCast

// Like BitCast, but the source vector may have a different byte count.
template <class D, typename FromV>
HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  using RawTo = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<RawTo>(v.raw)};
}
// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  // vec_splats requires the exact builtin lane type.
  using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
}

// bf16/f16: splat the 16-bit pattern via the unsigned domain.
template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
}
// Returns a vector with uninitialized elements.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
#if HWY_COMPILER_GCC_ACTUAL
  // Suppressing maybe-uninitialized both here and at the caller does not work,
  // so initialize.
  return Zero(d);
#else
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
  typename detail::Raw128<TFromD<D>>::type uninit;
  return VFromD<decltype(d)>{uninit};
  HWY_DIAGNOSTICS(pop)
#endif
}
// ------------------------------ GetLane

// Gets the single value stored in lane 0 of a vector/part.
template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
  return static_cast<T>(v.raw[0]);
}
// ------------------------------ Dup128VecFromValues

// 8-bit lanes: build the raw GCC/Clang vector directly from 16 values.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  const typename detail::Raw128<TFromD<D>>::type vec = {
      t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
  return VFromD<D>{vec};
}

// 16-bit integer lanes.
template <class D, HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const typename detail::Raw128<TFromD<D>>::type vec = {t0, t1, t2, t3,
                                                        t4, t5, t6, t7};
  return VFromD<D>{vec};
}

// bf16/f16 lanes: go through the u16 domain bit-for-bit.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Dup128VecFromValues(
             du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
             BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
             BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
             BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
}

// 32-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  const typename detail::Raw128<TFromD<D>>::type vec = {t0, t1, t2, t3};
  return VFromD<D>{vec};
}

// 64-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  const typename detail::Raw128<TFromD<D>>::type vec = {t0, t1};
  return VFromD<D>{vec};
}
// ================================================== LOGICAL

// All bitwise ops are performed in the unsigned domain so a single
// implementation covers every lane type.

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  // vec_andc(a, b) computes a & ~b, hence the swapped arguments.
  return BitCast(
      d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  // nor(x, x) == ~x.
  return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
}
// ------------------------------ IsConstantRawAltivecVect
namespace detail {

// True if every lane of the raw vector is a compile-time constant, per
// __builtin_constant_p. One overload per lane size; the lane count follows
// from 16 bytes / lane size.
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<1> /* lane_size_tag */, RawV raw) {
  return __builtin_constant_p(raw[0]) && __builtin_constant_p(raw[1]) &&
         __builtin_constant_p(raw[2]) && __builtin_constant_p(raw[3]) &&
         __builtin_constant_p(raw[4]) && __builtin_constant_p(raw[5]) &&
         __builtin_constant_p(raw[6]) && __builtin_constant_p(raw[7]) &&
         __builtin_constant_p(raw[8]) && __builtin_constant_p(raw[9]) &&
         __builtin_constant_p(raw[10]) && __builtin_constant_p(raw[11]) &&
         __builtin_constant_p(raw[12]) && __builtin_constant_p(raw[13]) &&
         __builtin_constant_p(raw[14]) && __builtin_constant_p(raw[15]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<2> /* lane_size_tag */, RawV raw) {
  return __builtin_constant_p(raw[0]) && __builtin_constant_p(raw[1]) &&
         __builtin_constant_p(raw[2]) && __builtin_constant_p(raw[3]) &&
         __builtin_constant_p(raw[4]) && __builtin_constant_p(raw[5]) &&
         __builtin_constant_p(raw[6]) && __builtin_constant_p(raw[7]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<4> /* lane_size_tag */, RawV raw) {
  return __builtin_constant_p(raw[0]) && __builtin_constant_p(raw[1]) &&
         __builtin_constant_p(raw[2]) && __builtin_constant_p(raw[3]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<8> /* lane_size_tag */, RawV raw) {
  return __builtin_constant_p(raw[0]) && __builtin_constant_p(raw[1]);
}

// Dispatcher: selects the overload above based on the lane size.
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(RawV raw) {
  return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(raw[0]))>(),
                                  raw);
}

}  // namespace detail
// ------------------------------ TernaryLogic
#if HWY_PPC_HAVE_10
namespace detail {

// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
// order of the kTernLogOp bits of AVX3
// _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
template <uint8_t kTernLogOp, class V>
HWY_INLINE V TernaryLogic(V a, V b, V c) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;
#if HWY_COMPILER_GCC_ACTUAL
  // Use inline assembly on GCC to work around GCC compiler bug
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
            "n"(static_cast<unsigned>(kTernLogOp))
          :);
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif
  return BitCast(d, VU{raw_ternlog_result});
}

}  // namespace detail
#endif  // HWY_PPC_HAVE_10
// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If at least two operands are compile-time constants, plain Xor lets the
  // compiler fold them; otherwise a single xxeval is better.
  const int num_const =
      static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
      static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
      static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw));
  if (num_const >= 2) {
    return Xor(x1, Xor(x2, x3));
  } else  // NOLINT
#endif
  {
    // 0x69 == three-way XOR in PPC10 bit order.
    return detail::TernaryLogic<0x69>(x1, x2, x3);
  }
#else
  return Xor(x1, Xor(x2, x3));
#endif
}
// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // Prefer plain Or when constant folding can apply (>= 2 constant operands).
  const int num_const =
      static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
      static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
      static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw));
  if (num_const >= 2) {
    return Or(o1, Or(o2, o3));
  } else  // NOLINT
#endif
  {
    // 0x7F == three-way OR in PPC10 bit order.
    return detail::TernaryLogic<0x7F>(o1, o2, o3);
  }
#else
  return Or(o1, Or(o2, o3));
#endif
}
// ------------------------------ OrAnd

// Returns o | (a1 & a2).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If both And operands are constants, let the compiler fold the And.
  if (detail::IsConstantRawAltivecVect(a1.raw) &&
      detail::IsConstantRawAltivecVect(a2.raw)) {
    return Or(o, And(a1, a2));
  } else  // NOLINT
#endif
  {
    // 0x1F == a | (b & c) in PPC10 bit order.
    return detail::TernaryLogic<0x1F>(o, a1, a2);
  }
#else
  return Or(o, And(a1, a2));
#endif
}
// ------------------------------ IfVecThenElse

// Per-bit select: (mask & yes) | (~mask & no), via vec_sel.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
                                      BitCast(du, mask).raw)});
}
// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

// Per-bit select with vector (not mask) selector.
template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}
// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  return Xor(a, b);
}
// ================================================== SIGN

// ------------------------------ Neg
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  // If T is an signed integer type, use Zero(d) - v instead of vec_neg to
  // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
  const DFromV<decltype(v)> d;
  return Zero(d) - v;
}

template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  // No float vec_neg on Z14; flip the sign bit instead.
  return Xor(v, SignBit(DFromV<decltype(v)>()));
#else
  return Vec128<T, N>{vec_neg(v.raw)};
#endif
}

// bf16/f16: negation is a sign-bit flip.
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}
// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <class T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
  // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
  return Max(v, Neg(v));
}

template <class T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  return Vec128<T, N>{vec_abs(v.raw)};
}
// ------------------------------ CopySign

#if HWY_S390X_HAVE_Z14
// Z14 lacks a copysign intrinsic; select the sign bit bitwise.
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);

  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
  // 0 0 0 | 0
  // 0 0 1 | 0
  // 0 1 0 | 1
  // 0 1 1 | 1
  // 1 0 0 | 0
  // 1 0 1 | 1
  // 1 1 0 | 0
  // 1 1 1 | 1
  return BitwiseIfThenElse(msb, sign, magn);
}
#else  // VSX
template <size_t N>
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
                                  Vec128<float, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
  return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
#else
  // Note the reversed argument order of vec_cpsgn: (sign, magnitude).
  return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
                                   Vec128<double, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
  return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
#else
  // Note the reversed argument order of vec_cpsgn: (sign, magnitude).
  return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}
#endif  // HWY_S390X_HAVE_Z14
// Like CopySign, but the caller guarantees abs has a cleared sign bit.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  // PPC8 can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}
// ================================================== MEMORY (1)

// Note: type punning is safe because the types are tagged with may_alias.
// (https://godbolt.org/z/fqrWjfjsP)

// ------------------------------ Load

// Full 128-bit aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
  using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
  const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
  using ResultRaw = typename detail::Raw128<T>::type;
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
}
// Any <= 64 bit: copy the bytes into an integer, then splat.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
  using BitsT = UnsignedFromSize<d.MaxBytes()>;

  const Repartition<BitsT, decltype(d)> d_bits;
  BitsT bits;
  CopyBytes<d.MaxBytes()>(p, &bits);
  return BitCast(d, Set(d_bits, bits));
}
// ================================================== MASK

// ------------------------------ Mask

// Mask and Vec are both backed by vector types (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  using RawBool = typename detail::Raw128<T>::RawBoolVec;
  return Mask128<T, N>{reinterpret_cast<RawBool>(v.raw)};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));
// Reinterprets a mask as an all-ones/all-zero vector.
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> m) {
  using Raw = typename detail::Raw128<T>::type;
  return Vec128<T, N>{reinterpret_cast<Raw>(m.raw)};
}

template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> m) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(m.raw)};
}
// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{vec_sel(
                        BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  // nor(m, m) == ~m.
  return Mask128<T, N>{vec_nor(m.raw, m.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw & b.raw};
#else
  return Mask128<T, N>{vec_and(a.raw, b.raw)};
#endif
}

// Returns ~a & b (note vec_andc computes arg1 & ~arg2).
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_andc(b.raw, a.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw | b.raw};
#else
  return Mask128<T, N>{vec_or(a.raw, b.raw)};
#endif
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw ^ b.raw};
#else
  return Mask128<T, N>{vec_xor(a.raw, b.raw)};
#endif
}

// ~(a | b): true where neither input is true.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_nor(a.raw, b.raw)};
}
// ------------------------------ ShiftLeftSame

// Shifts all lanes left by the same (runtime) bit count.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  const auto shift_counts = Set(du, static_cast<TU>(bits));
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VFromD<decltype(du)>{BitCast(du, v).raw
                                         << shift_counts.raw});
#else
  // Do an unsigned vec_sl operation to avoid undefined behavior
  return BitCast(
      d, VFromD<decltype(du)>{vec_sl(BitCast(du, v).raw, shift_counts.raw)});
#endif
}
// ------------------------------ ShiftRightSame

// Unsigned: logical (zero-filling) shift right.
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
#else
  return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}

// Signed: arithmetic (sign-extending) shift right.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
#if HWY_S390X_HAVE_Z14
  using TI = typename detail::Raw128<T>::RawT;
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
#else
  // vec_sra takes an unsigned shift count.
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}
// ------------------------------ ShiftLeft

// Compile-time shift count; validated via static_assert.
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return ShiftLeftSame(v, kBits);
}

// ------------------------------ ShiftRight

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return ShiftRightSame(v, kBits);
}

// ------------------------------ BroadcastSignBit

// Fills every lane with copies of its sign bit (arithmetic shift by
// lane-width - 1).
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
}
// ================================================== SWIZZLE (1)

// ------------------------------ TableLookupBytes

// For each byte index in `from`, selects that byte of `bytes`.
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
                                        Vec128<TI, NI> from) {
  const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
  using RawOut = typename detail::Raw128<TI>::type;
  return Vec128<TI, NI>{reinterpret_cast<RawOut>(
      vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
}
// ------------------------------ TableLookupBytesOr0

// For all vector widths; Altivec/VSX needs zero out
// (indices with the MSB set yield 0 instead of an arbitrary byte).
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  const DFromV<VI> di;
  const Repartition<int8_t, decltype(di)> di8;
  // BroadcastSignBit yields FF for indices with MSB set, 0 otherwise.
  const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
  return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
}
// ------------------------------ Reverse

// Reverses the order of all lanes of a full vector.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
  return Vec128<T>{vec_reve(v.raw)};
}
// ------------------------------ Shuffles (Reverse)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0, 1, 2,  3,
                                           12, 13, 14, 15, 8, 9, 10, 11};
  return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
}
// These are used by generic_ops-inl to implement LoadInterleaved3. As with
// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
// comes from the first argument.
namespace detail {

// Byte indices 0..15 select from `a`, 16..31 from `b`.

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0,  1,  2,  3,
                                           28, 29, 30, 31, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {0, 3, 18, 17};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  12, 13, 14, 15,
                                           24, 25, 26, 27, 20, 21, 22, 23};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {2, 1, 16, 19};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {8,  9,  10, 11, 4,  5,  6,  7,
                                           16, 17, 18, 19, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

}  // namespace detail
// Swap 64-bit halves
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  // Equivalent to reversing the two u64 halves.
  const Full128<T> d;
  const Full128<uint64_t> du64;
  return BitCast(d, Reverse(du64, BitCast(du64, v)));
}
template <class T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}

// Rotate right 32 bits
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
  // vec_sld rotates left in memory order, which depends on endianness.
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#else
  return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#endif
}

// Rotate left 32 bits
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#else
  return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#endif
}

// Reverse all four 32-bit lanes.
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}
// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// Reinterprets a mask for lanes of equal size but different type.
template <class DTo, typename TFrom, size_t NFrom>
HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}
// True in lanes where all of the bits in `bit` are set in `v`.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}
// ------------------------------ Equality

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
}
// ------------------------------ Inequality
// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
// U8 lane-wise inequality; PPC9 has a native compare-not-equal.
template <size_t N>
HWY_API Mask128<uint8_t, N>
operator!=(Vec128<uint8_t, N> a,
Vec128<uint8_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
// U16 lane-wise inequality; PPC9 has a native compare-not-equal.
template <size_t N>
HWY_API Mask128<uint16_t, N>
operator!=(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
// U32 lane-wise inequality; PPC9 has a native compare-not-equal.
template <size_t N>
HWY_API Mask128<uint32_t, N>
operator!=(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
// U64 lane-wise inequality; no 64-bit vec_cmpne, so negate the equality.
template <size_t N>
HWY_API Mask128<uint64_t, N>
operator!=(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
return Not(a == b);
}
// I8 lane-wise inequality; PPC9 has a native compare-not-equal.
template <size_t N>
HWY_API Mask128<int8_t, N>
operator!=(Vec128<int8_t, N> a,
Vec128<int8_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
// I16 lane-wise inequality; PPC9 has a native compare-not-equal.
template <size_t N>
HWY_API Mask128<int16_t, N>
operator!=(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
// I32 lane-wise inequality; PPC9 has a native compare-not-equal.
template <size_t N>
HWY_API Mask128<int32_t, N>
operator!=(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
// I64 lane-wise inequality; no 64-bit vec_cmpne, so negate the equality.
template <size_t N>
HWY_API Mask128<int64_t, N>
operator!=(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
return Not(a == b);
}
// F32 lane-wise inequality via negated equality (NaN != NaN holds because
// vec_cmpeq on NaN yields false).
template <size_t N>
HWY_API Mask128<
float, N>
operator!=(Vec128<
float, N> a, Vec128<
float, N> b) {
return Not(a == b);
}
// F64 lane-wise inequality via negated equality (NaN != NaN holds because
// vec_cmpeq on NaN yields false).
template <size_t N>
HWY_API Mask128<
double, N>
operator!=(Vec128<
double, N> a,
Vec128<
double, N> b) {
return Not(a == b);
}
// ------------------------------ Strict inequality
// Lane-wise a[i] > b[i]; signedness/float handling comes from vec_cmpgt's
// overload for the lane type.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE Mask128<T, N>
operator>(Vec128<T, N> a, Vec128<T, N> b) {
return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)};
}
// ------------------------------ Weak inequality
// Float lane-wise a[i] >= b[i] using the native compare (false if either
// operand is NaN, as vec_cmpge yields false there).
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N>
operator>=(Vec128<T, N> a, Vec128<T, N> b) {
return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
}
// Integer lane-wise a[i] >= b[i]: !(b > a) is exact for totally-ordered
// integer lanes.
template <
typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Mask128<T, N>
operator>=(Vec128<T, N> a, Vec128<T, N> b) {
return Not(b > a);
}
// ------------------------------ Reversed comparisons
// a < b is implemented as the reversed b > a.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N>
operator<(Vec128<T, N> a, Vec128<T, N> b) {
return b > a;
}
// a <= b is implemented as the reversed b >= a.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N>
operator<=(Vec128<T, N> a, Vec128<T, N> b) {
return b >= a;
}
// ================================================== MEMORY (2)
// ------------------------------ Load
// Unaligned full-vector load: go through the target's "unaligned" raw vector
// type so the compiler emits an unaligned load, then cast to the lane type.
template <
class D, HWY_IF_V_SIZE_D(D, 16),
typename T = TFromD<D>>
HWY_API Vec128<T> LoadU(D
/* tag */, const T* HWY_RESTRICT p) {
using LoadRaw =
typename detail::Raw128<T>::UnalignedRawVec;
const LoadRaw* HWY_RESTRICT praw =
reinterpret_cast<
const LoadRaw*>(p);
using ResultRaw =
typename detail::Raw128<T>::type;
return Vec128<T>{
reinterpret_cast<ResultRaw>(*praw)};
}
// For < 128 bit, LoadU == Load.
// For < 128 bit, LoadU == Load.
template <
class D, HWY_IF_V_SIZE_LE_D(D, 8),
typename T = TFromD<D>>
HWY_API VFromD<D> LoadU(D d,
const T* HWY_RESTRICT p) {
return Load(d, p);
}
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <
class D,
typename T = TFromD<D>>
HWY_API VFromD<D> LoadDup128(D d,
const T* HWY_RESTRICT p) {
return LoadU(d, p);
}
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif
// Loads at most max_lanes_to_load lanes from p (the rest are zero), using
// the native length-limited vector loads available on PPC9+/Z14+.
template <
class D,
typename T = TFromD<D>>
HWY_API VFromD<D> LoadN(D d,
const T* HWY_RESTRICT p,
size_t max_lanes_to_load) {
// When the count is a compile-time constant, fold the two easy cases
// (zero lanes, full vector) so no length-limited instruction is emitted.
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
return Zero(d);
}
if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
return LoadU(d, p);
}
#endif
const size_t num_of_bytes_to_load =
HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) *
sizeof(TFromD<D>);
const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
// vec_load_len takes the index of the last byte to load (hence the -1),
// so a zero-byte load must be special-cased to return Zero.
return (num_of_bytes_to_load > 0)
? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
const_cast<
unsigned char*>(
reinterpret_cast<
const unsigned char*>(p)),
static_cast<
unsigned>(num_of_bytes_to_load - 1))})
: Zero(d);
#else
// vec_xl_len takes a byte count and handles zero directly.
return BitCast(
d,
VFromD<decltype(du8)>{vec_xl_len(
const_cast<
unsigned char*>(
reinterpret_cast<
const unsigned char*>(p)),
num_of_bytes_to_load)});
#endif
}
// Like LoadN, but lanes beyond max_lanes_to_load are taken from `no`
// instead of zero.
template <
class D,
typename T = TFromD<D>>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d,
const T* HWY_RESTRICT p,
size_t max_lanes_to_load) {
// Constant-count fast paths, mirroring LoadN.
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
return no;
}
if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
return LoadU(d, p);
}
#endif
// LoadN zeroes the tail lanes; blend them back from `no`.
return IfThenElse(FirstN(d, max_lanes_to_load),
LoadN(d, p, max_lanes_to_load), no);
}
#endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
// Returns a vector with lane i=[0, N) set to "first" + i.
namespace detail {
// Returns {0, 1, ..., 15} for 8-bit lanes, via a compile-time constant.
template <
class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector
unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
}
// Returns {0, 1, ..., 7} for 16-bit lanes, via a compile-time constant.
template <
class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector
unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
}
// Returns {0, 1, 2, 3} for 32-bit integer lanes.
template <
class D, HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector
unsigned int kU32Iota0 = {0, 1, 2, 3};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
}
// Returns {0, 1} for 64-bit integer lanes.
template <
class D, HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector
unsigned long long kU64Iota0 = {0, 1};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
}
// Returns {0.0f, 1.0f, 2.0f, 3.0f}; no BitCast needed for float lanes.
template <
class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D
/*d*/) {
constexpr __vector
float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
return VFromD<D>{kF32Iota0};
}
// Returns {0.0, 1.0}; no BitCast needed for double lanes.
template <
class D, HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D
/*d*/) {
constexpr __vector
double kF64Iota0 = {0.0, 1.0};
return VFromD<D>{kF64Iota0};
}
}
// namespace detail
// Returns a vector with lane i set to `first` + i (cast to the lane type).
template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  const VFromD<D> base = detail::Iota0(d);
  const VFromD<D> offset = Set(d, static_cast<TFromD<D>>(first));
  return base + offset;
}
// ------------------------------ FirstN (Iota, Lt)
// Returns a mask with the first `num` lanes true, the rest false.
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // Lane index < num, computed in the unsigned domain then rebound to d.
  const VFromD<decltype(du)> bound = Set(du, static_cast<TU>(num));
  return RebindMask(d, Iota(du, 0) < bound);
}
// ------------------------------ MaskedLoad
// Loads a full vector and zeroes lanes where the mask is false. Note: reads
// all bytes of the vector regardless of the mask.
template <
class D,
typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
const T* HWY_RESTRICT p) {
return IfThenElseZero(m, LoadU(d, p));
}
// ------------------------------ MaskedLoadOr
// Loads a full vector and substitutes lanes of `v` where the mask is false.
// Note: reads all bytes of the vector regardless of the mask.
template <
class D,
typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
const T* HWY_RESTRICT p) {
return IfThenElse(m, LoadU(d, p), v);
}
// ------------------------------ Store
// Aligned full-vector store via the target's aligned raw vector type.
template <
class D, HWY_IF_V_SIZE_D(D, 16),
typename T = TFromD<D>>
HWY_API
void Store(Vec128<T> v, D
/* tag */, T* HWY_RESTRICT aligned) {
using StoreRaw =
typename detail::Raw128<T>::AlignedRawVec;
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) =
reinterpret_cast<StoreRaw>(v.raw);
}
// Unaligned full-vector store via the target's unaligned raw vector type.
template <
class D, HWY_IF_V_SIZE_D(D, 16),
typename T = TFromD<D>>
HWY_API
void StoreU(Vec128<T> v, D
/* tag */, T* HWY_RESTRICT p) {
using StoreRaw =
typename detail::Raw128<T>::UnalignedRawVec;
*
reinterpret_cast<StoreRaw*>(p) =
reinterpret_cast<StoreRaw>(v.raw);
}
// Partial-vector store: extract the vector's bytes as a single integer of
// the partial size, then copy only that many bytes to p.
template <
class D, HWY_IF_V_SIZE_LE_D(D, 8),
typename T = TFromD<D>>
HWY_API
void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;
const Repartition<BitsT, decltype(d)> d_bits;
const BitsT bits = GetLane(BitCast(d_bits, v));
CopyBytes<d.MaxBytes()>(&bits, p);
}
// For < 128 bit, StoreU == Store.
// For < 128 bit, StoreU == Store.
template <
class D, HWY_IF_V_SIZE_LE_D(D, 8),
typename T = TFromD<D>>
HWY_API
void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
Store(v, d, p);
}
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
// Stores at most max_lanes_to_store lanes to p, using the native
// length-limited vector stores available on PPC9+/Z14+.
template <
class D,
typename T = TFromD<D>>
HWY_API
void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
size_t max_lanes_to_store) {
// When the count is a compile-time constant, fold the two easy cases
// (zero lanes, full vector) so no length-limited instruction is emitted.
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
return;
}
if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) &&
max_lanes_to_store >= HWY_MAX_LANES_D(D)) {
StoreU(v, d, p);
return;
}
#endif
const size_t num_of_bytes_to_store =
HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) *
sizeof(TFromD<D>);
const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
// vec_store_len takes the index of the last byte to store (hence the -1),
// so a zero-byte store must be skipped entirely.
if (num_of_bytes_to_store > 0) {
vec_store_len(BitCast(du8, v).raw,
reinterpret_cast<
unsigned char*>(p),
static_cast<
unsigned>(num_of_bytes_to_store - 1));
}
#else
// vec_xst_len takes a byte count and handles zero directly.
vec_xst_len(BitCast(du8, v).raw,
reinterpret_cast<
unsigned char*>(p),
num_of_bytes_to_store);
#endif
}
#endif
// ------------------------------ BlendedStore
// Stores only the lanes where the mask is true, leaving others untouched in
// memory. Scalar fallback: spill vector and mask to stacks, copy per lane.
template <
class D>
HWY_API
void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
TFromD<D>* HWY_RESTRICT p) {
const RebindToSigned<decltype(d)> di;
// for testing mask if T=bfloat16_t.
using TI = TFromD<decltype(di)>;
alignas(16) TI buf[MaxLanes(d)];
alignas(16) TI mask[MaxLanes(d)];
Store(BitCast(di, v), di, buf);
Store(BitCast(di, VecFromMask(d, m)), di, mask);
for (size_t i = 0; i < MaxLanes(d); ++i) {
if (mask[i]) {
CopySameSize(buf + i, p + i);
}
}
}
// ================================================== ARITHMETIC
namespace detail {
// If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
// rebinds D to MakeUnsigned<TFromD<D>>.
// Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
// detail::RebindToUnsignedIfNotFloat<D> is the same as D.
// Used by operator+/-/* below so signed integer arithmetic is performed in
// the unsigned domain, where wraparound is well-defined.
template <
class D>
using RebindToUnsignedIfNotFloat =
hwy::
If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
RebindToUnsigned<D>, D>;
}
// namespace detail
// ------------------------------ Addition
// Lane-wise addition; wraps for integers, IEEE add for floats.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N>
operator+(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
// If T is an integer type, do an unsigned vec_add to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
// Z14 path: the raw vector types support operator+ directly.
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
BitCast(d_arith, b).raw});
#else
return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
// ------------------------------ Subtraction
// Lane-wise subtraction; wraps for integers, IEEE subtract for floats.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
// If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
// Z14 path: the raw vector types support operator- directly.
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
BitCast(d_arith, b).raw});
#else
return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
// ------------------------------ SumsOf8
// Sums each group of 8 consecutive u8 lanes into one u64 lane.
template <class V, HWY_IF_U8(TFromV<V>)>
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
  // Two widening pairwise sums: u8 -> u32 groups of 4, then u32 -> u64.
  const auto sums_of_4 = SumsOf4(v);
  return SumsOf2(sums_of_4);
}
// Sums each group of 8 consecutive i8 lanes into one i64 lane.
template <
class V, HWY_IF_I8(TFromV<V>)>
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
#if HWY_S390X_HAVE_Z14
// Z14: reuse the unsigned path by biasing each lane by 128 (Xor with the
// sign bit), then subtract 8 * 128 = 1024 per group of 8 lanes.
const DFromV<decltype(v)> di8;
const RebindToUnsigned<decltype(di8)> du8;
const RepartitionToWideX3<decltype(di8)> di64;
return BitCast(di64, SumsOf8(BitCast(du8,
Xor(v, SignBit(di8))))) +
Set(di64, int64_t{-1024});
#else
return SumsOf2(SumsOf4(v));
#endif
}
// ------------------------------ SaturatedAdd
// Returns a + b clamped to the destination range.
#if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
// other integer SIMD instruction sets
// Unsigned saturating add: clamp b to the headroom Not(a) = max - a before
// adding, so the wrap can never occur.
template <
typename T, size_t N, HWY_IF_UNSIGNED(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
return Add(a, Min(b,
Not(a)));
}
// Signed saturating add: overflow occurred iff a and b have the same sign
// but the sum's sign differs; then select min/max based on a's sign.
template <
typename T, size_t N, HWY_IF_SIGNED(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const auto sum = Add(a, b);
// Sign bit set iff (a, b same sign) && (a, sum differ in sign).
const auto overflow_mask = AndNot(
Xor(a, b),
Xor(a, sum));
// LimitsMax if a >= 0, LimitsMin if a < 0.
const auto overflow_result =
Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
#else // VSX
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif
#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif
// VSX has native saturating adds for 8/16/32-bit signed and unsigned lanes.
template <
typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{vec_adds(a.raw, b.raw)};
}
#endif // HWY_S390X_HAVE_Z14
#if HWY_PPC_HAVE_10
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif
// I64 saturating add on PPC10, using a ternary-logic op to compute the
// overflow condition in one instruction.
template <
class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
const DFromV<decltype(a)> d;
const auto sum = Add(a, b);
// TernaryLogic<0x42> computes (a ^ sum) & ~(a ^ b): overflow iff a and b
// agree in sign but the sum does not.
const auto overflow_mask =
BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum));
// LimitsMax if a >= 0, LimitsMin if a < 0.
const auto overflow_result =
Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
#endif // HWY_PPC_HAVE_10
// ------------------------------ SaturatedSub
// Returns a - b clamped to the destination range.
#if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
// other integer SIMD instruction sets
// Unsigned saturating subtract: clamp b to at most a before subtracting, so
// the result can never wrap below zero.
template <
typename T, size_t N, HWY_IF_UNSIGNED(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
return Sub(a, Min(a, b));
}
// Signed saturating subtract: overflow occurred iff a and b differ in sign
// and the difference's sign differs from a's; then select min/max by a's
// sign.
template <
typename T, size_t N, HWY_IF_SIGNED(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const auto diff = Sub(a, b);
// Sign bit set iff (a, b differ in sign) && (a, diff differ in sign).
const auto overflow_mask =
And(
Xor(a, b),
Xor(a, diff));
// LimitsMax if a >= 0, LimitsMin if a < 0.
const auto overflow_result =
Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
#else // VSX
// VSX has native saturating subtracts for 8/16/32-bit lanes.
template <
typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{vec_subs(a.raw, b.raw)};
}
#endif // HWY_S390X_HAVE_Z14
#if HWY_PPC_HAVE_10
// I64 saturating subtract on PPC10, using a ternary-logic op for the
// overflow condition.
template <
class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
const DFromV<decltype(a)> d;
const auto diff = Sub(a, b);
// TernaryLogic<0x18> computes (a ^ b) & (a ^ diff): overflow iff a and b
// differ in sign and diff's sign differs from a's.
const auto overflow_mask =
BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff));
// LimitsMax if a >= 0, LimitsMin if a < 0.
const auto overflow_result =
Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
#endif // HWY_PPC_HAVE_10
// ------------------------------ AverageRound
// Returns (a + b + 1) / 2
template <
typename T, size_t N, HWY_IF_UNSIGNED(T),
HWY_IF_T_SIZE_ONE_OF(T, 0x6)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{vec_avg(a.raw, b.raw)};
}
// ------------------------------ Multiplication
// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif
// Lane-wise multiplication (low half for integers, IEEE for floats).
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N>
operator*(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
// If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
// Z14 path: the raw vector types support operator* directly.
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
BitCast(d_arith, b).raw});
#else
return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
// Returns the upper 16 bits of a * b in each lane.
// Returns the upper 16 bits of a * b in each 16-bit lane.
template <
typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
#if HWY_S390X_HAVE_Z14
return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
#else
// VSX lacks a 16-bit multiply-high: form full 32-bit products of the
// even and odd lanes, then gather each product's upper 16 bits back into
// lane order with a byte permute.
const DFromV<decltype(a)> d;
const RepartitionToWide<decltype(d)> dw;
const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)};
const VFromD<decltype(dw)> p2{vec_mulo(a.raw, b.raw)};
// Byte indices 0..15 select from p1, 16..31 from p2; the pairs pick the
// high half of each 32-bit product, in lane order for this endianness.
#if HWY_IS_LITTLE_ENDIAN
const __vector
unsigned char kShuffle = {2, 3, 18, 19, 6, 7, 22, 23,
10, 11, 26, 27, 14, 15, 30, 31};
#else
const __vector
unsigned char kShuffle = {0, 1, 16, 17, 4, 5, 20, 21,
8, 9, 24, 25, 12, 13, 28, 29};
#endif
return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
#endif
}
// Multiplies even lanes (0, 2, ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
// Multiplies even lanes (0, 2, ..) and returns the double-wide results.
template <
typename T, size_t N,
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
Vec128<T, N> b) {
return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};
}
// Multiplies odd lanes (1, 3, ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
// Multiplies odd lanes (1, 3, ..) and returns the double-wide results.
template <
typename T, size_t N,
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
Vec128<T, N> b) {
return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
}
// ------------------------------ RotateRight
// Rotates each lane right by the compile-time constant kBits.
template <
int kBits,
typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(
const Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
constexpr size_t kSizeInBits =
sizeof(T) * 8;
static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
// vec_rl rotates left only, and a full-width rotate amount would be
// invalid, so the zero case must return early.
if (kBits == 0)
return v;
// Do an unsigned vec_rl operation to avoid undefined behavior
// Rotate left by (size - kBits) == rotate right by kBits.
return BitCast(d, VFromD<decltype(du)>{vec_rl(
BitCast(du, v).raw, Set(du, kSizeInBits - kBits).raw)});
}
// ------------------------------ ZeroIfNegative (BroadcastSignBit)
// Replaces negative lanes (sign bit set) with zero; positives pass through.
template <
typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
static_assert(IsFloat<T>(),
"Only works for float");
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
// Broadcasting the sign bit yields all-ones in negative lanes.
const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
return IfThenElse(mask, Zero(d), v);
}
// ------------------------------ IfNegativeThenElse
// Per lane: yes if v is negative (sign bit set), else no.
template <
typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
Vec128<T, N> no) {
static_assert(IsSigned<T>(),
"Only works for signed/float");
const DFromV<decltype(v)> d;
#if HWY_PPC_HAVE_10
// PPC10 vec_blendv selects per lane by the sign bit of the control vector.
const RebindToUnsigned<decltype(d)> du;
return BitCast(
d, VFromD<decltype(du)>{vec_blendv(
BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
#else
// Fallback: broadcast the sign bit to a full-lane mask, then select.
const RebindToSigned<decltype(d)> di;
return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
#endif
}
// generic_ops takes care of integer T.
// Returns |a - b| for float lanes; generic_ops handles the integer case.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
  const Vec128<T, N> diff = a - b;
  return Abs(diff);
}
// ------------------------------ Floating-point multiply-add variants
// Returns mul * x + add
// Returns mul * x + add as a single fused multiply-add.
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
Vec128<T, N> add) {
return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
}
// Returns add - mul * x
// Returns add - mul * x as a single fused operation.
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
Vec128<T, N> add) {
// NOTE: the vec_nmsub operation below computes -(mul * x - add),
// which is equivalent to add - mul * x in the round-to-nearest
// and round-towards-zero rounding modes
return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
}
// Returns mul * x - sub
// Returns mul * x - sub as a single fused multiply-subtract.
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
Vec128<T, N> sub) {
return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
}
// Returns -mul * x - sub
// Returns -mul * x - sub as a single fused operation.
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
Vec128<T, N> sub) {
// NOTE: The vec_nmadd operation below computes -(mul * x + sub),
// which is equivalent to -mul * x - sub in the round-to-nearest
// and round-towards-zero rounding modes
return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
}
// ------------------------------ Floating-point div
// Approximate reciprocal
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif
// Lane-wise floating-point division.
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N>
operator/(Vec128<T, N> a, Vec128<T, N> b) {
#if HWY_S390X_HAVE_Z14
// Z14 raw vector types support operator/ directly.
return Vec128<T, N>{a.raw / b.raw};
#else
return Vec128<T, N>{vec_div(a.raw, b.raw)};
#endif
}
// Approximate 1/v; Z14 lacks a reciprocal-estimate instruction, so it falls
// back to a full-precision divide.
template <
typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
const DFromV<decltype(v)> d;
return Set(d, T(1.0)) / v;
#else
return Vec128<T, N>{vec_re(v.raw)};
#endif
}
// ------------------------------ Floating-point square root
#if HWY_S390X_HAVE_Z14
// Approximate reciprocal square root
// Approximate 1/sqrt(v) for F32; Z14 has no estimate instruction, so use
// the classic "fast inverse square root" bit trick plus one refinement.
template <size_t N>
HWY_API Vec128<
float, N> ApproximateReciprocalSqrt(Vec128<
float, N> v) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
const auto half = v * Set(d, 0.5f);
// Initial guess based on log2(f)
// (the 0x5F3759DF magic-constant method, operating on the bit pattern).
const auto guess = BitCast(
d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
// One Newton-Raphson iteration
return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
}
#else // VSX
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif
// Approximate reciprocal square root
// Approximate 1/sqrt(v) via the native VSX estimate instruction.
template <
class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
return Vec128<T, N>{vec_rsqrte(v.raw)};
}
#endif // HWY_S390X_HAVE_Z14
// Full precision square root
// Full precision square root
template <
class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
return Vec128<T, N>{vec_sqrt(v.raw)};
}
// ------------------------------ Min (Gt, IfThenElse)
// Lane-wise minimum via the native vec_min for the lane type.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{vec_min(a.raw, b.raw)};
}
// ------------------------------ Max (Gt, IfThenElse)
// Lane-wise maximum via the native vec_max for the lane type.
template <
typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{vec_max(a.raw, b.raw)};
}
// ------------------------------- Integer AbsDiff for PPC9/PPC10
#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif
// Unsigned |a - b| via the native PPC9 absolute-difference instruction
// (8/16/32-bit lanes).
template <
class V, HWY_IF_UNSIGNED_V(V),
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API V AbsDiff(
const V a,
const V b) {
return V{vec_absd(a.raw, b.raw)};
}
// U64 |a - b|: no 64-bit vec_absd, so compute max - min instead.
template <
class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V AbsDiff(
const V a,
const V b) {
return Sub(Max(a, b), Min(a, b));
}
// Signed |a - b| as max - min, which cannot overflow in the lane type when
// interpreted with wraparound arithmetic.
template <
class V, HWY_IF_SIGNED_V(V)>
HWY_API V AbsDiff(
const V a,
const V b) {
return Sub(Max(a, b), Min(a, b));
}
#endif // HWY_PPC_HAVE_9
// ------------------------------ Integer Div for PPC10
#if HWY_PPC_HAVE_10
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif
// I32 lane-wise division via the PPC10 vdivsw instruction.
template <size_t N>
HWY_API Vec128<int32_t, N>
operator/(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
// Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
// undefined behavior if b[i] == 0 or
// (a[i] == LimitsMin<int32_t>() && b[i] == -1)
// Clang will also optimize out I32 vec_div on PPC10 if optimizations are
// enabled and any of the lanes of b are known to be zero (even in the unused
// lanes of a partial vector)
__vector
signed int raw_result;
__asm__(
"vdivsw %0,%1,%2" :
"=v"(raw_result) :
"v"(a.raw),
"v"(b.raw));
return Vec128<int32_t, N>{raw_result};
}
// U32 lane-wise division via the PPC10 vdivuw instruction.
template <size_t N>
HWY_API Vec128<uint32_t, N>
operator/(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
// Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
// undefined behavior if b[i] == 0
// Clang will also optimize out U32 vec_div on PPC10 if optimizations are
// enabled and any of the lanes of b are known to be zero (even in the unused
// lanes of a partial vector)
__vector
unsigned int raw_result;
__asm__(
"vdivuw %0,%1,%2" :
"=v"(raw_result) :
"v"(a.raw),
"v"(b.raw));
return Vec128<uint32_t, N>{raw_result};
}
// I64 lane-wise division via the PPC10 vdivsd instruction.
template <size_t N>
HWY_API Vec128<int64_t, N>
operator/(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
// Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
// undefined behavior if b[i] == 0 or
// (a[i] == LimitsMin<int64_t>() && b[i] == -1)
// Clang will also optimize out I64 vec_div on PPC10 if optimizations are
// enabled and any of the lanes of b are known to be zero (even in the unused
// lanes of a partial vector)
__vector
signed long long raw_result;
__asm__(
"vdivsd %0,%1,%2" :
"=v"(raw_result) :
"v"(a.raw),
"v"(b.raw));
return Vec128<int64_t, N>{raw_result};
}
// U64 lane-wise division via the PPC10 vdivud instruction.
template <size_t N>
HWY_API Vec128<uint64_t, N>
operator/(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
// Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
// undefined behavior if b[i] == 0
// Clang will also optimize out U64 vec_div on PPC10 if optimizations are
// enabled and any of the lanes of b are known to be zero (even in the unused
// lanes of a partial vector)
__vector
unsigned long long raw_result;
__asm__(
"vdivud %0,%1,%2" :
"=v"(raw_result) :
"v"(a.raw),
"v"(b.raw));
return Vec128<uint64_t, N>{raw_result};
}
// 8/16-bit full-vector division: promote both halves to the next wider
// type, divide there, then demote back in order.
template <
class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T>
operator/(Vec128<T> a, Vec128<T> b) {
const DFromV<decltype(a)> d;
const RepartitionToWide<decltype(d)> dw;
return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
}
// 8/16-bit partial-vector division: the whole vector fits after promotion,
// so one promote/divide/demote round-trip suffices.
template <
class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N>
operator/(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const Rebind<MakeWide<T>, decltype(d)> dw;
return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
}
// I32 lane-wise modulo via the PPC10 vmodsw instruction.
template <size_t N>
HWY_API Vec128<int32_t, N>
operator%(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
// Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
// undefined behavior if b[i] == 0 or
// (a[i] == LimitsMin<int32_t>() && b[i] == -1)
// Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
// enabled and any of the lanes of b are known to be zero (even in the unused
// lanes of a partial vector)
__vector
signed int raw_result;
__asm__(
"vmodsw %0,%1,%2" :
"=v"(raw_result) :
"v"(a.raw),
"v"(b.raw));
return Vec128<int32_t, N>{raw_result};
}
template <size_t N>
HWY_API Vec128<uint32_t, N>
operator%(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
// Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
// undefined behavior if b[i] == 0
// Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
// enabled and any of the lanes of b are known to be zero (even in the unused
// lanes of a partial vector)
__vector
unsigned int raw_result;
__asm__(
"vmoduw %0,%1,%2" :
"=v"(raw_result) :
"v"(a.raw),
"v"(b.raw));
--> --------------------
--> maximum size reached
--> --------------------