// Copyright 2021 Google LLC
// Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Target-independent types/functions defined after target-specific ops.
// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
// the generic implementation here if native ops are already defined.
#include "hwy/base.h"
// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
// A preprocessor directive must be one logical line; the condition was split
// without a backslash continuation, which is invalid. Joined onto one line.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
// Deduced via GetLane in an unevaluated (decltype) context; never executed.
template <
class V>
using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
// type of functions that do not take a vector argument, or as an argument type
// if the function only has a template argument for D, or for explicit type
// names instead of auto. This may be a built-in type.
// Deduced via Zero(D()) in an unevaluated context; never executed.
template <
class D>
using Vec = decltype(Zero(D()));
// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
// Deduced via MaskFromVec(Zero(D())) in an unevaluated context.
template <
class D>
using Mask = decltype(MaskFromVec(Zero(D())));
// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  // First clamp from below, then from above.
  const V at_least_lo = Max(lo, v);
  return Min(at_least_lo, hi);
}
// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
// Shifts the concatenation hi:lo right by kLanes lanes, expressed as a byte
// shift via CombineShiftRightBytes.
template <size_t kLanes,
class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
// Convert the lane count to a byte count for the underlying byte shift.
constexpr size_t kBytes = kLanes *
sizeof(TFromD<D>);
static_assert(kBytes < 16,
"Shift count is per-block");
return CombineShiftRightBytes<kBytes>(d, hi, lo);
}
#endif
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  // Build the bit pattern in the unsigned domain, then reinterpret as T.
  const RebindToUnsigned<decltype(d)> du;
  const Vec<decltype(du)> sign = Set(du, SignMask<TFromD<D>>());
  return BitCast(d, sign);
}
// Returns quiet NaN.
template <
class D>
HWY_API Vec<D> NaN(D d) {
// Signed-integer domain so Set can hold the all-ones bit pattern.
const RebindToSigned<D> di;
// LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
// mantissa MSB (to indicate quiet) would be sufficient.
return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  const RebindToUnsigned<D> du;
  using T = TFromD<D>;
  using TU = TFromD<decltype(du)>;
  // Half of MaxExponentTimes2 has every exponent bit set and a zero sign and
  // mantissa, which is the bit pattern of +inf.
  const TU all_exp_bits = static_cast<TU>(MaxExponentTimes2<T>()) >> 1;
  return BitCast(d, Set(du, all_exp_bits));
}
// ------------------------------ ZeroExtendResizeBitCast
// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
// target is in emu128-inl.h, and the implementation of
// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h
#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
namespace detail {
#if HWY_HAVE_SCALABLE
// Scalable-vector path: resize via a u8 bitcast, then zero the tail bytes
// that were not present/valid in the source vector.
template <size_t kFromVectSize, size_t kToVectSize,
class DTo,
class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize>
/* from_size_tag */,
hwy::SizeTag<kToVectSize>
/* to_size_tag */, DTo d_to, DFrom d_from,
VFromD<DFrom> v) {
const Repartition<uint8_t, DTo> d_to_u8;
const auto resized = ResizeBitCast(d_to_u8, v);
// Zero the upper bytes which were not present/valid in d_from.
const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
}
#else // target that uses fixed-size vectors
// Truncating or same-size resizing cast: same as ResizeBitCast
template <size_t kFromVectSize, size_t kToVectSize,
class DTo,
class DFrom,
HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize>
/* from_size_tag */,
hwy::SizeTag<kToVectSize>
/* to_size_tag */, DTo d_to, DFrom /*d_from*/,
VFromD<DFrom> v) {
return ResizeBitCast(d_to, v);
}
// Resizing cast to vector that has twice the number of lanes of the source
// vector
// ZeroExtendVector fills the upper half with zero, which is exactly the
// required zero-extension.
template <size_t kFromVectSize, size_t kToVectSize,
class DTo,
class DFrom,
HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize>
/* from_size_tag */,
hwy::SizeTag<kToVectSize>
/* to_size_tag */, DTo d_to, DFrom d_from,
VFromD<DFrom> v) {
const Twice<decltype(d_from)> dt_from;
return BitCast(d_to, ZeroExtendVector(dt_from, v));
}
// Resizing cast to vector that has more than twice the number of lanes of the
// source vector
// Resize to the target size, then keep only the first kNumOfFromLanes source
// lanes and zero the rest.
template <size_t kFromVectSize, size_t kToVectSize,
class DTo,
class DFrom,
HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
hwy::SizeTag<kFromVectSize>
/* from_size_tag */,
hwy::SizeTag<kToVectSize>
/* to_size_tag */, DTo d_to, DFrom /*d_from*/,
VFromD<DFrom> v) {
using TFrom = TFromD<DFrom>;
constexpr size_t kNumOfFromLanes = kFromVectSize /
sizeof(TFrom);
const Repartition<TFrom, decltype(d_to)> d_resize_to;
return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
ResizeBitCast(d_resize_to, v)));
}
#endif // HWY_HAVE_SCALABLE
}
// namespace detail
#endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
// Bit-casts v to the vector type of d_to, zero-extending (or truncating) to
// d_to's byte size. Dispatches on the two vector byte sizes via SizeTag to the
// detail overloads above.
template <
class DTo,
class DFrom>
HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
VFromD<DFrom> v) {
return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
hwy::SizeTag<d_to.MaxBytes()>(), d_to,
d_from, v);
}
// ------------------------------ SafeFillN

// Stores `value` to to[0, num). On targets where vector memory ops might
// fault (HWY_MEM_OPS_MIGHT_FAULT) a scalar loop is used instead of a masked
// (blended) vector store.
template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  size_t i = 0;
  while (i < num) {
    to[i] = value;
    ++i;
  }
#else
  BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}
// ------------------------------ SafeCopyN

// Copies from[0, num) to to[0, num). Scalar loop on targets where masked
// memory ops might fault; otherwise a masked load plus blended store.
template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  size_t i = 0;
  while (i < num) {
    to[i] = from[i];
    ++i;
  }
#else
  const Mask<D> first_num = FirstN(d, num);
  BlendedStore(MaskedLoad(first_num, d, from), first_num, d, to);
#endif
}
// ------------------------------ MaskFalse
#if (
defined(HWY_NATIVE_MASK_FALSE) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif
template <
class D>
HWY_API Mask<D> MaskFalse(D d) {
return MaskFromVec(Zero(d));
}
#endif // HWY_NATIVE_MASK_FALSE
// ------------------------------ BitwiseIfThenElse
#if (
defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif
template <
class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
return Or(
And(mask, yes), AndNot(mask, no));
}
#endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE
// ------------------------------ PromoteMaskTo
#if (
defined(HWY_NATIVE_PROMOTE_MASK_TO) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif
template <
class DTo,
class DFrom>
HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
static_assert(
sizeof(TFromD<DTo>) >
sizeof(TFromD<DFrom>),
"sizeof(TFromD) must be greater than sizeof(TFromD)");
static_assert(
IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
"Mask must be the same type as Mask, DTo>>");
const RebindToSigned<decltype(d_to)> di_to;
const RebindToSigned<decltype(d_from)> di_from;
return MaskFromVec(BitCast(
d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}
#endif // HWY_NATIVE_PROMOTE_MASK_TO
// ------------------------------ DemoteMaskTo
#if (
defined(HWY_NATIVE_DEMOTE_MASK_TO) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif
template <
class DTo,
class DFrom>
HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
static_assert(
sizeof(TFromD<DTo>) <
sizeof(TFromD<DFrom>),
"sizeof(TFromD) must be less than sizeof(TFromD)");
static_assert(
IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
"Mask must be the same type as Mask, DTo>>");
const RebindToSigned<decltype(d_to)> di_to;
const RebindToSigned<decltype(d_from)> di_from;
return MaskFromVec(
BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}
#endif // HWY_NATIVE_DEMOTE_MASK_TO
// ------------------------------ CombineMasks
// Fixed: rejoined the broken multi-line #if onto a single logical line.
#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMBINE_MASKS
#undef HWY_NATIVE_COMBINE_MASKS
#else
#define HWY_NATIVE_COMBINE_MASKS
#endif

#if HWY_TARGET != HWY_SCALAR
// Concatenates the half-width masks lo (lower half) and hi (upper half) into
// a full mask for d.
template <class D>
HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
  const Half<decltype(d)> dh;
  return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
}
#endif

#endif  // HWY_NATIVE_COMBINE_MASKS
// ------------------------------ LowerHalfOfMask
#if (
defined(HWY_NATIVE_LOWER_HALF_OF_MASK) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif
template <
class D>
HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
const Twice<decltype(d)> dt;
return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
}
#endif // HWY_NATIVE_LOWER_HALF_OF_MASK
// ------------------------------ UpperHalfOfMask
// Fixed: rejoined the broken multi-line #if onto a single logical line.
#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
#else
#define HWY_NATIVE_UPPER_HALF_OF_MASK
#endif

#if HWY_TARGET != HWY_SCALAR
// Returns the upper half of mask m (defined for Twice<D>) as a mask for d.
template <class D>
HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
}
#endif

#endif  // HWY_NATIVE_UPPER_HALF_OF_MASK
// ------------------------------ OrderedDemote2MasksTo
// Fixed: the first line of the #if lacked a backslash continuation; also
// restored the garbled static_assert messages.
#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#else
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#endif

#if HWY_TARGET != HWY_SCALAR
// Demotes masks a and b (for d_from) into one mask for d_to, with a's lanes
// in the lower half and b's in the upper half, in order.
template <class DTo, class DFrom>
HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
                                        Mask<DFrom> b) {
  static_assert(sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
                "sizeof(TFromD<DTo>) must be equal to "
                "sizeof(TFromD<DFrom>) / 2");
  static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
                "Mask<DTo> must be the same type as "
                "Mask<Repartition<TFromD<DTo>, DFrom>>");
  // Demote in the signed domain so all-ones mask lanes saturate to all-ones.
  const RebindToSigned<decltype(d_from)> di_from;
  const RebindToSigned<decltype(d_to)> di_to;
  const auto va = BitCast(di_from, VecFromMask(d_from, a));
  const auto vb = BitCast(di_from, VecFromMask(d_from, b));
  return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
}
#endif

#endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
// ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
// Fixed: rejoined the broken multi-line #if onto a single logical line.
#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
#undef HWY_NATIVE_INTERLEAVE_WHOLE
#else
#define HWY_NATIVE_INTERLEAVE_WHOLE
#endif

#if HWY_TARGET != HWY_SCALAR
// Interleaves lanes from the lower halves of a and b across the whole vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveLower(d, a, b);
}

// Interleaves lanes from the upper halves of a and b across the whole vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveUpper(d, a, b);
}

// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
// is implemented in x86_256-inl.h.
// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
// implemented in x86_512-inl.h.
// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
// is implemented in wasm_256-inl.h.
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_INTERLEAVE_WHOLE
#if HWY_TARGET != HWY_SCALAR
// The InterleaveWholeLower without the optional D parameter is generic for all
// vector lengths.
// Reconstructs the tag from V and forwards to the tag-taking overload.
template <
class V>
HWY_API V InterleaveWholeLower(V a, V b) {
return InterleaveWholeLower(DFromV<V>(), a, b);
}
#endif // HWY_TARGET != HWY_SCALAR
// ------------------------------ AddSub
// One-lane special case: lane 0 is an even lane, so only the subtraction
// applies.
template <
class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) {
// AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
return Sub(a, b);
}
// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
// SSSE3/SSE4/AVX2/AVX3
// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
// AVX2/AVX3
// Generic path: subtract b in even lanes, add b in odd lanes, implemented by
// negating the even lanes of b and then adding.
template <
class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
hwy::IsFloat3264<TFromV<V>>())
? 32
:
sizeof(TFromV<V>)))>
HWY_API V AddSub(V a, V b) {
using D = DFromV<decltype(a)>;
using T = TFromD<D>;
// Unsigned lanes are negated in the signed domain (same bit pattern).
using TNegate =
If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
const D d;
const Rebind<TNegate, D> d_negate;
// Negate the even lanes of b
const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
return Add(a, negated_even_b);
}
// ------------------------------ MaskedAddOr etc.
#if (
defined(HWY_NATIVE_MASKED_ARITH) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif
template <
class V,
class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
return IfThenElse(m, Min(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
return IfThenElse(m, Max(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
return IfThenElse(m, Add(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
return IfThenElse(m, Sub(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
return IfThenElse(m, Mul(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
return IfThenElse(m, Div(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedModOr(V no, M m, V a, V b) {
return IfThenElse(m, Mod(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
return IfThenElse(m, SaturatedAdd(a, b), no);
}
template <
class V,
class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
return IfThenElse(m, SaturatedSub(a, b), no);
}
#endif // HWY_NATIVE_MASKED_ARITH
// ------------------------------ IfNegativeThenNegOrUndefIfZero
#if (
defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#else
#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#endif
template <
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
// MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
const auto zero = Zero(DFromV<V>());
return MaskedSubOr(v, Lt(mask, zero), zero, v);
#else
return IfNegativeThenElse(mask, Neg(v), v);
#endif
}
#endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
// Float overload: the result has v's magnitude and a sign equal to
// sign(mask) XOR sign(v), i.e. v is negated wherever mask is negative.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
  const V xor_sign = Xor(mask, v);
  return CopySign(v, xor_sign);
}
// ------------------------------ SaturatedNeg
#if (
defined(HWY_NATIVE_SATURATED_NEG_8_16_32) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif
template <
class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedNeg(V v) {
const DFromV<decltype(v)> d;
return SaturatedSub(Zero(d), v);
}
template <
class V, HWY_IF_I32(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
const DFromV<decltype(v)> d;
#if HWY_TARGET == HWY_RVV || \
(HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
(HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
// RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
return SaturatedSub(Zero(d), v);
#else
// ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
// (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
// -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
// ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
return Sub(
Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
#endif
}
#endif // HWY_NATIVE_SATURATED_NEG_8_16_32
#if (
defined(HWY_NATIVE_SATURATED_NEG_64) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif
template <
class V, HWY_IF_I64(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
#if HWY_TARGET == HWY_RVV || \
(HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
// RVV/NEON/SVE have native I64 SaturatedSub instructions
const DFromV<decltype(v)> d;
return SaturatedSub(Zero(d), v);
#else
const auto neg_v = Neg(v);
return Add(neg_v, BroadcastSignBit(
And(v, neg_v)));
#endif
}
#endif // HWY_NATIVE_SATURATED_NEG_64
// ------------------------------ SaturatedAbs
#if (
defined(HWY_NATIVE_SATURATED_ABS) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif
template <
class V, HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedAbs(V v) {
return Max(v, SaturatedNeg(v));
}
#endif
// ------------------------------ Reductions
// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
// they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
// Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
// SumOfLanes overloads. For the latter group, we here define the remaining
// overloads, plus ReduceSum which uses them plus GetLane.
#if (
defined(HWY_NATIVE_REDUCE_SCALAR) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif
namespace detail {
// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
struct AddFunc {
template <
class V>
V
operator()(V a, V b)
const {
return Add(a, b);
}
};
struct MinFunc {
template <
class V>
V
operator()(V a, V b)
const {
return Min(a, b);
}
};
struct MaxFunc {
template <
class V>
V
operator()(V a, V b)
const {
return Max(a, b);
}
};
// No-op for vectors of at most one block.
template <
class D,
class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
return v;
}
// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
// WASM_EMU256. AVX3 has its own overload.
template <
class D,
class Func, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D
/*d*/, Func f, VFromD<D> v) {
// Afterwards, each lane holds f of itself and its other-block counterpart.
return f(v, SwapAdjacentBlocks(v));
}
// These return the reduction result broadcasted across all lanes. They assume
// the caller has already reduced across blocks.
// 2 lanes per block: a single Reverse2 pairs each lane with its neighbor.
template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
return f(v10, Reverse2(d, v10));
}
// 4 lanes per block: log2(4) = 2 reduce steps (Reverse4, then Reverse2).
template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
const VFromD<D> v0123 = Reverse4(d, v3210);
const VFromD<D> v03_12_12_03 = f(v3210, v0123);
const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
return f(v03_12_12_03, v12_03_03_12);
}
// 8 lanes per block: log2(8) = 3 reduce steps (Reverse8/4/2).
template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
// The upper half is reversed from the lower half; omit for brevity.
const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
const VFromD<D> v0347_1625_1625_0347 =
f(v34_25_16_07, Reverse4(d, v34_25_16_07));
return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
}
// 16 u8 lanes per block: split each u16 pair into its low (even) and high
// (odd) byte, reduce in the wide domain, then duplicate the result byte.
template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
const RepartitionToWide<decltype(d)> dw;
using VW = VFromD<decltype(dw)>;
const VW vw = BitCast(dw, v);
// f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
const VW even =
And(vw, Set(dw, 0xFF));  // zero-extends the low byte of each u16
const VW odd = ShiftRight<8>(vw);
const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
return DupEven(BitCast(d, reduced));
#else
return DupOdd(BitCast(d, reduced));
#endif
}
// 16 i8 lanes per block: same as the u8 overload but sign-extends the even
// bytes via shift-left-then-arithmetic-shift-right.
template <
class D,
class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
const RepartitionToWide<decltype(d)> dw;
using VW = VFromD<decltype(dw)>;
const VW vw = BitCast(dw, v);
// Sign-extend
// f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
const VW odd = ShiftRight<8>(vw);
const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
return DupEven(BitCast(d, reduced));
#else
return DupOdd(BitCast(d, reduced));
#endif
}
}
// namespace detail
// Broadcasts the sum of all lanes into every lane.
template <class D, HWY_IF_SUM_OF_LANES_D(D)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  const detail::AddFunc add;
  const VFromD<D> per_block = detail::ReduceAcrossBlocks(d, add, v);
  return detail::ReduceWithinBlocks(d, add, per_block);
}

// Broadcasts the minimum of all lanes into every lane.
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
  const detail::MinFunc take_min;
  const VFromD<D> per_block = detail::ReduceAcrossBlocks(d, take_min, v);
  return detail::ReduceWithinBlocks(d, take_min, per_block);
}

// Broadcasts the maximum of all lanes into every lane.
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
  const detail::MaxFunc take_max;
  const VFromD<D> per_block = detail::ReduceAcrossBlocks(d, take_max, v);
  return detail::ReduceWithinBlocks(d, take_max, per_block);
}

// Scalar reductions: broadcast via *OfLanes, then extract lane 0.
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  return GetLane(SumOfLanes(d, v));
}

template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  return GetLane(MinOfLanes(d, v));
}

template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  return GetLane(MaxOfLanes(d, v));
}
#endif // HWY_NATIVE_REDUCE_SCALAR
// Corner cases for both generic and native implementations:
// N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
// With a single lane, every reduction is the identity: return lane 0 / v.
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceSum(D
/*d*/, VFromD<D> v) {
return GetLane(v);
}
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMin(D
/*d*/, VFromD<D> v) {
return GetLane(v);
}
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMax(D
/*d*/, VFromD<D> v) {
return GetLane(v);
}
// Broadcasting variants: a one-lane vector is already the broadcast result.
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D
/* tag */, VFromD<D> v) {
return v;
}
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D
/* tag */, VFromD<D> v) {
return v;
}
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D
/* tag */, VFromD<D> v) {
return v;
}
// N=4 for 8-bit is still less than the minimum native size.
// ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
// ReduceSum operations
#if (
defined(HWY_NATIVE_REDUCE_SUM_4_UI8) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif
template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
const Twice<RepartitionToWide<decltype(d)>> dw;
return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
}
#endif // HWY_NATIVE_REDUCE_SUM_4_UI8
// RVV/SVE have target-specific implementations of the N=4 I8/U8
// ReduceMin/ReduceMax operations
#if (
defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#else
#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
#endif
template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
const Twice<RepartitionToWide<decltype(d)>> dw;
return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
}
template <
class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
const Twice<RepartitionToWide<decltype(d)>> dw;
return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
}
#endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
// ------------------------------ IsInf, IsFinite
// AVX3 has target-specific implementations of these.
#if (
defined(HWY_NATIVE_ISINF) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif
template <
class V,
class D = DFromV<V>>
HWY_API MFromD<D> IsInf(
const V v) {
using T = TFromD<D>;
const D d;
const RebindToUnsigned<decltype(d)> du;
const VFromD<decltype(du)> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
return RebindMask(
d,
Eq(Add(vu, vu),
Set(du,
static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
}
// Returns whether normal/subnormal/zero.
template <
class V,
class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(
const V v) {
using T = TFromD<D>;
const D d;
const RebindToUnsigned<decltype(d)> du;
const RebindToSigned<decltype(d)> di;
// cheaper than unsigned comparison
const VFromD<decltype(du)> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
// for AVX2 if we instead add vu + vu.
#if HWY_COMPILER_MSVC
const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
#else
const VFromD<decltype(du)> shl = Add(vu, vu);
#endif
// Then shift right so we can compare with the max exponent (cannot compare
// with MaxExponentTimes2 directly because it is negative and non-negative
// floats would be greater).
const VFromD<decltype(di)> exp =
BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
#endif // HWY_NATIVE_ISINF
// ------------------------------ LoadInterleaved2
#if HWY_IDE || \
(
defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) ==
defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif
template <
class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API
void LoadInterleaved2(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1) {
const VFromD<D> A = LoadU(d, unaligned);
// v1[1] v0[1] v1[0] v0[0]
const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
v0 = ConcatEven(d, B, A);
v1 = ConcatOdd(d, B, A);
}
template <
class D, HWY_IF_LANES_D(D, 1)>
HWY_API
void LoadInterleaved2(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
}
// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
namespace detail {
#if HWY_IDE
// IDE-only stubs that satisfy name lookup while browsing; the real
// definitions come from target-specific ops headers.
template <
class V>
HWY_INLINE V ShuffleTwo1230(V a, V
/* b */) {
return a;
}
template <
class V>
HWY_INLINE V ShuffleTwo2301(V a, V
/* b */) {
return a;
}
template <
class V>
HWY_INLINE V ShuffleTwo3012(V a, V
/* b */) {
return a;
}
#endif // HWY_IDE
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
// Loads three consecutive full vectors A, B, C (elements still interleaved).
template <
class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE
void LoadTransposedBlocks3(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& A, VFromD<D>& B,
VFromD<D>& C) {
constexpr size_t kN = MaxLanes(d);
A = LoadU(d, unaligned + 0 * kN);
B = LoadU(d, unaligned + 1 * kN);
C = LoadU(d, unaligned + 2 * kN);
}
}
// namespace detail
// 16 lanes per block: de-interleave via three table lookups per output; the
// three partial results have disjoint nonzero lanes, so Xor3 merges them.
template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API
void LoadInterleaved3(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
const RebindToUnsigned<decltype(d)> du;
using V = VFromD<D>;
using VU = VFromD<decltype(du)>;
// Compact notation so these fit on one line: 12 := v1[2].
V A;
// 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
V B;
// 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
V C;
// 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
// Compress all lanes belonging to v0 into consecutive lanes.
// Z (0x80) makes TableLookupBytesOr0 emit a zero byte for that position.
constexpr uint8_t Z = 0x80;
const VU idx_v0A =
Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
const VU idx_v0B =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
const VU idx_v0C =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
const VU idx_v1A =
Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
const VU idx_v1B =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
const VU idx_v1C =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
const VU idx_v2A =
Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
const VU idx_v2B =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
const VU idx_v2C =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
// Merge the three disjoint partial results (XOR acts as OR here).
v0 = Xor3(v0L, v0M, v0U);
v1 = Xor3(v1L, v1M, v1U);
v2 = Xor3(v2L, v2M, v2U);
}
// 8-bit lanes x8
// Same scheme as the 16-lane overload: three table lookups per output vector,
// merged with Xor3 because the partial results have disjoint nonzero lanes.
template <
class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API
void LoadInterleaved3(D d,
const TFromD<D>* HWY_RESTRICT unaligned,
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
const RebindToUnsigned<decltype(d)> du;
using V = VFromD<D>;
using VU = VFromD<decltype(du)>;
V A;
// v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
V B;
// v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
V C;
// v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
// Compress all lanes belonging to v0 into consecutive lanes.
// Z (0x80) makes TableLookupBytesOr0 emit a zero byte for that position.
constexpr uint8_t Z = 0x80;
const VU idx_v0A =
Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v0B =
Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v0C =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v1A =
Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v1B =
Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v1C =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v2A =
Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v2B =
Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
const VU idx_v2C =
Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
// Merge the three disjoint partial results (XOR acts as OR here).
v0 = Xor3(v0L, v0M, v0U);
v1 = Xor3(v1L, v1M, v1U);
v2 = Xor3(v2L, v2M, v2U);
}
// 16-bit lanes x8: same approach as the 8-bit case above, but each 16-bit
// lane is addressed via two byte indices in the shuffle tables.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(du)> du8;
  using V = VFromD<D>;
  using VU8 = VFromD<decltype(du8)>;
  // Transposed input blocks; lane 0 is on the right in these diagrams.
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains a byte index for a byte of a lane.
  // Z (0x80) zeroes the byte so partial results can be merged via Xor3.
  constexpr uint8_t Z = 0x80;
  const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
                                          0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
                                          0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
  const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          Z, 0x04, 0x05, 0x0A, 0x0B);
  const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
                                          0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
                                          0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
  const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
                                          Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
                                          0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
  // L/M/U are the contributions of blocks A/B/C to each output vector.
  const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
  // Xor3 is equivalent to OR here because the non-zero lanes are disjoint.
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}
// Four lanes per 128-bit block: de-interleaves triples using shuffles and
// OddEven blends instead of byte tables.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  using V = VFromD<D>;
  // Transposed input blocks; lane 0 is on the right in these diagrams.
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // v0: lanes 0/1 come from A; v0[2] (even lane of B) and v0[3] (odd lane of
  // C) are blended together, then placed by the two-input shuffle.
  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);
  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);
  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
}
// Two lanes per 128-bit block.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  // Transposed input blocks; lane 0 is on the right in these diagrams.
  VFromD<D> A;  // v1[0] v0[0]
  VFromD<D> B;  // v0[1] v2[0]
  VFromD<D> C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Even lane from A (v0[0]), odd lane from B (v0[1]).
  v0 = OddEven(B, A);
  // Shifting the C:A concatenation right by one lane yields v1[0] (upper lane
  // of A) in the lower lane and v1[1] (lower lane of C) in the upper lane.
  v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
  // Even lane from B (v2[0]), odd lane from C (v2[1]).
  v2 = OddEven(C, B);
}
// Any T x1: a single-lane vector simply loads the three consecutive elements.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const T* pos = unaligned;
  v0 = LoadU(d, pos++);
  v1 = LoadU(d, pos++);
  v2 = LoadU(d, pos);
}
// ------------------------------ LoadInterleaved4
namespace detail {
// Loads four consecutive full vectors ("transposed blocks") starting at
// `unaligned`. Default for <= 128-bit vectors; x86_256 and x86_512 have their
// own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks4(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& vA, VFromD<D>& vB,
                                      VFromD<D>& vC, VFromD<D>& vD) {
  constexpr size_t kLanes = MaxLanes(d);
  const TFromD<D>* in = unaligned;
  vA = LoadU(d, in);
  in += kLanes;
  vB = LoadU(d, in);
  in += kLanes;
  vC = LoadU(d, in);
  in += kLanes;
  vD = LoadU(d, in);
}
}
// namespace detail
// 16 lanes per 128-bit block (8-bit lanes): de-interleaves 4-tuples via three
// levels of interleaving plus a final 64-bit-wide interleave.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  using V = VFromD<D>;
  // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V vA;  // int[13..10] int[3..0]
  V vB;  // int[17..14] int[7..4]
  V vC;  // int[1b..18] int[b..8]
  V vD;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, vA, vB);  // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, vC, vD);  // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, vA, vB);  // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, vC, vD);  // int[f,b,e,a]
  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]
  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]
  // Final 64-bit interleave assembles whole de-interleaved outputs.
  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}
// Eight lanes per 128-bit block: de-interleaves 4-tuples via two interleave
// levels plus a final half-block-wide interleave.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  // In the last step, we interleave by half of the block size, which is usually
  // 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  VFromD<D> vA;  // v3210[9]v3210[8] v3210[1]v3210[0]
  VFromD<D> vB;  // v3210[b]v3210[a] v3210[3]v3210[2]
  VFromD<D> vC;  // v3210[d]v3210[c] v3210[5]v3210[4]
  VFromD<D> vD;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  const VFromD<D> va820 = InterleaveLower(d, vA, vB);  // v3210[a,8] v3210[2,0]
  const VFromD<D> vec64 = InterleaveLower(d, vC, vD);  // v3210[e,c] v3210[6,4]
  const VFromD<D> vb931 = InterleaveUpper(d, vA, vB);  // v3210[b,9] v3210[3,1]
  const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD);  // v3210[f,d] v3210[7,5]
  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));
  // Final wide interleave produces the de-interleaved outputs.
  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}
// Four lanes per 128-bit block: two levels of interleaving suffice.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  using V = VFromD<D>;
  // Transposed input blocks; lane 0 is on the right in these diagrams.
  V vA;  // v3210[4] v3210[0]
  V vB;  // v3210[5] v3210[1]
  V vC;  // v3210[6] v3210[2]
  V vD;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  // e/o suffix: contributions from the even/odd-numbered 4-tuples.
  const V v10e = InterleaveLower(d, vA, vC);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10o = InterleaveLower(d, vB, vD);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32e = InterleaveUpper(d, vA, vC);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32o = InterleaveUpper(d, vB, vD);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
  v0 = InterleaveLower(d, v10e, v10o);
  v1 = InterleaveUpper(d, v10e, v10o);
  v2 = InterleaveLower(d, v32e, v32o);
  v3 = InterleaveUpper(d, v32e, v32o);
}
// Two lanes per 128-bit block: a single interleave level suffices.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  VFromD<D> blk0, blk1, blk2, blk3;
  detail::LoadTransposedBlocks4(d, unaligned, blk0, blk1, blk2, blk3);
  v0 = InterleaveLower(d, blk0, blk2);
  v2 = InterleaveLower(d, blk1, blk3);
  v1 = InterleaveUpper(d, blk0, blk2);
  v3 = InterleaveUpper(d, blk1, blk3);
}
// Any T x1: a single-lane vector simply loads the four consecutive elements.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const T* pos = unaligned;
  v0 = LoadU(d, pos++);
  v1 = LoadU(d, pos++);
  v2 = LoadU(d, pos++);
  v3 = LoadU(d, pos);
}
// ------------------------------ StoreInterleaved2
namespace detail {
// Stores two full vectors to consecutive locations starting at `unaligned`.
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kLanes = MaxLanes(d);
  TFromD<D>* out = unaligned;
  StoreU(A, d, out);
  StoreU(B, d, out + kLanes);
}
}
// namespace detail
// >= 128 bit vector: interleave the lower/upper halves, then store both
// resulting blocks consecutively.
template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const auto lo = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
  const auto hi = InterleaveUpper(d, v0, v1);  // .. v1[kN/2] v0[kN/2]
  detail::StoreTransposedBlocks2(lo, hi, d, unaligned);
}
// <= 64 bits: zero-extend both parts to a double-sized vector so that a
// single interleave and store suffice.
template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API void StoreInterleaved2(V part0, V part1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<decltype(d)> d2;
  const auto ext0 = ZeroExtendVector(d2, part0);
  const auto ext1 = ZeroExtendVector(d2, part1);
  StoreU(InterleaveLower(d2, ext0, ext1), d2, unaligned);
}
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)
namespace detail {
// Stores three full vectors to consecutive locations starting at `unaligned`.
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
                                       D d, TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kLanes = MaxLanes(d);
  TFromD<D>* out = unaligned;
  StoreU(A, d, out);
  StoreU(B, d, out + kLanes);
  StoreU(C, d, out + 2 * kLanes);
}
}
// namespace detail
// >= 128-bit vector, 8-bit lanes: interleaves v0..v2 into triples and stores
// three consecutive output blocks.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // Added to the shuffle indices to advance to the lanes of the next output
  // block (each block consumes 16/3 = 5.33 input lanes, hence 5 and 6).
  const auto k5 = Set(du, TU{5});
  const auto k6 = Set(du, TU{6});
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  const VFromD<decltype(du)> shuf_A0 =
      Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
                          0x80, 0x80, 4, 0x80, 0x80, 5);
  // Cannot reuse shuf_A0 because it contains 5.
  const VFromD<decltype(du)> shuf_A1 =
      Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
                          3, 0x80, 0x80, 4, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  // cannot reuse shuf_A0 (has 5)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);
  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, B0 | B1 | B2);
  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
  const auto shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const auto shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const auto shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, C0 | C1 | C2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 16-bit lanes: same scheme as the 8-bit case, with byte
// pairs addressing each 16-bit lane.
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  // Byte offsets equal to two resp. three 16-bit lanes, added to the shuffle
  // indices to advance to the next output block.
  const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending. Note that these are byte
  // indices for 16-bit lanes.
  const VFromD<decltype(du8)> shuf_A1 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
                          0x80, 0x80, 0x80, 0x80, 4, 5);
  const VFromD<decltype(du8)> shuf_A2 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
                          0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);
  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const auto shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const auto shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, B0 | B1 | B2);
  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_C0 = shuf_B1 + k3;  // ..7..6..
  const auto shuf_C1 = shuf_B2 + k3;  // .7..6..5
  const auto shuf_C2 = shuf_B0 + k2;  // 7..6..5.
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, C0 | C1 | C2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 32-bit lanes: builds each output block from OddEven
// blends combined pairwise via wide (64-bit) interleaves.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const VFromD<D> A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
  // Shifted copies bring v1[1..3] resp. v0[2..3] down for the OddEven blends.
  const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
  const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
  const VFromD<D> v21_v11 = OddEven(v2, v1_321);
  const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const VFromD<D> B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
  const VFromD<D> v23_v13 = OddEven(v2, v1_321);
  const VFromD<D> v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const VFromD<D> C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 64-bit lanes: with two lanes per block, each output
// block is a single interleave/blend of two inputs.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const VFromD<D> blk0 = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
  const VFromD<D> blk1 = OddEven(v0, v2);             // v0[1] v2[0]
  const VFromD<D> blk2 = InterleaveUpper(d, v1, v2);  // v2[1] v1[1]
  detail::StoreTransposedBlocks3(blk0, blk1, blk2, d, unaligned);
}
// 64-bit vector, 8-bit lanes: widens the three 64-bit parts to full 128-bit
// vectors, interleaves, then stores one full plus one half vector (24 bytes).
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and first result.
  constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
  const Full128<uint8_t> du;
  const Full128<TFromD<D>> d_full;
  // Added to the shuffle indices to advance to the next output vector.
  const auto k5 = Set(du, uint8_t{5});
  const auto k6 = Set(du, uint8_t{6});
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = Load(du, tbl_v1);
  // cannot reuse shuf_A0 (5 in MSB)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * kFullN);
  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  // Narrow back to the part size; only the lower half is stored.
  const VFromD<D> B{BitCast(d_full, B0 | B1 | B2).raw};
  StoreU(B, d, unaligned + 1 * kFullN);
}
// 64-bit vector, 16-bit lanes: widens the parts to a full vector, interleaves,
// then stores one full plus one half vector (12 lanes total).
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D dh,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<D> d_full;
  const Full128<uint8_t> du8;
  // Byte offsets equal to two resp. three 16-bit lanes, added to the shuffle
  // indices to advance to the second output vector.
  const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vectors will be named A, B; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned);
  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // ..3.
  const auto shuf_B1 = shuf_A2 + k3;  // .3..
  const auto shuf_B2 = shuf_A0 + k2;  // 3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<decltype(d_full)> B = BitCast(d_full, B0 | B1 | B2);
  // Only the lower half of B is valid output; store it via the half-tag dh.
  StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
}
// 64-bit vector, 32-bit lanes (same code as 128-bit vector, 64-bit lanes):
// each of the three output blocks is a single interleave/blend.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  TFromD<D>* out = unaligned;
  StoreU(InterleaveLower(d, v0, v1), d, out);           // v1[0] v0[0]
  StoreU(OddEven(v0, v2), d, out + kN);                 // v0[1] v2[0]
  StoreU(InterleaveUpper(d, v1, v2), d, out + 2 * kN);  // v2[1] v1[1]
}
// 64-bit lanes are handled by the N=1 case below.
// <= 32-bit vector, 8-bit lanes, more than one lane: interleave in a full
// vector, then copy only the valid 3*N bytes via a stack buffer.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du;
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
      0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = Load(du, tbl_v0);
  // Byte-rotations of shuf_A0 place v1/v2 bytes one resp. two positions up.
  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  // A full-vector store would write past the valid output, so bounce through
  // a buffer and copy only 3 * MaxBytes bytes.
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}
// 32-bit vector, 16-bit lanes: interleave in a full vector, then copy only
// the valid 3*N lanes via a stack buffer.
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du8;
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A2 =  // ..1..0..
      Load(du8, tbl_v2);
  const auto shuf_A1 =  // ...1..0.
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
  const auto shuf_A0 =  // ....1..0
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  // A full-vector store would write past the valid output, so bounce through
  // a buffer and copy only 3 * MaxBytes bytes.
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}
// Single-element vector, any lane size: three consecutive scalar stores.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  TFromD<D>* pos = unaligned;
  StoreU(v0, d, pos++);
  StoreU(v1, d, pos++);
  StoreU(v2, d, pos);
}
// ------------------------------ StoreInterleaved4
namespace detail {
// Stores four full vectors to consecutive locations starting at `unaligned`.
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
                                       VFromD<D> vD, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kLanes = MaxLanes(d);
  TFromD<D>* out = unaligned;
  StoreU(vA, d, out);
  out += kLanes;
  StoreU(vB, d, out);
  out += kLanes;
  StoreU(vC, d, out);
  out += kLanes;
  StoreU(vD, d, out);
}
}
// namespace detail
// >= 128-bit vector, 8..32-bit lanes: zip pairs into wide lanes, then a wide
// interleave produces the 4-tuples.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
  const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
  const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
  const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}
// >= 128-bit vector, 64-bit lanes: with two lanes per block, each output
// block is a single interleave of two inputs.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const VFromD<D> blk0 = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
  const VFromD<D> blk1 = InterleaveLower(d, v2, v3);
  const VFromD<D> blk2 = InterleaveUpper(d, v0, v1);
  const VFromD<D> blk3 = InterleaveUpper(d, v2, v3);
  detail::StoreTransposedBlocks4(blk0, blk1, blk2, blk3, d, unaligned);
}
// 64-bit vector, 8..32-bit lanes
template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API
void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
VFromD<D> part2, VFromD<D> part3, D
/* tag */,
TFromD<D>* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<TFromD<D>> d_full;
const RepartitionToWide<decltype(d_full)> dw;
const VFromD<decltype(d_full)> v0{part0.raw};
const VFromD<decltype(d_full)> v1{part1.raw};
const VFromD<decltype(d_full)> v2{part2.raw};
--> --------------------
--> maximum size reached
--> --------------------