// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.
#include <wasm_simd128.h>
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#ifdef HWY_WASM_OLD_NAMES
// Map the current (standardized) wasm intrinsic names to the older spellings
// provided by pre-standardization toolchains, so the rest of this file can use
// the modern names unconditionally.
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
// Fixed typo: was "wasm_i62x2_trunc_sat_f64x2", which left the actual
// wasm_i64x2_trunc_sat_f64x2 intrinsic name unmapped on old toolchains.
#define wasm_i64x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#if HWY_TARGET == HWY_WASM_EMU256
// 32-byte descriptor, only used by the emulated 256-bit target.
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif
namespace detail {

// Maps a lane type T to the raw wasm vector type holding it. All integer
// widths share the generic __v128_u; float and double have distinct raw types.
template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};
template <>
struct Raw128<double> {
  using type = __f64x2;
};

}  // namespace detail
// Wrapper over the raw wasm vector type; N is the number of lanes in use.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};
// Aliases for vectors narrower than 128 bits; lane count scales with type size.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  typename detail::Raw128<T>::type raw;
};
// Recover the tag type from a vector/mask, and the lane type from a vector.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;
// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  // All-zero bits are the same for any integer lane width.
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f64x2_splat(0.0)};
}

// Vector type corresponding to tag D.
template <class D>
using VFromD = decltype(Zero(D()));
// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"
// ------------------------------ BitCast

namespace detail {

// Conversions to the generic integer vector type are no-ops or raw casts.
HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
  return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
  return static_cast<__v128_u>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __f64x2 operator()(__v128_u v) { return static_cast<__f64x2>(v); }
};

template <class D>
HWY_INLINE VFromD<D> BitCastFromByte(D d, Vec128<uint8_t, d.MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail
// Reinterprets the bits of a same-size vector as lanes of type TFromD<D>.
template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
// ------------------------------ ResizeBitCast

// Bit-casts between possibly differently-sized (<= 16 byte) vectors.
template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8_to;
  return BitCast(d, VFromD<decltype(du8_to)>{detail::BitCastToInteger(v.raw)});
}
// ------------------------------ Set

// Broadcasts t to all lanes. Integer lanes use the signed splat intrinsics;
// the resulting bit pattern is identical for unsigned lane types.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}
// float16/bfloat16: splat the 16-bit representation as an integer.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_f32x4_splat(t)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_f64x2_splat(t)};
}
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with unspecified contents; here simply zero-initialized.
// For all vector sizes.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)
// Returns first, first + 1, ... (with lane-type wraparound). For all vector
// sizes.
template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  HWY_ALIGN T lanes[MaxLanes(d)];
  const size_t num = MaxLanes(d);
  for (size_t idx = 0; idx < num; ++idx) {
    lanes[idx] = AddWithWraparound(static_cast<T>(first), idx);
  }
  return Load(d, lanes);
}
// ------------------------------ Dup128VecFromValues
template <
class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
TFromD<D> t11, TFromD<D> t12,
TFromD<D> t13, TFromD<D> t14,
TFromD<D> t15) {
return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
t11, t12, t13, t14, t15)};
}
template <
class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
TFromD<D> t11, TFromD<D> t12,
TFromD<D> t13, TFromD<D> t14,
TFromD<D> t15) {
return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
t11, t12, t13, t14, t15)};
}
template <
class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6,
TFromD<D> t7) {
return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
}
template <
class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6,
TFromD<D> t7) {
return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
}
template <
class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6,
TFromD<D> t7) {
const RebindToSigned<decltype(d)> di;
return BitCast(d,
Dup128VecFromValues(
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}
template <
class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3) {
return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)};
}
template <
class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3) {
return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)};
}
template <
class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3) {
return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)};
}
template <
class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1) {
return VFromD<D>{wasm_i64x2_make(t0, t1)};
}
template <
class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1) {
return VFromD<D>{wasm_u64x2_make(t0, t1)};
}
template <
class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D
/*d*/, TFromD<D> t0, TFromD<D> t1) {
return VFromD<D>{wasm_f64x2_make(t0, t1)};
}
// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned: wasm addition is bit-identical for signed/unsigned lanes.
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_add(a.raw, b.raw)};
}
// ------------------------------ Subtraction

// Unsigned: wasm subtraction is bit-identical for signed/unsigned lanes.
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_sub(a.raw, b.raw)};
}
// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}
// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}
// ------------------------------ Average

// Returns (a + b + 1) / 2 (only available for unsigned 8/16-bit lanes).
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}
// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_abs(v.raw)};
}
// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}

// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}

// 8-bit: no native 8-bit shifts, so shift as 16-bit lanes and mask off the
// bits that crossed into the neighboring byte.
template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  // kBits is a compile-time constant: for a shift of 1, v + v needs no mask.
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}
template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  // Logical shift, then restore the sign bits via XOR and subtract.
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  // HWY_MIN keeps the left-shift amount in range even when kBits == 0 (that
  // case returns above, but the template is still instantiated).
  return Or(ShiftRight<kBits>(v),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
// ------------------------------ Shift lanes by same variable #bits

// After https://reviews.llvm.org/D108415 shift argument became unsigned.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")

// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                          const int bits) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
                                          const int bits) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
                                           const int bits) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
                                          const int bits) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
                                           const int bits) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                         const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                          const int bits) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
                                         const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
                                          const int bits) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
                                         const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
                                          const int bits) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
}

// 8-bit: same 16-bit-shift-plus-mask approach as the constant-shift variants.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{
      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}
template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
                                          const int bits) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}
template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  // Logical shift, then restore the sign bits via XOR and subtract.
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ignore Wsign-conversion
HWY_DIAGNOSTICS(pop)
// ------------------------------ Minimum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // No u64 min instruction; compute per-lane on the scalar side.
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  // No i64 min instruction; compute per-lane on the scalar side.
  // Only two i64 lanes fit in 128 bits; was over-allocated as min[4], which
  // also differed from the matching Max implementation.
  alignas(16) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<double, N>{wasm_f64x2_pmin(b.raw, a.raw)};
}
// ------------------------------ Maximum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // No u64 max instruction; compute per-lane on the scalar side.
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  // No i64 max instruction; compute per-lane on the scalar side.
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<double, N>{wasm_f64x2_pmax(b.raw, a.raw)};
}
// ------------------------------ Integer multiplication

// Unsigned: low halves of the products are identical to the signed case.
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}
// Returns the upper 16 bits of a * b in each lane: widen-multiply both
// halves, then gather the odd (high) 16-bit pieces of the 32-bit products.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw);
  const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw);
  const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}

// Rounding, saturating Q15 fixed-point multiply.
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)};
}
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
// Signed 8/16-bit: each even (low, little-endian) narrow lane occupies the
// low half of a wide lane; shift up then arithmetic-shift back down to
// sign-extend it before multiplying.
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  constexpr int kSrcBits = sizeof(T) * 8;
  const auto extended_a =
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, a)));
  const auto extended_b =
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, b)));
  return extended_a * extended_b;
}
// Unsigned 8/16-bit: zero-extend the even lanes by masking off the upper
// (odd-lane) halves of each wide lane; the products then cannot interfere.
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  // LimitsMax<T>() in a wide lane keeps only the low (even-lane) half.
  const auto even_mask = Set(dw, LimitsMax<T>());
  const auto even_a = And(ResizeBitCast(dw, a), even_mask);
  const auto even_b = And(ResizeBitCast(dw, b), even_mask);
  return even_a * even_b;
}
// 32x32 -> 64-bit signed: sign-extend the even lanes into 64-bit lanes, then
// use the native 64-bit multiply.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  const auto raw_a = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw;
  const auto raw_b = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw;
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(raw_a, raw_b)};
}
// 32x32 -> 64-bit unsigned: zero the odd 32-bit lanes; the 64-bit product of
// the remaining 32-bit values cannot overflow.
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  const auto even_mask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto masked_a = wasm_v128_and(a.raw, even_mask);
  const auto masked_b = wasm_v128_and(b.raw, even_mask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(masked_a, masked_b)};
}
// Multiplies odd lanes (1, 3 ..) and returns the double-width result.
// The odd narrow lane is the high half of each wide lane, so a single
// ShiftRight moves it into place with sign/zero extension (per dw's type).
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  constexpr int kSrcBits = sizeof(T) * 8;
  const auto odd_a = ShiftRight<kSrcBits>(BitCast(dw, a));
  const auto odd_b = ShiftRight<kSrcBits>(BitCast(dw, b));
  return odd_a * odd_b;
}
// 32-bit odd lanes: extend into 64-bit lanes, then multiply natively.
template <class T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  const auto odd_a = ShiftRight<32>(BitCast(dw, a));
  const auto odd_b = ShiftRight<32>(BitCast(dw, b));
  return Vec128<MakeWide<T>, (N + 1) / 2>{
      wasm_i64x2_mul(odd_a.raw, odd_b.raw)};
}
// ------------------------------ Negate

// Float/special types: negation is a sign-bit flip.
template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

// Signed integers: native per-width negation instructions.
template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}
// ------------------------------ Floating-point mul / div

template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator*(Vec128<double, N> a, Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_mul(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_div(a.raw, b.raw)};
}

// "Approximate" reciprocal: implemented here as a full-precision divide.
template <typename T, size_t N>
HWY_API Vec128<T, N> ApproximateReciprocal(const Vec128<T, N> v) {
  return Set(DFromV<decltype(v)>(), T{1.0}) / v;
}

// Absolute difference. Integer overload defined in generic_ops-inl.h.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Abs(a - b);
}
// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add (separate multiply and add; not fused).
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return mul * x + add;
}

// Returns add - mul * x.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return add - mul * x;
}

// Returns mul * x - sub.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return mul * x - sub;
}

// Returns -(mul * x) - sub.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}
// ------------------------------ Floating-point square root

// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_sqrt(v.raw)};
}

// "Approximate" reciprocal square root: computed as exact 1 / sqrt(v).
template <typename T, size_t N>
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(const Vec128<T, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  return Set(DFromV<decltype(v)>(), T{1.0}) / Sqrt(v);
}
// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_nearest(v.raw)};
}

// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_ceil(v.raw)};
}

// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_floor(v.raw)};
}
// ------------------------------ Floating-point classification

// NaN is the only value that compares unequal to itself.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' (via doubling) to clear the sign bit; infinity then compares
  // equal to exponent=max, mantissa=0, i.e. MaxExponentTimes2.
  return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// Mask and Vec share the same raw representation (true = FF..FF), so the
// conversion is free.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

// Reinterprets a mask as one for a same-sized lane type.
template <typename TFrom, size_t NFrom, class DTo>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}

// True for lanes in which all the bits of `bit` are set in `v`.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}
// ------------------------------ Equality

// Unsigned (equality is sign-agnostic, so the i-variants suffice)
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_eq(a.raw, b.raw)};
}
// ------------------------------ Inequality

// Unsigned (inequality is sign-agnostic, so the i-variants suffice)
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ne(a.raw, b.raw)};
}
// ------------------------------ Strict inequality

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}

// Unsigned 8/16/32-bit have dedicated unsigned comparisons.
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
                                      const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
                                       const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
                                       const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
// Unsigned 64-bit: emulated via unsigned 32-bit halves because there is no
// u64x2 comparison here. For each 64-bit lane (32-bit positions [lo, hi]):
// result = hi_gt | (hi_eq & lo_gt).
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint32_t, decltype(d)> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, this is the answer.
  const auto m_gt = a32 > b32;
  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  // Broadcast each lane's low-half verdict into both 32-bit positions.
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}
// Float
template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(const Vec128<double, N> a,
                                     const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_gt(a.raw, b.raw)};
}

// Less-than is greater-than with the arguments swapped.
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>(b, a);
}
// ------------------------------ Weak inequality

// Float >=
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ge(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator>=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ge(a.raw, b.raw)};
}

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_ge(a.raw, b.raw)};
}
// u64: a >= b is the complement of b > a (which is itself emulated above).
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Not(b > a);
}

// Less-or-equal is greater-or-equal with the arguments swapped.
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>=(b, a);
}
// ------------------------------ FirstN (Iota, Lt)

// Mask with the first `num` lanes set (all set if num >= lane count).
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num)));
}
// ================================================== LOGICAL

// All bitwise ops use the single 128-bit v128 instructions, independent of
// lane type.

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  return Vec128<T, N>{wasm_v128_not(v.raw)};
}

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot
// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  // Operands are swapped because the intrinsic negates its second argument.
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
// Returns o | (a1 & a2).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
// Select using a vector (rather than a Mask) as the selector.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}
// ------------------------------ Operator overloads (internal-only if float)
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}
// ------------------------------ CopySign
// Returns the magnitude of `magn` with the sign of `sign`.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  // Sign bit from `sign`, all remaining bits from `magn`.
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs
// As CopySign, but only ORs in the sign bit - correct when `abs` is known to
// be non-negative (sign bit clear).
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}
// ------------------------------ BroadcastSignBit (compare)

// Arithmetic right shift replicates the sign bit into every bit of the lane.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
// 8-bit lanes: a signed compare against zero also yields all-ones for
// negative lanes.
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}
// ------------------------------ Mask

// Mask and Vec share a raw representation, so this conversion is free.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{v.raw};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// v < 0 ? yes : no, decided by the sign bit of each lane of v.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  // Spread the sign bit across the whole lane to form a full mask.
  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

// Returns v where v > 0, else 0 (note: v > 0 is false for NaN lanes too).
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
}
// ------------------------------ Mask logical

// Mask ops reuse the vector bitwise ops via the free mask<->vector
// conversions.
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const DFromM<decltype(m)> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// Returns ~a & ~b.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a,
                                       Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

// Per-lane variable left shift for 8-bit lanes. The 3-bit shift count is
// decomposed in binary: count bits 2, 1, 0 conditionally apply shifts of
// 4, 2, 1. Counts >= 8 produce unspecified (not UB) lane values.
template <
typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N>
operator<<(Vec128<T, N> v,
const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
// ShiftLeft<5> places count bit 2 at bit position 7 of the 8-bit lane.
test = ShiftLeft<5>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
// Per-lane variable left shift for 16-bit lanes: count bits 3..0 select
// conditional shifts of 8, 4, 2, 1 (see the 8-bit overload above for the
// rationale behind this bit-predication scheme).
template <
typename T, size_t N, HWY_IF_T_SIZE(T, 2),
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N>
operator<<(Vec128<T, N> v,
const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
// ShiftLeft<12> places count bit 3 at bit position 15 of the 16-bit lane.
test = ShiftLeft<12>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
// Per-lane variable left shift for 32-bit lanes: count bits 4..0 select
// conditional shifts of 16, 8, 4, 2, 1 (same scheme as the narrower
// overloads above).
template <
typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N>
operator<<(Vec128<T, N> v,
const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
// ShiftLeft<27> places count bit 4 at bit position 31 of the 32-bit lane.
test = ShiftLeft<27>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<16>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test);
// next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
// 64-bit lanes: shifted one lane at a time via scalar code (store, shift,
// reload). Each count is masked to 0..63 so the scalar shift is well-defined.
template <
typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N>
operator<<(Vec128<T, N> v,
const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
using TU = MakeUnsigned<T>;
// Unsigned lane type avoids UB from left-shifting negative values.
alignas(16) TU lanes[2] = {};
alignas(16) TU bits_lanes[2] = {};
Store(BitCast(du, v), du, lanes);
Store(BitCast(du, bits), du, bits_lanes);
lanes[0] <<= (bits_lanes[0] & 63);
lanes[1] <<= (bits_lanes[1] & 63);
return BitCast(d, Load(du, lanes));
}
// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
--> --------------------
--> maximum size reached
--> --------------------