Quelle emu128-inl.h Sprache: C

// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include "hwy/base.h"
#ifndef HWY_NO_LIBCXX
#include <math.h>  // sqrtf
#endif

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
  T raw[16 / sizeof(T)] = {};
};

// 0 or FF..FF, same size as Vec128.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> v;  // zero-initialized
  return v;
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ BitCast

template <class D, class VFrom>
HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
  VFromD<D> to;
  CopySameSize(&v.raw, &to.raw);
  return to;
}

// ------------------------------ ResizeBitCast

template <class D, class VFrom>
HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
  using DFrom = DFromV<VFrom>;
  using TFrom = TFromD<DFrom>;
  using TTo = TFromD<D>;

  constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom);
  constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D);
  constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);

  VFromD<D> to = Zero(d);
  CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
  return to;
}

namespace detail {

// ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if
// VFromD<DTo> is a larger vector than FromV
template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
                                               ToSizeTag /* to_size_tag */,
                                               DTo d_to, DFrom /* d_from */,
                                               VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

}  // namespace detail

// ------------------------------ Set
template <class D, typename T2>
HWY_API VFromD<D> Set(D d, const T2 t) {
  VFromD<D> v;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
  }
  return v;
}

// ------------------------------ Undefined
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  result.raw[2] = t2;
  result.raw[3] = t3;
  result.raw[4] = t4;
  result.raw[5] = t5;
  result.raw[6] = t6;
  result.raw[7] = t7;
  result.raw[8] = t8;
  result.raw[9] = t9;
  result.raw[10] = t10;
  result.raw[11] = t11;
  result.raw[12] = t12;
  result.raw[13] = t13;
  result.raw[14] = t14;
  result.raw[15] = t15;
  return result;
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  result.raw[2] = t2;
  result.raw[3] = t3;
  result.raw[4] = t4;
  result.raw[5] = t5;
  result.raw[6] = t6;
  result.raw[7] = t7;
  return result;
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  result.raw[2] = t2;
  result.raw[3] = t3;
  return result;
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  return result;
}

// ------------------------------ Iota

template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, T2 first) {
  VFromD<D> v;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return v;
}

// ================================================== LOGICAL

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] &= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  return And(a, b);
}

// ------------------------------ AndNot
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> a, Vec128<T, N> b) {
  return And(Not(a), b);
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] |= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  return Or(a, b);
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] ^= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}

// ------------------------------ BroadcastSignBit
template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] < 0 ? -1 : 0);
  }
  return v;
}

// ------------------------------ Mask

// v must be 0 or FF..FF.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  Mask128<T, N> mask;
  CopySameSize(&v.raw, &mask.bits);
  return mask;
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <class DTo, class MFrom>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
  MFromD<DTo> to;
  CopySameSize(&mask.bits, &to.bits);
  return to;
}

template <class D>
VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
  VFromD<D> v;
  CopySameSize(&mask.bits, &v.raw);
  return v;
}

template <class D>
HWY_API MFromD<D> FirstN(D d, size_t n) {
  MFromD<D> m;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    m.bits[i] = MFromD<D>::FromBool(i < n);
  }
  return m;
}

// Returns mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  return IfVecThenElse(VecFromMask(d, mask), yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  const DFromV<decltype(yes)> d;
  return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const DFromV<decltype(no)> d;
  return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto vi = BitCast(di, v);

  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return IfNegativeThenElse(v, Zero(d), v);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  using TU = hwy::MakeUnsigned<T>;
  for (size_t i = 0; i < N; ++i) {
    const TU raw_u = static_cast<TU>(v.raw[i]);
    const auto shifted = raw_u << kBits;  // separate line to avoid MSVC warning
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
    }
  }
#endif
  return v;
}

// ------------------------------ RotateRight (ShiftRight)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ ShiftLeftSame

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
    }
  }
#endif
  return v;
}

// ------------------------------ Shl

template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
                         << bits.raw[i];
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted =
          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift = static_cast<size_t>(
          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
    }
  }
#endif
  return v;
}

// ================================================== ARITHMETIC

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] += b.raw[i];
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] -= b.raw[i];
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Sub(hwy::IsFloatTag<T>(), a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Add(hwy::IsFloatTag<T>(), a, b);
}

// ------------------------------ SumsOf8

template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}

template <size_t N>
HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
  Vec128<int64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}

// ------------------------------ SaturatedAdd
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeSigned<MakeWide<T>>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(HWY_MIN(
        HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) + b.raw[i]),
        hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ SaturatedSub
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeSigned<MakeWide<T>>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(HWY_MIN(
        HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) - b.raw[i]),
        hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ AverageRound
template <typename T, size_t N>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  static_assert(!IsSigned<T>(), "Only for unsigned");
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
  }
  return a;
}

// ------------------------------ Abs

template <typename T, size_t N>
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = ScalarAbs(a.raw[i]);
  }
  return a;
}

// ------------------------------ Min/Max

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (ScalarIsNaN(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (ScalarIsNaN(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
    }
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (ScalarIsNaN(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (ScalarIsNaN(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Min(hwy::IsFloatTag<T>(), a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Max(hwy::IsFloatTag<T>(), a, b);
}

// ------------------------------ Neg

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Zero(d) - v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return detail::Neg(hwy::IsFloatTag<T>(), v);
}

// ------------------------------ Mul/Div

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] *= b.raw[i];
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

}  // namespace detail

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

template <typename T, size_t N>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Mul(hwy::TypeTag<T>(), a, b);
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
  }
  return a;
}

// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
  }
  return a;
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    Vec128<uint16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
        16);
  }
  return a;
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i] + 16384) >> 15);
  }
  return a;
}

// Multiplies even lanes (0, 2, ..) and returns the double-wide result.
template <class T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
                                                 Vec128<T, N> b) {
  using TW = MakeWide<T>;
  Vec128<TW, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const TW a_wide = a.raw[i];
    mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i]);
  }
  return mul;
}

// Multiplies odd lanes (1, 3, ..) and returns the double-wide result.
template <class T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
                                                Vec128<T, N> b) {
  using TW = MakeWide<T>;
  Vec128<TW, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const TW a_wide = a.raw[i + 1];
    mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i + 1]);
  }
  return mul;
}

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    // Zero inputs are allowed, but callers are responsible for replacing the
    // return value with something else (typically using IfThenElse). This check
    // avoids a ubsan error. The result is arbitrary.
    v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
  }
  return v;
}

// generic_ops takes care of integer T.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return mul * x + add;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return add - mul * x;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return mul * x - sub;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    const float half = v.raw[i] * 0.5f;
    // Initial guess based on log2(f)
    v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
        0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
    // One Newton-Raphson iteration
    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
  }
  return v;
}

namespace detail {

static HWY_INLINE float ScalarSqrt(float v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint32_t bits = BitCastScalar<uint32_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1 << 29) + (bits >> 1) - (1 << 22);
  return BitCastScalar<float>(bits);
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return sqrtf(v);
#endif  // !HWY_NO_LIBCXX
}
static HWY_INLINE double ScalarSqrt(double v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint64_t bits = BitCastScalar<uint64_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
  return BitCastScalar<double>(bits);
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return sqrt(v);
#endif  // HWY_NO_LIBCXX
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = detail::ScalarSqrt(v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point rounding

template <typename T, size_t N>
HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const T k0 = ConvertScalarTo<T>(0);
  const Vec128<T, N> a = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
      continue;
    }
    const T rounded_f = ConvertScalarTo<T>(rounded);
    // Round to even
    if ((rounded & 1) &&
        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
      v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
      continue;
    }
    v.raw[i] = rounded_f;
  }
  return v;
}

// Round-to-nearest even.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
  using T = float;
  using TI = int32_t;
  const T k0 = ConvertScalarTo<T>(0);

  const Vec128<float, N> abs = Abs(v);
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = ScalarSignBit(v.raw[i]);

    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      // Check if too large to cast or NaN
      if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
        continue;
      }
      ret.raw[i] = static_cast<TI>(v.raw[i]);
      continue;
    }
    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = ConvertScalarTo<T>(rounded);
    // Round to even
    if ((rounded & 1) &&
        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> abs = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(abs.raw[i] <= MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const TI truncated = static_cast<TI>(v.raw[i]);
    if (truncated == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
      continue;
    }
    v.raw[i] = static_cast<T>(truncated);
  }
  return v;
}

// Toward +infinity, aka ceiling
template <typename Float, size_t N>
Vec128<Float, N> Ceil(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool positive = v.raw[i] > Float(0.0);

    Bits bits = BitCastScalar<Bits>(v.raw[i]);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => 0 or 1.
    if (exponent < 0) {
      v.raw[i] = positive ? Float{1} : Float{-0.0};
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round up
    if (positive) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    v.raw[i] = BitCastScalar<Float>(bits);
  }
  return v;
}

// Toward -infinity, aka floor
template <typename Float, size_t N>
Vec128<Float, N> Floor(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool negative = v.raw[i] < Float(0.0);

    Bits bits = BitCastScalar<Bits>(v.raw[i]);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => -1 or 0.
    if (exponent < 0) {
      v.raw[i] = negative ? Float(-1.0) : Float(0.0);
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round down
    if (negative) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    v.raw[i] = BitCastScalar<Float>(bits);
  }
  return v;
}

// ------------------------------ Floating-point classification

template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
  Mask128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
    ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
  }
  return ret;
}

// ================================================== COMPARE

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator!=(Vec128<T, N> a, Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
  }
  return m;
}

// ------------------------------ Lt128

// Only makes sense for full vectors of u64.
template <class D>
HWY_API MFromD<D> Lt128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) {
  const bool lt =
      (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}

template <class D>
HWY_API MFromD<D> Lt128Upper(D /* tag */, Vec128<uint64_t> a,
                             Vec128<uint64_t> b) {
  const bool lt = a.raw[1] < b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}

// ------------------------------ Eq128

// Only makes sense for full vectors of u64.
template <class D>
HWY_API MFromD<D> Eq128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) {
  const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
  return ret;
}

template <class D>
HWY_API Mask128<uint64_t> Ne128(D /* tag */, Vec128<uint64_t> a,
                                Vec128<uint64_t> b) {
  const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
  return ret;
}

template <class D>
HWY_API MFromD<D> Eq128Upper(D /* tag */, Vec128<uint64_t> a,
                             Vec128<uint64_t> b) {
  const bool eq = a.raw[1] == b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
  return ret;
}

template <class D>
HWY_API MFromD<D> Ne128Upper(D /* tag */, Vec128<uint64_t> a,
                             Vec128<uint64_t> b) {
  const bool ne = a.raw[1] != b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
  return ret;
}

// ------------------------------ Min128, Max128 (Lt128)

template <class D>
HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D>
HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

template <class D>
HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D>
HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}

// ================================================== MEMORY

// ------------------------------ Load

template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  VFromD<D> v;
  CopyBytes<d.MaxBytes()>(aligned, v.raw);  // copy from array
  return v;
}

template <class D>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElse(m, LoadU(d, p), v);
}

template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

template <class D>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t max_lanes_to_load) {
  VFromD<D> v = Zero(d);
  const size_t N = Lanes(d);
  const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
  CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
  return v;
}

template <class D>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t max_lanes_to_load) {
  VFromD<D> v = no;
  const size_t N = Lanes(d);
  const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
  CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
  return v;
}

// ------------------------------ Store

template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  CopyBytes<d.MaxBytes()>(v.raw, aligned);  // copy to array
}

template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (m.bits[i]) p[i] = v.raw[i];
  }
}

#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

template <class D>
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  const size_t num_of_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
  CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
}

// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
// We implement those here because scalar code is likely faster than emulation
// via shuffles.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  alignas(16) T buf0[MaxLanes(d)];
  alignas(16) T buf1[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
}

template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  alignas(16) T buf0[MaxLanes(d)];
  alignas(16) T buf1[MaxLanes(d)];
  alignas(16) T buf2[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
}

template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  alignas(16) T buf0[MaxLanes(d)];
  alignas(16) T buf1[MaxLanes(d)];
  alignas(16) T buf2[MaxLanes(d)];
  alignas(16) T buf3[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
    buf3[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
  v3 = Load(d, buf3);
}

// ------------------------------ StoreInterleaved2/3/4

template <class D>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
  }
}

template <class D>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
  }
}

template <class D>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
    *unaligned++ = v3.raw[i];
  }
}

// ------------------------------ Stream
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ Scatter in generic_ops-inl.h
// ------------------------------ Gather in generic_ops-inl.h

// ================================================== CONVERT

// ConvertTo and DemoteTo with floating-point input and integer output truncate
// (rounding toward zero).

namespace detail {

template <class ToT, class FromT>
HWY_INLINE ToT CastValueForF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

  using FromTU = MakeUnsigned<FromT>;
  using ToTU = MakeUnsigned<ToT>;

  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
      kMaxExpField));

  // If ToT is signed, compare only the exponent bits of val against
  // kMinOutOfRangeExpField.
  //
  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
  // val against kMinOutOfRangeExpField as a negative value is outside of the
  // range of an unsigned integer type.
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
                                static_cast<ToTU>(ScalarSignBit(val)));
}

template <class ToT, class ToTypeTag, class FromT>
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
  return ConvertScalarTo<ToT>(val);
}

template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}

template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}

}  // namespace detail

template <class DTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)>
HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  static_assert(sizeof(TFromD<DTo>) > sizeof(TFrom), "Not promoting");
  VFromD<DTo> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // For bits Y > X, floatX->floatY and intX->intY are always representable.
    ret.raw[i] = detail::CastValueForPromoteTo<TFromD<DTo>>(
        hwy::TypeTag<TFromD<DTo>>(), from.raw[i]);
  }
  return ret;
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
// so we overload for TFrom=double and ToT={float,int32_t}.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // Prevent ubsan errors when converting float to narrower integer/float
    if (ScalarIsInf(from.raw[i]) ||
        ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
      ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
                                              : HighestValue<float>();
      continue;
    }
    ret.raw[i] = static_cast<float>(from.raw[i]);
  }
  return ret;
}
template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // Prevent ubsan errors when converting double to narrower integer/int32_t
    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
  }
  return ret;
}

template <class DTo, typename TFrom, size_t N, HWY_IF_SIGNED(TFrom),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
  using TTo = TFromD<DTo>;
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  VFromD<DTo> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    from.raw[i] =
        HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw[i]), LimitsMax<TTo>());
    ret.raw[i] = static_cast<TTo>(from.raw[i]);
  }
  return ret;
}

template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
          HWY_IF_UNSIGNED_D(DTo)>
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
  using TTo = TFromD<DTo>;
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  VFromD<DTo> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>());
    ret.raw[i] = static_cast<TTo>(from.raw[i]);
  }
  return ret;
}

template <class DTo, typename TFrom, size_t N, HWY_IF_UI64(TFrom),
          HWY_IF_F32_D(DTo)>
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
  using TTo = TFromD<DTo>;
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  VFromD<DTo> ret;
  for (size_t i = 0; i < N; ++i) {
    // int64_t/uint64_t to float: okay to cast to float as an int64_t/uint64_t
    // value is always within the range of a float
    ret.raw[i] = static_cast<TTo>(from.raw[i]);
  }
  return ret;
}

template <class DBF16, HWY_IF_BF16_D(DBF16), class VF32>
HWY_API VFromD<DBF16> ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const VFromD<decltype(du32)> b_in_lower = ShiftRight<16>(BitCast(du32, b));
  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
  const VFromD<decltype(du32)> a_mask = Set(du32, 0xFFFF0000);
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}

template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
          HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const RepartitionToWide<decltype(dn)> dw;
  const size_t NW = Lanes(dw);
  using TN = TFromD<DN>;
  const TN min = LimitsMin<TN>();
  const TN max = LimitsMax<TN>();
  VFromD<DN> ret;
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
  }
  return ret;
}

template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const RepartitionToWide<decltype(dn)> dw;
  const size_t NW = Lanes(dw);
  using TN = TFromD<DN>;
  const TN max = LimitsMax<TN>();
  VFromD<DN> ret;
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = static_cast<TN>(HWY_MIN(b.raw[i], max));
  }
  return ret;
}

template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  return ReorderDemote2To(dn, a, b);
}

template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
          HWY_IF_F32_D(DFromV<V>),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  const size_t NW = Lanes(dn) / 2;
  using TN = TFromD<DN>;
  VFromD<DN> ret;
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
  }
  return ret;
}

namespace detail {

HWY_INLINE void StoreU16ToF16(const uint16_t val,
                              hwy::float16_t* HWY_RESTRICT to) {
  CopySameSize(&val, to);
}

HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
  uint16_t bits16;
  CopySameSize(from, &bits16);
  return bits16;
}

}  // namespace detail

template <class D, HWY_IF_F32_D(D), size_t N>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = F32FromBF16(v.raw[i]);
  }
  return ret;
}

template <class D, HWY_IF_BF16_D(D), size_t N>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = BF16FromF32(v.raw[i]);
  }
  return ret;
}

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename TFrom, typename DTo>
HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
                              Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  using ToT = TFromD<DTo>;
  static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
  VFromD<DTo> ret;
  constexpr size_t N = HWY_MAX_LANES_D(DTo);

  for (size_t i = 0; i < N; ++i) {
    // float## -> int##: return closest representable value
    ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
  }
  return ret;
}

template <typename TFrom, typename DTo>
HWY_API VFromD<DTo> ConvertTo(hwy::NonFloatTag /*tag*/, DTo /* tag */,
                              Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  using ToT = TFromD<DTo>;
  static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
  VFromD<DTo> ret;
  constexpr size_t N = HWY_MAX_LANES_D(DTo);
  for (size_t i = 0; i < N; ++i) {
    // int## -> float##: no check needed
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

}  // namespace detail

template <class DTo, typename TFrom>
HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
}

template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
}

// ------------------------------ Truncations

template <class D, HWY_IF_U8_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

template <class D, HWY_IF_U16_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
  }
  return ret;
}

template <class D, HWY_IF_U32_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
  }
  return ret;
}

template <class D, HWY_IF_U8_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

template <class D, HWY_IF_U16_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
  }
  return ret;
}

template <class D, HWY_IF_U8_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint16_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif

template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
  const RepartitionToWide<decltype(dn)> dw;
  const size_t NW = Lanes(dw);
  using TW = TFromD<decltype(dw)>;
  using TN = TFromD<decltype(dn)>;
  VFromD<DN> ret;
  constexpr TW max_val{LimitsMax<TN>()};

  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = static_cast<TN>(a.raw[i] & max_val);
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = static_cast<TN>(b.raw[i] & max_val);
  }
  return ret;
}

// ================================================== COMBINE

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <class D>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return LowerHalf(v);
}

template <class D>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  VFromD<D> ret;
  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
  return ret;
}

template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;  // zero-initialized
  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
  return ret;
}

template <class D, class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}

template <class D>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}

template <class D>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
  return ret;
}

template <class D>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}

template <class D>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
  return ret;
}

template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[i] = lo.raw[2 * i];
  }
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
  }
  return ret;
}

// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
#if HWY_ARCH_RVV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
#else
#define HWY_EMU128_CONCAT_INLINE HWY_API
#endif

template <class D>
HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[i] = lo.raw[2 * i + 1];
  }
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
  }
  return ret;
}

// ------------------------------ CombineShiftRightBytes
template <int kBytes, class D>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  VFromD<D> ret;
  const uint8_t* HWY_RESTRICT lo8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
  return ret;
}

// ------------------------------ ShiftLeftBytes

template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  VFromD<D> ret;
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  ZeroBytes<kBytes>(ret8);
  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
  return ret;
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  VFromD<D> ret;
  const uint8_t* HWY_RESTRICT v8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
  return ret;
}

// ------------------------------ ShiftRightLanes
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}

// ================================================== SWIZZLE

template <typename T, size_t N>
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.21 Sekunden ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.