// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.
#include <riscv_vector.h>
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Support for vfloat16m*_t and PromoteTo/DemoteTo.
#ifdef __riscv_zvfhmin
#define HWY_RVV_HAVE_F16C 1
#else
#define HWY_RVV_HAVE_F16C 0
#endif
template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;
template <typename T, size_t N, int kPow2>
constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
  // argument enables fractional LMUL < 1. Limit to 64 because that is the
  // largest value for which vbool##_t are defined.
  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
}
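// For example (illustrative): Simd<uint8_t, N, 0> (SEW=8, LMUL=1) gives
// 1*8*8 / 8 = 8, i.e. vbool8_t; Simd<uint8_t, N, -2> (LMUL=1/4) gives
// 64 / 2 = 32 (vbool32_t); Simd<uint64_t, N, 3> (LMUL=8) gives 512 / 64 = 8.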
namespace detail {

template <class D>
class AdjustSimdTagToMinVecPow2_t {};

template <typename T, size_t N, int kPow2>
class AdjustSimdTagToMinVecPow2_t<Simd<T, N, kPow2>> {
 private:
  using D = Simd<T, N, kPow2>;
  static constexpr int kMinVecPow2 =
      -3 + static_cast<int>(FloorLog2(sizeof(T)));
  static constexpr size_t kNumMaxLanes = HWY_MAX_LANES_D(D);
  static constexpr int kNewPow2 = HWY_MAX(kPow2, kMinVecPow2);
  static constexpr size_t kNewN = D::template NewN<kNewPow2, kNumMaxLanes>();

 public:
  using type = Simd<T, kNewN, kNewPow2>;
};

template <class D>
using AdjustSimdTagToMinVecPow2 =
    typename AdjustSimdTagToMinVecPow2_t<RemoveConst<D>>::type;

}  // namespace detail
// ================================================== MACROS
// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.
namespace detail {
// for code folding
// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
// The first three arguments are arbitrary SEW, LMUL, SHIFT such that
// SEW >> SHIFT = MLEN.
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
X_MACRO(64, 0, 64, NAME, OP) \
X_MACRO(32, 0, 32, NAME, OP) \
X_MACRO(16, 0, 16, NAME, OP) \
X_MACRO(8, 0, 8, NAME, OP) \
X_MACRO(8, 1, 4, NAME, OP) \
X_MACRO(8, 2, 2, NAME, OP) \
X_MACRO(8, 3, 1, NAME, OP)
// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
//
// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP
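// For example, the SEW=16/LMUL=m2 row below passes SEWD=32, SEWH=8, LMULD=m4,
// LMULH=m1, SHIFT=1 and MLEN=8, so its mask type is vbool8_t.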
// LMULS = _TRUNC: truncatable (not the smallest LMUL)
#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)            \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)      \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)      \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)            \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)      \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)      \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)            \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)   \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)    \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)     \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)            \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)    \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)    \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)      \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)      \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)      \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)      \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP)  \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)    \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)     \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)      \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP)    \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)     \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)     \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
// LMULS = _LE2: <= 2
#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)  \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)   \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP)   \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)
// LMULS = _EXT: not the largest LMUL
#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)
// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
// one less than should be supported, with all other parameters (vector type
// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
// returns half of what it usually would.
//
// Notice that we can only add overloads whenever there is a D argument: those
// are unique with respect to non-virtual-LMUL overloads because their kPow2
// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
// defining another overload with the same LMUL would be an error. Thus we have
// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
// functions that take a D.
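// For example (illustrative): Simd<uint64_t, N, -1> would require the
// nonexistent vuint64mf2_t, so its overloads reuse vuint64m1_t (the SHIFT=-1
// row below) and Lanes() for that tag returns half of the m1 lane count.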
#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)
// ALL + VIRT
#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// LE2 + VIRT
#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// EXT + VIRT
#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// DEMOTE + VIRT
#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)
// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)
// SEW for float:
// Used for conversion instructions if HWY_RVV_HAVE_F16C.
#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
#if HWY_HAVE_FLOAT16
// Full support for f16 in all ops
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
// Only BF16 is emulated.
#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#else
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#endif
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)
// Commonly used type/SEW groups:
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_TUP(BASE, SEW, LMUL, TUP) v##BASE##SEW##LMUL##x##TUP##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t
}  // namespace detail
// Until we have full intrinsic support for fractional LMUL, mixed-precision
// code can use LMUL 1..8 (adequate unless they need many registers).
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <>                                                                  \
  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> {                                \
    using Lane = HWY_RVV_T(BASE, SEW);                                         \
    using type = ScalableTag<Lane, SHIFT>;                                     \
  };
HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
#undef HWY_SPECIALIZE
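// For example, this specialization makes DFromV<vuint32m2_t> an alias of
// ScalableTag<uint32_t, 1>, so ops can recover the d tag from a vector type.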
// ------------------------------ Lanes
// WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  template <size_t N>                                                         \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) {                     \
    constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW));                 \
    constexpr size_t kCap = MaxLanes(d);                                      \
    /* If no cap, avoid generating a constant by using VLMAX. */              \
    return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL()                      \
                      : __riscv_vsetvl_e##SEW##LMUL(kCap);                    \
  }                                                                           \
  template <size_t N>                                                         \
  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
    /* If no cap, avoid the HWY_MIN. */                                       \
    return detail::IsFull(d)                                                  \
               ? __riscv_vsetvl_e##SEW##LMUL(cap)                             \
               : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, MaxLanes(d)));      \
  }
#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                           SHIFT, MLEN, NAME, OP)                             \
  template <size_t N>                                                         \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) {                     \
    constexpr size_t kCap = MaxLanes(d);                                      \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */   \
    /* vsetvl may or may not be correct, so do it ourselves. */               \
    const size_t actual =                                                     \
        detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT);             \
    return HWY_MIN(actual, kCap);                                             \
  }                                                                           \
  template <size_t N>                                                         \
  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */   \
    /* vsetvl may or may not be correct, so do it ourselves. */               \
    const size_t actual =                                                     \
        detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT);             \
    /* If no cap, avoid an extra HWY_MIN. */                                  \
    return detail::IsFull(d) ? HWY_MIN(actual, cap)                           \
                             : HWY_MIN(HWY_MIN(actual, cap), MaxLanes(d));    \
  }
HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
#undef HWY_RVV_LANES
#undef HWY_RVV_LANES_VIRT
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API size_t Lanes(D /* tag */) {
  return Lanes(RebindToUnsigned<D>());
}
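// Usage example (illustrative): on hardware with VLEN=128 bits,
// Lanes(ScalableTag<uint32_t>()) returns 4 (LMUL=1) and
// Lanes(ScalableTag<uint32_t, 2>()) returns 16 (LMUL=4).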
// ------------------------------ Common x-macros
// Last argument to most intrinsics. Use when the op has no d arg of its own,
// which means there is no user-specified cap.
#define HWY_RVV_AVL(SEW, SHIFT) \
Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                          SHIFT, MLEN, NAME, OP)                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {   \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }
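// For example, expanding HWY_RVV_RETV_ARGV for Not on u32/m1 yields roughly:
//   HWY_API vuint32m1_t Not(vuint32m1_t v) {
//     return __riscv_vnot_v_u32m1(v, HWY_RVV_AVL(32, 0));
//   }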
// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                           SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {            \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT));  \
  }
// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b,                       \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }
// vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                            SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m,                   \
           HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b,              \
                                                     HWY_RVV_AVL(SEW, SHIFT)); \
  }
// mask = f(mask)
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)                \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {                  \
    return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT));   \
  }
// ================================================== INIT
// ------------------------------ Set
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  template <size_t N>                                                       \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) {    \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d));                \
  }
HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET
// Treat bfloat16_t as int16_t (using the previously defined Set overloads);
// required for Zero and VFromD.
template <size_t N, int kPow2>
decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(
    Simd<hwy::bfloat16_t, N, kPow2> d, hwy::bfloat16_t arg) {
  return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
}
#if !HWY_HAVE_FLOAT16
// Otherwise already defined above.
// WARNING: returns a different type than emulated bfloat16_t so that we can
// implement PromoteTo overloads for both bfloat16_t and float16_t, and also
// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
template <size_t N, int kPow2>
decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(
    Simd<hwy::float16_t, N, kPow2> d, hwy::float16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
}
#endif
template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));
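// For example, VFromD<ScalableTag<float>> is vfloat32m1_t and
// VFromD<ScalableTag<uint8_t, 3>> is vuint8m8_t.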
// ------------------------------ Zero
template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                      \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) {                     \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(); /* no AVL */               \
  }
HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
#undef HWY_RVV_UNDEFINED
}  // namespace detail
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}
// ------------------------------ BitCast
namespace detail {
// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {    \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(          \
        v); /* no AVL */                                                      \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC
// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  template <size_t N>                                                       \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD)                                       \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */,                     \
           HWY_RVV_V(BASE, SEW, LMUL) v) {                                  \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(        \
        v); /* no AVL */                                                    \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT
// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
// the same as the actual input type.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                     \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */,                   \
           HWY_RVV_V(BASE, SEW, LMUL) v) {                                \
    return v;                                                             \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT
template <class D, HWY_RVV_IF_EMULATED_D(D)>
VFromD<D> Ext(D d, VFromD<Half<D>> v) {
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> duh;
  return BitCast(d, Ext(du, BitCast(duh, v)));
}
// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.
// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                        \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
                                         vuint8##LMUL##_t v) {           \
    return v;                                                            \
  }                                                                      \
  template <size_t N>                                                    \
  HWY_API vuint8##LMUL##_t BitCastFromByte(                              \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
    return v;                                                            \
  }
// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                        \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
                                         vint8##LMUL##_t v) {            \
    return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v);                \
  }                                                                      \
  template <size_t N>                                                    \
  HWY_API vint8##LMUL##_t BitCastFromByte(                               \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
    return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v);                \
  }
// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <typename T, size_t N>                                              \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,            \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) {       \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v);                  \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                          \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {            \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v);                  \
  }
// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                        \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL(                    \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v));          \
  }                                                                      \
  template <size_t N>                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                    \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(           \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v));                   \
  }
// Additional versions for virtual LMUL using LMULH for byte vectors.
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                            \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,         \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) {    \
    return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
  }                                                                          \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                        \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) {         \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2;                                     \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v);                          \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2);               \
  }
// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                             \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,          \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) {     \
    return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL(           \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)));              \
  }                                                                           \
  template <size_t N>                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                         \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) {          \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2;                                      \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v);                           \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(                \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2));                       \
  }
HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
#if HWY_HAVE_FLOAT16
// HWY_RVV_FOREACH_F already covered float16_t.
#elif HWY_RVV_HAVE_F16C
// zvfhmin provides reinterpret* intrinsics:
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
#else
template <size_t N, int kPow2>
HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
    Simd<hwy::float16_t, N, kPow2> /* d */,
    VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}
#endif
#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF
#undef HWY_RVV_CAST_VIRT_U
#undef HWY_RVV_CAST_VIRT_IF
template <size_t N, int kPow2>
HWY_INLINE VFromD<Simd<int16_t, N, kPow2>> BitCastFromByte(
    Simd<hwy::bfloat16_t, N, kPow2> /* d */,
    VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<int16_t, N, kPow2>(), v);
}
}  // namespace detail
template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
}
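// Usage example (illustrative): reinterpret u32 bits as f32 without changing
// them, e.g. the pattern 0x3F800000 becomes 1.0f in every lane:
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du;
//   const auto v1 = BitCast(df, Set(du, 0x3F800000u));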
// ------------------------------ Iota
namespace detail {
#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
                     MLEN, NAME, OP)                                          \
  template <size_t N>                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d));                       \
  }
// For i8 lanes, this may well wrap around. Unsigned only is less error-prone.
HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
#undef HWY_RVV_IOTA
// Used by Expand.
#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) {         \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d));                \
  }
HWY_RVV_FOREACH_U(HWY_RVV_MASKED_IOTA, MaskedIota, iota_m, _ALL_VIRT)
#undef HWY_RVV_MASKED_IOTA
}  // namespace detail
// ================================================== LOGICAL
// ------------------------------ Not
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Not(const V v) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Not(BitCast(DU(), v)));
}
// ------------------------------ And
// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
}
// ------------------------------ Or
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
}
// ------------------------------ Xor
// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
}
// ------------------------------ AndNot
template <class V>
HWY_API V AndNot(const V not_a, const V b) {
  return And(Not(not_a), b);
}
// ------------------------------ Xor3
template <class V>
HWY_API V Xor3(V x1, V x2, V x3) {
  return Xor(x1, Xor(x2, x3));
}
// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}
// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}
// ------------------------------ CopySign
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)
template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  // RVV can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}
// ================================================== ARITHMETIC
// Per-target flags to prevent generic_ops-inl.h defining Add etc.
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif
// ------------------------------ Add
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)
// ------------------------------ Sub
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
// ------------------------------ SaturatedAdd
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif
#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif
#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
// ------------------------------ SaturatedSub
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
// ------------------------------ AverageRound
// Define this to opt-out of the default behavior, which is AVOID on certain
// compiler versions. You can define only this to use VXRM, or define both this
// and HWY_RVV_AVOID_VXRM to always avoid VXRM.
#ifndef HWY_RVV_CHOOSE_VXRM
// Assume that GCC-13 defaults to 'avoid VXRM'. Tested with GCC 13.1.0.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
#define HWY_RVV_AVOID_VXRM
// Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid.
// Assume earlier versions avoid.
#elif HWY_COMPILER_CLANG && \
(HWY_COMPILER_CLANG < 1600 || __riscv_v_intrinsic < 11000)
#define HWY_RVV_AVOID_VXRM
#endif
#endif // HWY_RVV_CHOOSE_VXRM
// Adding __RISCV_VXRM_* was a backwards-incompatible change and it is not clear
// how to detect whether it is supported or required. #ifdef __RISCV_VXRM_RDN
// does not work because it seems to be a compiler built-in, but neither does
// __has_builtin(__RISCV_VXRM_RDN). The intrinsics version was also not updated,
// so we require a macro to opt out of the new intrinsics.
#ifdef HWY_RVV_AVOID_VXRM
#define HWY_RVV_INSERT_VXRM(vxrm, avl) avl
#define __RISCV_VXRM_RNU
#define __RISCV_VXRM_RDN
#else // default: use new vxrm arguments
#define HWY_RVV_INSERT_VXRM(vxrm, avl) vxrm, avl
#endif
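// For example, with the new intrinsics HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU,
// avl) expands to "__RISCV_VXRM_RNU, avl"; with HWY_RVV_AVOID_VXRM it expands
// to just "avl", matching the older intrinsic signatures.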
// Extra rounding mode = up argument.
#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                             SHIFT, MLEN, NAME, OP)                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(                               \
        a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }
HWY_RVV_FOREACH_U08(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)
#undef HWY_RVV_RETV_AVERAGE
// ------------------------------ ShiftLeft[Same]
// Intrinsics do not define .vi forms, so use .vx instead.
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
                      MLEN, NAME, OP)                                          \
  template <int kBits>                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {      \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits,                      \
                                                HWY_RVV_AVL(SEW, SHIFT));      \
  }                                                                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                     \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
                                                HWY_RVV_AVL(SEW, SHIFT));      \
  }
HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)
// ------------------------------ ShiftRight[Same]
HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
#undef HWY_RVV_SHIFT
// ------------------------------ SumsOf8 (ShiftRight, Add)
template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
const DFromV<VU8> du8;
const RepartitionToWide<decltype(du8)> du16;
const RepartitionToWide<decltype(du16)> du32;
const RepartitionToWide<decltype(du32)> du64;
using VU16 = VFromD<decltype(du16)>;
const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
const VU16 szz_FE_zz_BA_zz_76_zz_32 =
BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
}
template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
const DFromV<VI8> di8;
const RepartitionToWide<decltype(di8)> di16;
const RepartitionToWide<decltype(di16)> di32;
const RepartitionToWide<decltype(di32)> di64;
const RebindToUnsigned<decltype(di32)> du32;
const RebindToUnsigned<decltype(di64)> du64;
using VI16 = VFromD<decltype(di16)>;
const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
const VI16 sDC_zz_98_zz_54_zz_10_zz =
BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
const VI16 sFC_xx_B8_xx_74_xx_30_xx =
Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
}
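// For example (illustrative): for u8 lanes {1,2,3,4,5,6,7,8, 10,10,10,10,
// 10,10,10,10, ...}, SumsOf8 returns u64 lanes {36, 80, ...}: each u64 lane
// holds the sum of its eight corresponding u8 lanes.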
// ------------------------------ RotateRight
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
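// For example (illustrative), RotateRight<8>(Set(du32, 0x12345678u)) yields
// 0x78123456u in every lane (assuming du32 is ScalableTag<uint32_t>).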
// ------------------------------ Shl
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                         SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits,                    \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }
HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)
#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                         SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du;  \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits),       \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)
// ------------------------------ Shr
HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
#undef HWY_RVV_SHIFT_II
#undef HWY_RVV_SHIFT_VV
// ------------------------------ Min
namespace detail {
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MinS, minu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MinS, min_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MinS, fmin_vf, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)
// ------------------------------ Max
namespace detail {
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
// ------------------------------ Mul
// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
// ------------------------------ MulHigh
// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
// Used by MulEven; vwmul does not work for m8.
namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
}  // namespace detail
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
// ------------------------------ MulFixedPoint15
// Extra rounding mode = up argument.
#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
                      MLEN, NAME, OP)                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(                               \
        a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }
HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)
#undef HWY_RVV_MUL15
// ------------------------------ Div
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
// ------------------------------ MaskedAddOr etc.
#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)
// ------------------------------ ApproximateReciprocal
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)
// ------------------------------ Sqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)
// ------------------------------ ApproximateReciprocalSqrt
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)
// ------------------------------ MulAdd
// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif
// Note: op is still named vv, not vvv.
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x,    \
           HWY_RVV_V(BASE, SEW, LMUL) add) {                                \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x,                \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }
HWY_RVV_FOREACH_UI(HWY_RVV_FMA, MulAdd, macc, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)
// ------------------------------ NegMulAdd
HWY_RVV_FOREACH_UI(HWY_RVV_FMA, NegMulAdd, nmsac, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)
// ------------------------------ MulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)
// ------------------------------ NegMulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
#undef HWY_RVV_FMA
// ================================================== COMPARE
// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
// vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
// of all bits; SEW=8 / LMUL=4 = half of all bits.
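// For example (illustrative), Eq on vuint8m1_t returns vbool8_t (one mask bit
// per 8 vector bits), whereas Eq on vint8m4_t returns vbool2_t.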
// mask = f(vector, vector)
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_M(MLEN)                                                   \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(                  \
        a, b, HWY_RVV_AVL(SEW, SHIFT));                                     \
  }
// mask = f(vector, scalar)
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_M(MLEN)                                                   \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {          \
    return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN(                     \
        a, b, HWY_RVV_AVL(SEW, SHIFT));                                     \
  }
// ------------------------------ Eq
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
}  // namespace detail
// ------------------------------ Ne
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
}  // namespace detail
// ------------------------------ Lt
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
}  // namespace detail
// ------------------------------ Le
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, msle, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
#undef HWY_RVV_RETM_ARGVV
#undef HWY_RVV_RETM_ARGVS
// ------------------------------ Gt/Ge
template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
}
template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);
}
// ------------------------------ TestBit
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return detail::NeS(And(a, bit), 0);
}
// ------------------------------ Not
// NOLINTNEXTLINE
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
// ------------------------------ And
// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)                  \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) {  \
    return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT));  \
  }
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)
// ------------------------------ AndNot
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)
// ------------------------------ Or
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
// ------------------------------ ExclusiveNeither
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
#undef HWY_RVV_RETM_ARGMM
// ------------------------------ IfThenElse
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes,                 \
           HWY_RVV_V(BASE, SEW, LMUL) no) {                                   \
    return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m,                  \
                                                 HWY_RVV_AVL(SEW, SHIFT));    \
  }
HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)
#undef HWY_RVV_IF_THEN_ELSE
// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}
// ------------------------------ IfThenZeroElse
#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                  LMULH, SHIFT, MLEN, NAME, OP)             \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) {              \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m,                      \
                                             HWY_RVV_AVL(SEW, SHIFT));      \
  }
HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
#undef HWY_RVV_IF_THEN_ZERO_ELSE
// ------------------------------ MaskFromVec
template <class D>
using MFromD = decltype(Eq(Zero(D()), Zero(D())));

template <class V>
HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
  return detail::NeS(v, 0);
}
// ------------------------------ MaskFalse
// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
// be relied upon, so define a variant of the generic_ops-inl implementation of
// MaskFalse that ensures all bits are zero as required by mask_test.
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif
template <class D>
HWY_API MFromD<D> MaskFalse(D d) {
  const DFromV<VFromD<decltype(d)>> d_full;
  return MaskFromVec(Zero(d_full));
}
// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  // No need to check lane size/LMUL are the same: if not, casting MFrom to
  // MFromD<D> would fail.
  return mask;
}
// ------------------------------ VecFromMask
// Returns mask ? ~0 : 0. No longer use sub.vx(Zero(), 1, mask) because per the
// default mask-agnostic policy, the result of inactive lanes may also be ~0.
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) {              \
    /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */   \
    const DFromV<VFromD<decltype(d)>> d_full;                                  \
    const RebindToSigned<decltype(d_full)> di;                                 \
    using TI = TFromD<decltype(di)>;                                           \
    return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m,   \
                                                        Lanes(d_full)));       \
  }