Quelle arm_neon-inl.h Sprache: C

// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit Arm NEON vectors and operations.
// External include guard in highway.h - see comment there.

// Arm NEON intrinsics are documented at:
// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();

// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
// the same target attribute as our code, see #834.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#include <arm_neon.h>  // NOLINT(build/include_order)
HWY_DIAGNOSTICS(pop)

// Must come after arm_neon.h.
namespace hwy {
namespace HWY_NAMESPACE {

namespace detail {  // for code folding and Raw128

// Macros used to define single and double function calls for multiple types
// for full and half vectors. These macros are undefined at the end of the file.

// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
// extend it to int32x4x2_t packs.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to be a macro on
// itself like with some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
// "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed.
// Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro
// expects two arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }

// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of neon functions starting with the given
// "prefix" for all the variants of certain types, as specified next to each
// macro. For example, the prefix "vsub" can be used to define the operator-
// using args=2.

// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
    (HWY_COMPILER_GCC_ACTUAL >= 1300 || HWY_COMPILER_CLANG >= 1100)
#define HWY_NEON_HAVE_BFLOAT16 1
#else
#define HWY_NEON_HAVE_BFLOAT16 0
#endif

// bfloat16_t
#if HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args) \
  HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args)    \
  HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args)    \
  HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args)
#else
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
#endif

// Used for conversion instructions if HWY_NEON_HAVE_F16C.
#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
                                                     args)                \
  HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args)    \
  HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args)

// float16_t
#if HWY_HAVE_FLOAT16
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
#endif

// Enable generic functions for whichever of (f16, bf16) are not supported.
#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
#else
#error "Logic error, handled all four cases"
#endif

// float
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// double
#if HWY_HAVE_FLOAT64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif

// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)          \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// For vzip1/2
#define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
#define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)

// For eor3q, which is only defined for full vectors.
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)
// Emulation of some intrinsics on armv7.
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif

// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2
// overloads for all vector types, even those (bfloat16_t) where the
// underlying vector is the same as others (uint16_t).
template <typename T, size_t N>
struct Tuple2;
template <typename T, size_t N>
struct Tuple3;
template <typename T, size_t N>
struct Tuple4;

template <>
struct Tuple2<uint8_t, 16> {
  uint8x16x2_t raw;
};
template <size_t N>
struct Tuple2<uint8_t, N> {
  uint8x8x2_t raw;
};
template <>
struct Tuple2<int8_t, 16> {
  int8x16x2_t raw;
};
template <size_t N>
struct Tuple2<int8_t, N> {
  int8x8x2_t raw;
};
template <>
struct Tuple2<uint16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<uint16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<int16_t, 8> {
  int16x8x2_t raw;
};
template <size_t N>
struct Tuple2<int16_t, N> {
  int16x4x2_t raw;
};
template <>
struct Tuple2<uint32_t, 4> {
  uint32x4x2_t raw;
};
template <size_t N>
struct Tuple2<uint32_t, N> {
  uint32x2x2_t raw;
};
template <>
struct Tuple2<int32_t, 4> {
  int32x4x2_t raw;
};
template <size_t N>
struct Tuple2<int32_t, N> {
  int32x2x2_t raw;
};
template <>
struct Tuple2<uint64_t, 2> {
  uint64x2x2_t raw;
};
template <size_t N>
struct Tuple2<uint64_t, N> {
  uint64x1x2_t raw;
};
template <>
struct Tuple2<int64_t, 2> {
  int64x2x2_t raw;
};
template <size_t N>
struct Tuple2<int64_t, N> {
  int64x1x2_t raw;
};

template <>
struct Tuple2<float32_t, 4> {
  float32x4x2_t raw;
};
template <size_t N>
struct Tuple2<float32_t, N> {
  float32x2x2_t raw;
};
#if HWY_HAVE_FLOAT64
template <>
struct Tuple2<float64_t, 2> {
  float64x2x2_t raw;
};
template <size_t N>
struct Tuple2<float64_t, N> {
  float64x1x2_t raw;
};
#endif  // HWY_HAVE_FLOAT64

template <>
struct Tuple3<uint8_t, 16> {
  uint8x16x3_t raw;
};
template <size_t N>
struct Tuple3<uint8_t, N> {
  uint8x8x3_t raw;
};
template <>
struct Tuple3<int8_t, 16> {
  int8x16x3_t raw;
};
template <size_t N>
struct Tuple3<int8_t, N> {
  int8x8x3_t raw;
};
template <>
struct Tuple3<uint16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<uint16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<int16_t, 8> {
  int16x8x3_t raw;
};
template <size_t N>
struct Tuple3<int16_t, N> {
  int16x4x3_t raw;
};
template <>
struct Tuple3<uint32_t, 4> {
  uint32x4x3_t raw;
};
template <size_t N>
struct Tuple3<uint32_t, N> {
  uint32x2x3_t raw;
};
template <>
struct Tuple3<int32_t, 4> {
  int32x4x3_t raw;
};
template <size_t N>
struct Tuple3<int32_t, N> {
  int32x2x3_t raw;
};
template <>
struct Tuple3<uint64_t, 2> {
  uint64x2x3_t raw;
};
template <size_t N>
struct Tuple3<uint64_t, N> {
  uint64x1x3_t raw;
};
template <>
struct Tuple3<int64_t, 2> {
  int64x2x3_t raw;
};
template <size_t N>
struct Tuple3<int64_t, N> {
  int64x1x3_t raw;
};

template <>
struct Tuple3<float32_t, 4> {
  float32x4x3_t raw;
};
template <size_t N>
struct Tuple3<float32_t, N> {
  float32x2x3_t raw;
};
#if HWY_HAVE_FLOAT64
template <>
struct Tuple3<float64_t, 2> {
  float64x2x3_t raw;
};
template <size_t N>
struct Tuple3<float64_t, N> {
  float64x1x3_t raw;
};
#endif  // HWY_HAVE_FLOAT64

template <>
struct Tuple4<uint8_t, 16> {
  uint8x16x4_t raw;
};
template <size_t N>
struct Tuple4<uint8_t, N> {
  uint8x8x4_t raw;
};
template <>
struct Tuple4<int8_t, 16> {
  int8x16x4_t raw;
};
template <size_t N>
struct Tuple4<int8_t, N> {
  int8x8x4_t raw;
};
template <>
struct Tuple4<uint16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<uint16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<int16_t, 8> {
  int16x8x4_t raw;
};
template <size_t N>
struct Tuple4<int16_t, N> {
  int16x4x4_t raw;
};
template <>
struct Tuple4<uint32_t, 4> {
  uint32x4x4_t raw;
};
template <size_t N>
struct Tuple4<uint32_t, N> {
  uint32x2x4_t raw;
};
template <>
struct Tuple4<int32_t, 4> {
  int32x4x4_t raw;
};
template <size_t N>
struct Tuple4<int32_t, N> {
  int32x2x4_t raw;
};
template <>
struct Tuple4<uint64_t, 2> {
  uint64x2x4_t raw;
};
template <size_t N>
struct Tuple4<uint64_t, N> {
  uint64x1x4_t raw;
};
template <>
struct Tuple4<int64_t, 2> {
  int64x2x4_t raw;
};
template <size_t N>
struct Tuple4<int64_t, N> {
  int64x1x4_t raw;
};

template <>
struct Tuple4<float32_t, 4> {
  float32x4x4_t raw;
};
template <size_t N>
struct Tuple4<float32_t, N> {
  float32x2x4_t raw;
};
#if HWY_HAVE_FLOAT64
template <>
struct Tuple4<float64_t, 2> {
  float64x2x4_t raw;
};
template <size_t N>
struct Tuple4<float64_t, N> {
  float64x1x4_t raw;
};
#endif  // HWY_HAVE_FLOAT64

template <typename T, size_t N>
struct Raw128;

template <>
struct Raw128<uint8_t, 16> {
  using type = uint8x16_t;
};
template <size_t N>
struct Raw128<uint8_t, N> {
  using type = uint8x8_t;
};

template <>
struct Raw128<uint16_t, 8> {
  using type = uint16x8_t;
};
template <size_t N>
struct Raw128<uint16_t, N> {
  using type = uint16x4_t;
};

template <>
struct Raw128<uint32_t, 4> {
  using type = uint32x4_t;
};
template <size_t N>
struct Raw128<uint32_t, N> {
  using type = uint32x2_t;
};

template <>
struct Raw128<uint64_t, 2> {
  using type = uint64x2_t;
};
template <>
struct Raw128<uint64_t, 1> {
  using type = uint64x1_t;
};

template <>
struct Raw128<int8_t, 16> {
  using type = int8x16_t;
};
template <size_t N>
struct Raw128<int8_t, N> {
  using type = int8x8_t;
};

template <>
struct Raw128<int16_t, 8> {
  using type = int16x8_t;
};
template <size_t N>
struct Raw128<int16_t, N> {
  using type = int16x4_t;
};

template <>
struct Raw128<int32_t, 4> {
  using type = int32x4_t;
};
template <size_t N>
struct Raw128<int32_t, N> {
  using type = int32x2_t;
};

template <>
struct Raw128<int64_t, 2> {
  using type = int64x2_t;
};
template <>
struct Raw128<int64_t, 1> {
  using type = int64x1_t;
};

template <>
struct Raw128<float, 4> {
  using type = float32x4_t;
};
template <size_t N>
struct Raw128<float, N> {
  using type = float32x2_t;
};

#if HWY_HAVE_FLOAT64
template <>
struct Raw128<double, 2> {
  using type = float64x2_t;
};
template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};
#endif  // HWY_HAVE_FLOAT64

#if HWY_NEON_HAVE_F16C

template <>
struct Tuple2<float16_t, 8> {
  float16x8x2_t raw;
};
template <size_t N>
struct Tuple2<float16_t, N> {
  float16x4x2_t raw;
};

template <>
struct Tuple3<float16_t, 8> {
  float16x8x3_t raw;
};
template <size_t N>
struct Tuple3<float16_t, N> {
  float16x4x3_t raw;
};

template <>
struct Tuple4<float16_t, 8> {
  float16x8x4_t raw;
};
template <size_t N>
struct Tuple4<float16_t, N> {
  float16x4x4_t raw;
};

template <>
struct Raw128<float16_t, 8> {
  using type = float16x8_t;
};
template <size_t N>
struct Raw128<float16_t, N> {
  using type = float16x4_t;
};

#else  // !HWY_NEON_HAVE_F16C

template <size_t N>
struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
template <size_t N>
struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
template <size_t N>
struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
template <size_t N>
struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};

#endif  // HWY_NEON_HAVE_F16C

#if HWY_NEON_HAVE_BFLOAT16

template <>
struct Tuple2<bfloat16_t, 8> {
  bfloat16x8x2_t raw;
};
template <size_t N>
struct Tuple2<bfloat16_t, N> {
  bfloat16x4x2_t raw;
};

template <>
struct Tuple3<bfloat16_t, 8> {
  bfloat16x8x3_t raw;
};
template <size_t N>
struct Tuple3<bfloat16_t, N> {
  bfloat16x4x3_t raw;
};

template <>
struct Tuple4<bfloat16_t, 8> {
  bfloat16x8x4_t raw;
};
template <size_t N>
struct Tuple4<bfloat16_t, N> {
  bfloat16x4x4_t raw;
};

template <>
struct Raw128<bfloat16_t, 8> {
  using type = bfloat16x8_t;
};
template <size_t N>
struct Raw128<bfloat16_t, N> {
  using type = bfloat16x4_t;
};

#else  // !HWY_NEON_HAVE_BFLOAT16

template <size_t N>
struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
template <size_t N>
struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
template <size_t N>
struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
template <size_t N>
struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};

#endif  // HWY_NEON_HAVE_BFLOAT16

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
public:
  using Raw = typename detail::Raw128<T, N>::type;
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() {}
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;
  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
  // Arm C Language Extensions return and expect unsigned type.
  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

public:
  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  HWY_INLINE Mask128() {}
  Mask128(const Mask128&) = default;
  Mask128& operator=(const Mask128&) = default;
  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}

  Raw raw;
};

template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Set

namespace detail {
// We want to route any combination of N/kPow2 to the intrinsics depending on
// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is
// unconditional and currently does not accept inputs (such as whether the
// vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for
// SFINAE. We instead define a private NativeSet which receives a Simd<> whose
// kPow2 has already been folded into its N.
#define HWY_NEON_BUILD_TPL_HWY_SET
#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \
  Simd<type##_t, size, 0> /* tag */, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET t

HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
#endif
HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)

template <class D, HWY_NEON_IF_EMULATED_D(D)>
HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
}

#undef HWY_NEON_BUILD_TPL_HWY_SET
#undef HWY_NEON_BUILD_RET_HWY_SET
#undef HWY_NEON_BUILD_PARAM_HWY_SET
#undef HWY_NEON_BUILD_ARG_HWY_SET

}  // namespace detail

// Full vector. Cannot yet use VFromD because that is defined in terms of Set.
// Do not use a typename T = TFromD<D> argument because T will be deduced from
// the actual argument type, which can differ from TFromD<D>.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
HWY_INLINE Vec128<TFromD<D>> Set(D /* tag */, T t) {
  return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
}

// Partial vector: create 64-bit and return wrapper.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
HWY_API Vec128<TFromD<D>, MaxLanes(D())> Set(D /* tag */, T t) {
  const Full64<TFromD<D>> dfull;
  return Vec128<TFromD<D>, MaxLanes(D())>(
      detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Default ctor also works for bfloat16_t and float16_t.
  return Set(d, TFromD<D>{});
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

template <class D>
HWY_API VFromD<D> Undefined(D /*tag*/) {
  VFromD<D> v;
  return v;
}

HWY_DIAGNOSTICS(pop)

#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
namespace detail {

#pragma pack(push, 1)

template <class T>
struct alignas(8) Vec64ValsWrapper {
  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
  T vals[8 / sizeof(T)];
};

#pragma pack(pop)

}  // namespace detail
#endif  // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
      static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
      static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
                 {t0, t1, t2, t3, t4, t5, t6, t7}})));
#endif
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
#endif
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI32RawVectType raw = {static_cast<int32_t>(t0),
                                 static_cast<int32_t>(t1)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(d,
                       Set(Full64<uint64_t>(),
                           BitCastScalar<uint64_t>(
                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
#endif
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccF32RawVectType raw = {t0, t1};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(d,
                       Set(Full64<uint64_t>(),
                           BitCastScalar<uint64_t>(
                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
  return Set(d, t0);
}

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d,
                 Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
                                     t8, t9, t10, t11, t12, t13, t14, t15),
                 Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
                                     t2, t3, t4, t5, t6, t7));
#endif
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
                 Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
#endif
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI32RawVectType raw = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
                 Dup128VecFromValues(dh, t0, t1, t0, t1));
#endif
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF32RawVectType raw = {t0, t1, t2, t3};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
                 Dup128VecFromValues(dh, t0, t1, t0, t1));
#endif
}

template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
                                 static_cast<int64_t>(t1)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Set(dh, t1), Set(dh, t0));
#endif
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF64RawVectType raw = {t0, t1};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Set(dh, t1), Set(dh, t0));
#endif
}
#endif

// Generic for all vector lengths
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
}
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
      static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
}
#else
// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}
#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C

namespace detail {

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(
      d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
      TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
      TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
      TFromD<D>{15});
}

template <class D, HWY_IF_UI16_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
                             TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
                             TFromD<D>{6}, TFromD<D>{7});
}

template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
                                        uint16_t{0x4000}, uint16_t{0x4200},
                                        uint16_t{0x4400}, uint16_t{0x4500},
                                        uint16_t{0x4600}, uint16_t{0x4700}));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
                             TFromD<D>{3});
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
}

#if HWY_COMPILER_MSVC
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>);
  constexpr uint64_t kU64MaskOutMask =
      hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>();

  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto mask_out_mask =
      BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask))));
  return v & mask_out_mask;
}
template <class V, HWY_IF_V_SIZE_GT_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  return v;
}
#endif

}  // namespace detail

template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  const auto result_iota =
      detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
#if HWY_COMPILER_MSVC
  return detail::MaskOutIota(result_iota);
#else
  return result_iota;
#endif
}

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ Combine

// Full result
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> Combine(D /* tag */, Vec64<uint8_t> hi,
                                Vec64<uint8_t> lo) {
  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> Combine(D /* tag */, Vec64<uint16_t> hi,
                                 Vec64<uint16_t> lo) {
  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> Combine(D /* tag */, Vec64<uint32_t> hi,
                                 Vec64<uint32_t> lo) {
  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> Combine(D /* tag */, Vec64<uint64_t> hi,
                                 Vec64<uint64_t> lo) {
  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
}

template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> Combine(D /* tag */, Vec64<int8_t> hi,
                               Vec64<int8_t> lo) {
  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> Combine(D /* tag */, Vec64<int16_t> hi,
                                Vec64<int16_t> lo) {
  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> Combine(D /* tag */, Vec64<int32_t> hi,
                                Vec64<int32_t> lo) {
  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
                                Vec64<int64_t> lo) {
  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
}

#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
  return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
}
#endif  // HWY_HAVE_FLOAT16

#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
  return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
  const RebindToUnsigned<D> du;
  const Half<decltype(du)> duh;
  return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> Combine(D /* tag */, Vec64<float> hi, Vec64<float> lo) {
  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
                               Vec64<double> lo) {
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
}
#endif  // HWY_HAVE_FLOAT64

// ------------------------------ BitCast

namespace detail {

// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
// vreinterpret*_u8_*() set of functions.
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
  return v;
}

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
                                 HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
                                HWY_CAST_TO_U8)

HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)

#if !HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_F16C
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
                                             HWY_CAST_TO_U8)
#else
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
#endif  // HWY_NEON_HAVE_F16C
#endif  // !HWY_HAVE_FLOAT16

#if !HWY_NEON_HAVE_BFLOAT16
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
#endif  // !HWY_NEON_HAVE_BFLOAT16

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8

template <class D, HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
  return v;
}

// 64-bit or less:

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vreinterpret_s8_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u32_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s32_u8(v.raw));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
}

// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
  return VFromD<D>(vreinterpret_f16_u8(v.raw));
#else
  const RebindToUnsigned<D> du;
  return VFromD<D>(BitCastFromByte(du, v).raw);
#endif
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
#if HWY_NEON_HAVE_BFLOAT16
  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
#else
  const RebindToUnsigned<D> du;
  return VFromD<D>(BitCastFromByte(du, v).raw);
#endif
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_f32_u8(v.raw));
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<double>(vreinterpret_f64_u8(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// 128-bit full:

template <class D, HWY_IF_I8_D(D)>
HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
#else
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
#endif
}

template <class D, HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
#if HWY_NEON_HAVE_BFLOAT16
  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
#else
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
#endif
}

}  // namespace detail

template <class D, class FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ ResizeBitCast

// <= 8 byte vector to <= 8 byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
}

// 16-byte vector to 16-byte vector: same as BitCast
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// 16-byte vector to <= 8-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const DFromV<decltype(v)> d_from;
  const Half<decltype(d_from)> dh_from;
  return ResizeBitCast(d, LowerHalf(dh_from, v));
}

// <= 8-bit vector to 16-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Full64<TFromV<FromV>> d_full64_from;
  const Full128<TFromV<FromV>> d_full128_from;
  return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
                            ResizeBitCast(d_full64_from, v)));
}

// ------------------------------ GetLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)

#undef HWY_NEON_BUILD_TPL_HWY_GET
#undef HWY_NEON_BUILD_RET_HWY_GET
#undef HWY_NEON_BUILD_PARAM_HWY_GET
#undef HWY_NEON_BUILD_ARG_HWY_GET

}  // namespace detail

template <class V>
HWY_API TFromV<V> GetLane(const V v) {
  return detail::GetLane<0>(v);
}

// ------------------------------ ExtractLane

// Requires one overload per vector length because GetLane<3> is a compile error
// if v is a uint32x2_t.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return detail::GetLane<0>(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
      case 8:
        return detail::GetLane<8>(v);
      case 9:
        return detail::GetLane<9>(v);
      case 10:
        return detail::GetLane<10>(v);
      case 11:
        return detail::GetLane<11>(v);
      case 12:
        return detail::GetLane<12>(v);
      case 13:
        return detail::GetLane<13>(v);
      case 14:
        return detail::GetLane<14>(v);
      case 15:
        return detail::GetLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// ------------------------------ InsertLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
  Vec128<type##_t, size> v, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)

#undef HWY_NEON_BUILD_TPL_HWY_INSERT
#undef HWY_NEON_BUILD_RET_HWY_INSERT
#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
#undef HWY_NEON_BUILD_ARG_HWY_INSERT

template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
HWY_API V InsertLane(const V v, TFromD<D> t) {
  const D d;
  const RebindToUnsigned<D> du;
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
}

}  // namespace detail

// Requires one overload per vector length because InsertLane<3> may be a
// compile error.

template <typename T>
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  return Set(DFromV<decltype(v)>(), t);
}

template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// ================================================== ARITHMETIC

// ------------------------------ Addition
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)

// ------------------------------ Subtraction
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)

// ------------------------------ SumsOf8

HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
}
HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
}
HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
  return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
}
HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
  return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
}

// ------------------------------ SumsOf2
namespace detail {

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.30 Sekunden ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.