// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
// HWY_TARGET_INCLUDE must expand to the path of this file: it is consumed by
// "hwy/foreach_target.h", which is included immediately afterwards. The
// definition has to be on a single line; if the path is left on the following
// line the macro expands to nothing and the string literal becomes a stray
// token at file scope.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/shuffle4_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
class TestPer4LaneBlockShuffle {
private:
// Overload selected when D has at most one lane: the descriptor and the
// second vector are ignored and `a` is returned unchanged.
template <class D, HWY_IF_LANES_LE_D(D, 1)>
static HWY_INLINE VFromD<D> InterleaveMaskVectors(D /*d*/, VFromD<D> a,
                                                  VFromD<D> /*b*/) {
  return a;
}
#if HWY_TARGET != HWY_SCALAR
// Overload selected when D has more than one lane: combines the two vectors
// via InterleaveLower(d, a, b). The caller in this file passes the lane-0
// validity vector as `a` and the lane-1 validity vector as `b`.
template <class D, HWY_IF_LANES_GT_D(D, 1)>
static HWY_INLINE VFromD<D> InterleaveMaskVectors(D d, VFromD<D> a,
                                                  VFromD<D> b) {
  const VFromD<D> interleaved = InterleaveLower(d, a, b);
  return interleaved;
}
#endif
// Builds the mask that is applied to the shuffle result before it is compared
// against the expected lanes.
//
// - N >= 4: FirstN(d, N).
// - N < 4:  a signed vector is broadcast to all-ones when idx0 < N (zero
//           otherwise). If N > 1, a second vector is built the same way from
//           idx1 and the two are combined with InterleaveMaskVectors; the
//           result is converted to a mask for D.
template <class D>
static HWY_INLINE Mask<D> Per4LaneBlockShufValidMask(D d, const size_t N,
                                                     const size_t idx1,
                                                     const size_t idx0) {
  // Whole 4-lane blocks: no index can point outside the vector.
  if (N >= 4) return FirstN(d, N);

  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;

  // -1 (all bits set) if the source index is in range, else 0.
  const auto lane0_ok = Set(di, static_cast<TI>(idx0 < N ? -1 : 0));
  if (N <= 1) return RebindMask(d, MaskFromVec(lane0_ok));

  const auto lane1_ok = Set(di, static_cast<TI>(idx1 < N ? -1 : 0));
  const auto combined = InterleaveMaskVectors(di, lane0_ok, lane1_ok);
  return RebindMask(d, MaskFromVec(combined));
}
// TODO(b/287462770): inline to work around incorrect SVE codegen
//
// Computes the expected shuffle output into `expected` and asserts that
// `actual` matches it.
//
// For every block of four lanes starting at `blk`, expected[blk + j] is
// src_lanes[blk + idx_j] for j = 0..3. When N < 4, expected lanes 0 and 1 are
// zeroed if their source index is >= N, and `actual` is masked with
// Per4LaneBlockShufValidMask (via IfThenElseZero) before the comparison.
//
// The loop always writes four entries per block, so both arrays must hold at
// least N rounded up to a multiple of four elements.
template <class D>
static HWY_INLINE void DoCheckPer4LaneBlkShufResult(
    D d, const size_t N, VFromD<D> actual,
    const TFromD<D>* HWY_RESTRICT src_lanes, TFromD<D>* HWY_RESTRICT expected,
    size_t idx3, size_t idx2, size_t idx1, size_t idx0) {
  using T = TFromD<D>;

  for (size_t blk = 0; blk < N; blk += 4) {
    const T* const in = src_lanes + blk;
    T* const out = expected + blk;
    out[0] = in[idx0];
    out[1] = in[idx1];
    out[2] = in[idx2];
    out[3] = in[idx3];
  }

  // Partial vectors: an index that points past the last lane yields zero.
  const bool is_partial = N < 4;
  if (is_partial && idx0 >= N) expected[0] = T{0};
  if (is_partial && idx1 >= N) expected[1] = T{0};

  const auto valid_mask = Per4LaneBlockShufValidMask(d, N, idx1, idx0);
  const auto masked_actual = IfThenElseZero(valid_mask, actual);
  HWY_ASSERT_VEC_EQ(d, expected, masked_actual);
}
#if HWY_TARGET != HWY_SCALAR
// Exercises detail::TblLookupPer4LaneBlkShuf with every 8-bit packed index
// (0..255). Each packed value holds four 2-bit lane indices: bits [1:0] are
// idx0, [3:2] idx1, [5:4] idx2 and [7:6] idx3. The result of each lookup is
// verified with DoCheckPer4LaneBlkShufResult.
template <class D>
static HWY_NOINLINE void TestTblLookupPer4LaneBlkShuf(
    D d, const size_t N, const TFromD<D>* HWY_RESTRICT src_lanes,
    TFromD<D>* HWY_RESTRICT expected) {
  constexpr size_t kNumPatterns = 0x100;
  constexpr size_t kIdxMask = 3;

  const auto v = Load(d, src_lanes);
  for (size_t packed = 0; packed < kNumPatterns; ++packed) {
    const size_t idx0 = packed & kIdxMask;
    const size_t idx1 = (packed >> 2) & kIdxMask;
    const size_t idx2 = (packed >> 4) & kIdxMask;
    const size_t idx3 = (packed >> 6) & kIdxMask;

    const auto shuffled = detail::TblLookupPer4LaneBlkShuf(v, packed);
    DoCheckPer4LaneBlkShufResult(d, N, shuffled, src_lanes, expected, idx3,
                                 idx2, idx1, idx0);
  }
}
#endif
// Runs Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0> on `v` and checks the
// result against values derived from `src_lanes` (see
// DoCheckPer4LaneBlkShufResult). `expected` is scratch storage that is
// overwritten.
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class D>
static HWY_INLINE void DoTestPer4LaneBlkShuffle(
    D d, const size_t N, const VFromD<D> v,
    const TFromD<D>* HWY_RESTRICT src_lanes,
    TFromD<D>* HWY_RESTRICT expected) {
  DoCheckPer4LaneBlkShufResult(
      d, N, Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(v), src_lanes,
      expected, kIdx3, kIdx2, kIdx1, kIdx0);
}
// Stores `v` to `src_lanes` and then checks a set of shuffle patterns on it.
// `src_lanes` serves as the scalar reference input for the checks and
// `expected` is scratch storage overwritten by each check.
//
// On non-scalar targets, all packed indices are first checked through
// TestTblLookupPer4LaneBlkShuf. Afterwards a fixed list of compile-time
// patterns is checked; template arguments are in <idx3, idx2, idx1, idx0>
// order, matching DoTestPer4LaneBlkShuffle.
template <
class D>
static HWY_NOINLINE
void DoTestPer4LaneBlkShuffles(
D d,
const size_t N,
const VecArg<VFromD<D>> v,
TFromD<D>* HWY_RESTRICT src_lanes, TFromD<D>* HWY_RESTRICT expected) {
Store(v, d, src_lanes);
#if HWY_TARGET != HWY_SCALAR
TestTblLookupPer4LaneBlkShuf(d, N, src_lanes, expected);
#endif
// This pattern is always compiled, including on MSVC.
DoTestPer4LaneBlkShuffle<0, 1, 2, 3>(d, N, v, src_lanes, expected);
#if !HWY_COMPILER_MSVC
// The remaining instantiations are skipped on MSVC to speed up MSVC builds.
DoTestPer4LaneBlkShuffle<0, 1, 3, 2>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<0, 2, 3, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<0, 3, 0, 2>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<1, 0, 1, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<1, 0, 3, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<1, 0, 3, 2>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<1, 2, 0, 3>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<1, 2, 1, 3>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<1, 1, 0, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<2, 0, 1, 3>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<2, 0, 2, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<2, 1, 2, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<2, 2, 0, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<2, 3, 0, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<2, 3, 3, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 0, 2, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 1, 0, 3>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 1, 3, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 2, 1, 0>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 2, 3, 2>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 3, 0, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 3, 1, 1>(d, N, v, src_lanes, expected);
DoTestPer4LaneBlkShuffle<3, 3, 2, 2>(d, N, v, src_lanes, expected);
#endif
}
// Test input for non-floating-point lane types: Iota over the unsigned
// counterpart of D, starting at 0x0706050403020101 truncated to the lane
// width, bit-cast back to D.
template <class D>
static HWY_INLINE Vec<D> GenerateTestVect(hwy::NonFloatTag /*tag*/, D d) {
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;

  // Masking with LimitsMax<TU>() keeps only the bits that fit in TU.
  constexpr TU kIotaStart =
      static_cast<TU>(0x0706050403020101u & LimitsMax<TU>());

  const auto unsigned_iota = Iota(du, kIotaStart);
  return BitCast(d, unsigned_iota);
}
// Test input for floating-point lane types.
//
// Starts from Set(d, 1). For lane types wider than 16 bits, the integer test
// pattern (NonFloatTag overload on the unsigned descriptor) is masked so that
// its upper 16 bits are cleared, and the remaining bits are OR-ed into the
// vector. For 16-bit lanes the mask is zero and Set(d, 1) is returned as is.
template <class D>
static HWY_INLINE Vec<D> GenerateTestVect(hwy::FloatTag /*tag*/, D d) {
  using T = TFromD<D>;
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;

  constexpr size_t kNumOfBitsInT = sizeof(T) * 8;
  // All bits except the top 16; zero when T has no more than 16 bits.
  constexpr TU kIntBitsMask =
      (kNumOfBitsInT > 16)
          ? static_cast<TU>(static_cast<TU>(~TU{0}) >> 16)
          : TU{0};

  const auto ones = Set(d, 1);
  if (kIntBitsMask != 0) {
    const auto int_pattern = GenerateTestVect(hwy::NonFloatTag(), du);
    const auto low_bits = And(int_pattern, Set(du, kIntBitsMask));
    return Or(ones, BitCast(d, low_bits));
  }
  return ones;
}
public:
// Test entry point for one lane type T and descriptor D.
//
// Allocates two scratch arrays whose length is Lanes(d) rounded up to a
// multiple of four, zeroes their last four elements, and runs the shuffle
// checks twice: once on the generated test vector and once on the same vector
// with the most significant bit of every lane flipped.
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
  const size_t N = Lanes(d);
  // The checks write whole 4-lane blocks, so round up to a multiple of four.
  const size_t alloc_len = static_cast<size_t>((N + 3) & (~size_t{3}));
  HWY_ASSERT(alloc_len >= 4);

  auto expected = AllocateAligned<T>(alloc_len);
  auto src_lanes = AllocateAligned<T>(alloc_len);
  HWY_ASSERT(expected && src_lanes);

  // Zero the final block of both arrays.
  const T k0 = ConvertScalarTo<T>(0);
  for (size_t i = alloc_len - 4; i < alloc_len; ++i) {
    expected[i] = k0;
    src_lanes[i] = k0;
  }

  const auto v = GenerateTestVect(hwy::IsFloatTag<T>(), d);
  DoTestPer4LaneBlkShuffles(d, N, v, src_lanes.get(), expected.get());

  // Second pass: same input with the top bit of each lane toggled.
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  constexpr TU kMsb = static_cast<TU>(TU{1} << (sizeof(TU) * 8 - 1));
  const auto msb_mask = BitCast(d, Set(du, kMsb));
  DoTestPer4LaneBlkShuffles(d, N, Xor(v, msb_mask), src_lanes.get(),
                            expected.get());
}
};
// Invokes TestPer4LaneBlockShuffle through ForAllTypes, wrapped in
// ForPartialFixedOrFullScalableVectors.
HWY_NOINLINE void TestAllPer4LaneBlockShuffle() {
  using ShuffleTest =
      ForPartialFixedOrFullScalableVectors<TestPer4LaneBlockShuffle>;
  ForAllTypes(ShuffleTest());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}
// namespace HWY_NAMESPACE
}
// namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
// Test-suite registration, guarded by HWY_ONCE above.
// NOTE(review): presumably HWY_ONCE ensures this section is compiled a single
// time rather than once per target - confirm against hwy/foreach_target.h.
namespace hwy {
// Declares the HwyShuffle4Test suite.
HWY_BEFORE_TEST(HwyShuffle4Test);
// Exports TestAllPer4LaneBlockShuffle and adds it to HwyShuffle4Test.
HWY_EXPORT_AND_TEST_P(HwyShuffle4Test, TestAllPer4LaneBlockShuffle);
}
// namespace hwy
#endif