diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2024-05-07 23:15:48 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2024-05-07 23:15:48 +0000 |
commit | c7caefd0df41378801e48aaa3cda1270c312762a (patch) | |
tree | 306c083dbb84bd7ee8529aaa3d2aad2c5b6c9dc2 | |
parent | 910c1c3005863b02f71278fb61028acd8f399e51 (diff) | |
parent | 59addd2f7e65b35643dbe541cbd7a20d0b5e90df (diff) | |
download | binary_translation-c7caefd0df41378801e48aaa3cda1270c312762a.tar.gz |
Snap for 11812660 from 59addd2f7e65b35643dbe541cbd7a20d0b5e90df to sdk-release
Change-Id: I66aac65e8d03033faad94ce229a165b5195108ff
-rw-r--r-- | assembler/include/berberis/assembler/common_x86.h | 6 | ||||
-rw-r--r-- | assembler/include/berberis/assembler/x86_32.h | 2 | ||||
-rw-r--r-- | assembler/include/berberis/assembler/x86_64.h | 13 | ||||
-rw-r--r-- | interpreter/riscv64/interpreter.h | 112 | ||||
-rw-r--r-- | interpreter/riscv64/interpreter_test.cc | 1107 | ||||
-rw-r--r-- | intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h | 9 | ||||
-rw-r--r-- | kernel_api/riscv64/open_emulation.cc | 37 | ||||
-rw-r--r-- | tests/inline_asm_tests/Android.bp | 15 | ||||
-rw-r--r-- | tests/inline_asm_tests/main_riscv64.cc | 290 | ||||
-rw-r--r-- | tests/run_host_tests.mk | 23 |
10 files changed, 1347 insertions, 267 deletions
diff --git a/assembler/include/berberis/assembler/common_x86.h b/assembler/include/berberis/assembler/common_x86.h index 86453678..c67ce55a 100644 --- a/assembler/include/berberis/assembler/common_x86.h +++ b/assembler/include/berberis/assembler/common_x86.h @@ -785,13 +785,9 @@ inline void AssemblerX86<Assembler>::Xchgl(Register dest, Register src) { Register other = Assembler::IsAccumulator(src) ? dest : src; EmitInstruction<Opcodes<0x90>>(Register32Bit(other)); } else { - // Clang 8 (after r330298) swaps these two arguments. We are comparing output + // Clang 8 (after r330298) puts dest before src. We are comparing output // to clang in exhaustive test thus we want to match clang behavior exactly. -#if __clang_major__ >= 8 EmitInstruction<Opcodes<0x87>>(Register32Bit(dest), Register32Bit(src)); -#else - EmitInstruction<Opcodes<0x87>>(Register32Bit(src), Register32Bit(dest)); -#endif } } diff --git a/assembler/include/berberis/assembler/x86_32.h b/assembler/include/berberis/assembler/x86_32.h index 40e87a2f..cde5c682 100644 --- a/assembler/include/berberis/assembler/x86_32.h +++ b/assembler/include/berberis/assembler/x86_32.h @@ -183,7 +183,7 @@ class Assembler : public AssemblerX86<Assembler> { // Make sure only type void* can be passed to function below, not Label* or any other type. template <typename T> - auto Jmp(Condition cc, T* target) -> void = delete; + auto Jmp(T* target) -> void = delete; void Jmp(const void* target) { Emit8(0xe9); diff --git a/assembler/include/berberis/assembler/x86_64.h b/assembler/include/berberis/assembler/x86_64.h index ba343f86..c66cc1c7 100644 --- a/assembler/include/berberis/assembler/x86_64.h +++ b/assembler/include/berberis/assembler/x86_64.h @@ -179,7 +179,7 @@ class Assembler : public AssemblerX86<Assembler> { // Make sure only type void* can be passed to function below, not Label* or any other type. template <typename T> - auto Jmp(Condition cc, T* target) -> void = delete; + auto Jmp(T* target) -> void = delete; void Jmp(const void* target) { // There are no jump instruction with properties we need thus we emulate it. @@ -533,22 +533,15 @@ inline void Assembler::Xchgq(Register dest, Register src) { // We compare output to that from clang and thus want to produce the same code. // 0x48 0x90 is suboptimal encoding for that operation (pure 0x90 does the same // and this is what gcc + gas are producing), but this is what clang <= 8 does. -#if __clang_major__ >= 8 if (IsAccumulator(src) && IsAccumulator(dest)) { Emit8(0x90); - } else -#endif - if (IsAccumulator(src) || IsAccumulator(dest)) { + } else if (IsAccumulator(src) || IsAccumulator(dest)) { Register other = IsAccumulator(src) ? dest : src; EmitInstruction<Opcodes<0x90>>(Register64Bit(other)); } else { - // Clang 8 (after r330298) swaps these two arguments. We are comparing output + // Clang 8 (after r330298) puts dest before src. We are comparing output // to clang in exhaustive test thus we want to match clang behavior exactly. -#if __clang_major__ >= 8 EmitInstruction<Opcodes<0x87>>(Register64Bit(dest), Register64Bit(src)); -#else - EmitInstruction<Opcodes<0x87>>(Register64Bit(src), Register64Bit(dest)); -#endif } } diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h index 0c7bd140..661e4d92 100644 --- a/interpreter/riscv64/interpreter.h +++ b/interpreter/riscv64/interpreter.h @@ -469,7 +469,7 @@ class Interpreter { template <typename ElementType, VectorRegisterGroupMultiplier vlmul> static constexpr size_t GetVlmax() { - constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType)); + constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType); switch (vlmul) { case VectorRegisterGroupMultiplier::k1register: return kElementsCount; @@ -923,8 +923,7 @@ class Interpreter { if (!IsAligned<kIndexRegistersInvolved>(args.idx)) { return Undefined(); } - constexpr size_t kElementsCount = - static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType)); + constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType); alignas(alignof(SIMD128Register)) IndexElementType indexes[kElementsCount * kIndexRegistersInvolved]; memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved); @@ -1040,10 +1039,10 @@ class Interpreter { if (!IsAligned<kNumRegistersInGroup>(dst)) { return Undefined(); } - if (dst + kNumRegistersInGroup * kSegmentSize >= 32) { + if (dst + kNumRegistersInGroup * kSegmentSize > 32) { return Undefined(); } - constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr size_t kElementsCount = 16 / sizeof(ElementType); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) { @@ -1211,7 +1210,7 @@ class Interpreter { auto vma, typename GetElementIndexLambdaType> void OpVectorGather(uint8_t dst, uint8_t src1, GetElementIndexLambdaType GetElementIndex) { - constexpr int kRegistersInvolved = NumberOfRegistersInvolved(vlmul); + constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul); if (!IsAligned<kRegistersInvolved>(dst | src1)) { return Undefined(); } @@ -1219,7 +1218,7 @@ class Interpreter { if (dst < (src1 + kRegistersInvolved) && src1 < (dst + kRegistersInvolved)) { return Undefined(); } - constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr size_t kElementsCount = 16 / sizeof(ElementType); constexpr size_t vlmax = GetVlmax<ElementType, vlmul>(); size_t vstart = GetCsr<CsrName::kVstart>(); @@ -1323,6 +1322,10 @@ class Interpreter { case Decoder::VOpFVfOpcode::kVfsgnjxvf: return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>( args.dst, args.src1, arg2); + case Decoder::VOpFVfOpcode::kVfslide1upvf: + return OpVectorslide1up<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2); + case Decoder::VOpFVfOpcode::kVfslide1downvf: + return OpVectorslide1down<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2); case Decoder::VOpFVfOpcode::kVfmvsf: if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { return Undefined(); @@ -1640,14 +1643,14 @@ class Interpreter { vlmul, vta, vma, - kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2}); + kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2); } else { return OpVectorvs<intrinsics::Vfredusumvs<ElementType>, ElementType, vlmul, vta, vma, - kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2}); + kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2); } case Decoder::VOpFVvOpcode::kVfredosumvs: // 14.3. Vector Single-Width Floating-Point Reduction Instructions: @@ -1658,14 +1661,14 @@ class Interpreter { vlmul, vta, vma, - kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2}); + kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2); } else { return OpVectorvs<intrinsics::Vfredosumvs<ElementType>, ElementType, vlmul, vta, vma, - kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2}); + kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2); } case Decoder::VOpFVvOpcode::kVfminvv: return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>( @@ -1674,10 +1677,10 @@ class Interpreter { // For Vfredmin the identity element is +inf. return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>( args.dst, - args.src1, Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000 : 0x7ff0'0000'0000'0000}>{ - args.src2}); + args.src1}, + args.src2); case Decoder::VOpFVvOpcode::kVfmaxvv: return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>( args.dst, args.src1, args.src2); @@ -1685,10 +1688,10 @@ class Interpreter { // For Vfredmax the identity element is -inf. return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>( args.dst, - args.src1, Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000 : 0xfff0'0000'0000'0000}>{ - args.src2}); + args.src1}, + args.src2); case Decoder::VOpFVvOpcode::kVfsgnjvv: return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>( args.dst, args.src1, args.src2); @@ -2091,6 +2094,20 @@ class Interpreter { case Decoder::VOpIVvOpcode::kVnsrlwv: return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>( args.dst, args.src1, args.src2); + case Decoder::VOpIVvOpcode::kVnclipuwv: + return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingUnsignedType>, + SaturatingUnsignedType, + vlmul, + vta, + vma, + kVxrm>(args.dst, args.src1, args.src2); + case Decoder::VOpIVvOpcode::kVnclipwv: + return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingSignedType>, + SaturatingSignedType, + vlmul, + vta, + vma, + kVxrm>(args.dst, args.src1, args.src2); default: Undefined(); } @@ -2224,6 +2241,20 @@ class Interpreter { case Decoder::VOpIVxOpcode::kVslidedownvx: return OpVectorslidedown<ElementType, vlmul, vta, vma>( args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2)); + case Decoder::VOpIVxOpcode::kVnclipuwx: + return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>, + SaturatingUnsignedType, + vlmul, + vta, + vma, + kVxrm>(args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2)); + case Decoder::VOpIVxOpcode::kVnclipwx: + return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>, + SaturatingSignedType, + vlmul, + vta, + vma, + kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2)); default: Undefined(); } @@ -2267,35 +2298,35 @@ class Interpreter { switch (args.opcode) { case Decoder::VOpMVvOpcode::kVredsumvs: return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>( - args.dst, args.src1, Vec<ElementType{}>{args.src2}); + args.dst, Vec<ElementType{}>{args.src1}, args.src2); case Decoder::VOpMVvOpcode::kVredandvs: return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>( - args.dst, args.src1, Vec<~ElementType{}>{args.src2}); + args.dst, Vec<~ElementType{}>{args.src1}, args.src2); case Decoder::VOpMVvOpcode::kVredorvs: return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>( - args.dst, args.src1, Vec<ElementType{}>{args.src2}); + args.dst, Vec<ElementType{}>{args.src1}, args.src2); case Decoder::VOpMVvOpcode::kVredxorvs: return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>( - args.dst, args.src1, Vec<ElementType{}>{args.src2}); + args.dst, Vec<ElementType{}>{args.src1}, args.src2); case Decoder::VOpMVvOpcode::kVredminuvs: return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>( args.dst, - args.src1, - Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>( - args.src2)); + Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>{ + args.src1}, + args.src2); case Decoder::VOpMVvOpcode::kVredminvs: return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>( args.dst, - args.src1, - Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src2}); + Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src1}, + args.src2); case Decoder::VOpMVvOpcode::kVredmaxuvs: return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>( - args.dst, args.src1, Vec<UnsignedType{}>{args.src2}); + args.dst, Vec<UnsignedType{}>{args.src1}, args.src2); case Decoder::VOpMVvOpcode::kVredmaxvs: return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>( args.dst, - args.src1, - Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src2}); + Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src1}, + args.src2); case Decoder::VOpMVvOpcode::kVaadduvv: return OpVectorvv<intrinsics::Vaaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>( args.dst, args.src1, args.src2); @@ -2635,8 +2666,7 @@ class Interpreter { if (!IsAligned<kIndexRegistersInvolved>(args.idx)) { return Undefined(); } - constexpr size_t kElementsCount = - static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType)); + constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType); alignas(alignof(SIMD128Register)) IndexElementType indexes[kElementsCount * kIndexRegistersInvolved]; memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved); @@ -2704,7 +2734,7 @@ class Interpreter { if (data + kNumRegistersInGroup * kSegmentSize > 32) { return Undefined(); } - constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr size_t kElementsCount = 16 / sizeof(ElementType); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) { @@ -2959,7 +2989,7 @@ class Interpreter { if (!IsAligned<kRegistersInvolved>(dst | src)) { return Undefined(); } - constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr size_t kElementsCount = 16 / sizeof(ElementType); size_t vstart = GetCsr<CsrName::kVstart>(); SetCsr<CsrName::kVstart>(0); // The usual property that no elements are written if vstart >= vl does not apply to these @@ -3124,7 +3154,7 @@ class Interpreter { auto vma, CsrName... kExtraCsrs, auto kDefaultElement> - void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) { + void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) { return OpVectorvs<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), @@ -3140,8 +3170,8 @@ class Interpreter { auto vma, CsrName... kExtraCsrs, auto kDefaultElement> - void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) { - if (!IsAligned<kRegistersInvolved>(dst | src2.start_no)) { + void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) { + if (!IsAligned<kRegistersInvolved>(dst | src1.start_no)) { return Undefined(); } size_t vstart = GetCsr<CsrName::kVstart>(); @@ -3155,15 +3185,15 @@ class Interpreter { return; } auto mask = GetMaskForVectorOperations<vma>(); - ElementType arg1 = SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0); + ElementType init = SIMD128Register{state_->cpu.v[src2]}.Get<ElementType>(0); for (size_t index = 0; index < kRegistersInvolved; ++index) { - arg1 = std::get<0>( + init = std::get<0>( Intrinsic(GetCsr<kExtraCsrs>()..., - arg1, - GetVectorArgument<ElementType, vta, vma>(src2, vstart, vl, index, mask))); + init, + GetVectorArgument<ElementType, vta, vma>(src1, vstart, vl, index, mask))); } SIMD128Register result{state_->cpu.v[dst]}; - result.Set(arg1, 0); + result.Set(init, 0); result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1)); state_->cpu.v[dst] = result.Get<__uint128_t>(); } @@ -3593,8 +3623,8 @@ class Interpreter { if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) { return Undefined(); } - int vstart = GetCsr<CsrName::kVstart>(); - int vl = GetCsr<CsrName::kVl>(); + size_t vstart = GetCsr<CsrName::kVstart>(); + size_t vl = GetCsr<CsrName::kVl>(); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc index 2feb230d..1cbdfcb2 100644 --- a/interpreter/riscv64/interpreter_test.cc +++ b/interpreter/riscv64/interpreter_test.cc @@ -1224,7 +1224,7 @@ class Riscv64InterpreterTest : public ::testing::Test { // instructions that work with double width floats. // These instructions never use float registers though and thus we don't need to store // anything into f1 register, if they are used. - // For Float32/Float64 case we load 1.0 of the appropriate type into f1. + // For Float32/Float64 case we load 5.625 of the appropriate type into f1. ASSERT_LE(vsew, 3); if (vsew == 2) { SetFReg<1>(state_.cpu, 0xffff'ffff'40b4'0000); // float 5.625 @@ -1341,19 +1341,7 @@ class Riscv64InterpreterTest : public ::testing::Test { (kTestVectorInstructionMode == TestVectorInstructionMode::kWidening), 8, expected_result, - [] { - if constexpr (sizeof(ElementType) == sizeof(Int8)) { - return kMaskInt8; - } else if constexpr (sizeof(ElementType) == sizeof(Int16)) { - return kMaskInt16; - } else if constexpr (sizeof(ElementType) == sizeof(Int32)) { - return kMaskInt32; - } else if constexpr (sizeof(ElementType) == sizeof(Int64)) { - return kMaskInt64; - } else { - static_assert(kDependentTypeFalse<ElementType>); - } - }()), + MaskForElem<ElementType>()), Verify((insn_bytes & ~(0x01f00000 * (kTestVectorInstructionMode == TestVectorInstructionMode::kVMerge))) | (1 << 25), @@ -1528,11 +1516,9 @@ class Riscv64InterpreterTest : public ::testing::Test { } } - template <bool kIsMasked, typename ElementType> + template <typename ElementType> auto MaskForElem() { - if constexpr (!kIsMasked) { - return kNoMask; - } else if constexpr (std::is_same_v<ElementType, uint8_t>) { + if constexpr (std::is_same_v<ElementType, uint8_t>) { return kMaskInt8; } else if constexpr (std::is_same_v<ElementType, uint16_t>) { return kMaskInt16; @@ -1545,6 +1531,15 @@ class Riscv64InterpreterTest : public ::testing::Test { } } + template <bool kIsMasked, typename ElementType> + auto MaskForElemIfMasked() { + if constexpr (!kIsMasked) { + return kNoMask; + } else { + return MaskForElem<ElementType>(); + } + } + template <bool kIsMasked> void TestVectorIota(uint32_t insn_bytes, const uint8_t (&expected_result_int8)[8][16], @@ -1662,7 +1657,7 @@ class Riscv64InterpreterTest : public ::testing::Test { (Verify(insn_bytes, BitUtilLog2(sizeof(ElementType)), expected_result, - MaskForElem<kIsMasked, ElementType>()), + MaskForElemIfMasked<kIsMasked, ElementType>()), ...); } @@ -1931,6 +1926,49 @@ class Riscv64InterpreterTest : public ::testing::Test { } } + void TestVectorFloatPermutationInstruction(uint32_t insn_bytes, + const uint32_t (&expected_result_int32)[8][4], + const uint64_t (&expected_result_int64)[8][2], + const __v2du (&source)[16], + uint8_t vlmul, + uint64_t skip = 0, + bool ignore_vma_for_last = false, + bool last_elem_is_f1 = false) { + TestVectorPermutationInstruction<TestVectorInstructionKind::kFloat>(insn_bytes, + source, + vlmul, + skip, + ignore_vma_for_last, + last_elem_is_f1, + /* regx1 */ 0x0, + expected_result_int32, + expected_result_int64); + } + + void TestVectorPermutationInstruction(uint32_t insn_bytes, + const uint8_t (&expected_result_int8)[8][16], + const uint16_t (&expected_result_int16)[8][8], + const uint32_t (&expected_result_int32)[8][4], + const uint64_t (&expected_result_int64)[8][2], + const __v2du (&source)[16], + uint8_t vlmul, + uint64_t regx1 = 0x0, + uint64_t skip = 0, + bool ignore_vma_for_last = false, + bool last_elem_is_x1 = false) { + TestVectorPermutationInstruction<TestVectorInstructionKind::kInteger>(insn_bytes, + source, + vlmul, + skip, + ignore_vma_for_last, + last_elem_is_x1, + regx1, + expected_result_int8, + expected_result_int16, + expected_result_int32, + expected_result_int64); + } + // Unlike regular arithmetic instructions, the result of a permutation // instruction depends also on vlmul. Also, the vslideup specs mention that // the destination vector remains unchanged the first |offset| elements (in @@ -1940,21 +1978,23 @@ class Riscv64InterpreterTest : public ::testing::Test { // // If |ignore_vma_for_last| is true, an inactive element at vl-1 will be // treated as if vma=0 (Undisturbed). - // If |last_elem_is_x1| is true, the last element of the vector in + // If |last_elem_is_reg1| is true, the last element of the vector in // expected_result (that is, at vl-1) will be expected to be the same as // |regx1| when VL < VMAX and said element is active. - void TestVectorPermutationInstruction(uint32_t insn_bytes, - const __v16qu (&expected_result_int8)[8], - const __v8hu (&expected_result_int16)[8], - const __v4su (&expected_result_int32)[8], - const __v2du (&expected_result_int64)[8], - const __v2du (&source)[16], - uint8_t vlmul, - uint64_t regx1 = 0x0, - uint64_t skip = 0, - bool ignore_vma_for_last = false, - bool last_elem_is_x1 = false) { - auto Verify = [this, &source, vlmul, regx1, skip, ignore_vma_for_last, last_elem_is_x1]( + template <TestVectorInstructionKind kTestVectorInstructionKind, + typename... ElementType, + size_t... kResultsCount, + size_t... kElementCount> + void TestVectorPermutationInstruction( + uint32_t insn_bytes, + const __v2du (&source)[16], + uint8_t vlmul, + uint64_t skip, + bool ignore_vma_for_last, + bool last_elem_is_reg1, + uint64_t regx1, + const ElementType (&... expected_result)[kResultsCount][kElementCount]) { + auto Verify = [this, &source, vlmul, regx1, skip, ignore_vma_for_last, last_elem_is_reg1]( uint32_t insn_bytes, uint8_t vsew, const auto& expected_result_raw, @@ -1965,8 +2005,24 @@ class Riscv64InterpreterTest : public ::testing::Test { for (size_t index = 0; index < std::size(source); ++index) { state_.cpu.v[16 + index] = SIMD128Register{source[index]}.Get<__uint128_t>(); } - // Set x1 for vx instructions. - SetXReg<1>(state_.cpu, regx1); + + if constexpr (kTestVectorInstructionKind == TestVectorInstructionKind::kFloat) { + UNUSED(regx1); + // We only support Float32/Float64 for float instructions, but there are conversion + // instructions that work with double width floats. + // These instructions never use float registers though and thus we don't need to store + // anything into f1 register, if they are used. + // For Float32/Float64 case we load 5.625 of the appropriate type into f1. + ASSERT_LE(vsew, 3); + if (vsew == 2) { + SetFReg<1>(state_.cpu, 0xffff'ffff'40b4'0000); // float 5.625 + } else if (vsew == 3) { + SetFReg<1>(state_.cpu, 0x4016'8000'0000'0000); // double 5.625 + } + } else { + // Set x1 for vx instructions. + SetXReg<1>(state_.cpu, regx1); + } const size_t kElementSize = 1 << vsew; size_t num_regs = 1 << vlmul; @@ -2037,7 +2093,7 @@ class Riscv64InterpreterTest : public ::testing::Test { expected_result[index] = SIMD128Register{expected_result_raw[index]}; } - if (vlmul == 2 && last_elem_is_x1) { + if (vlmul == 2 && last_elem_is_reg1) { switch (kElementSize) { case 1: expected_result[last_reg].template Set<uint8_t>( @@ -2048,12 +2104,22 @@ class Riscv64InterpreterTest : public ::testing::Test { static_cast<uint16_t>(GetXReg<1>(state_.cpu)), last_elem); break; case 4: - expected_result[last_reg].template Set<uint32_t>( - static_cast<uint32_t>(GetXReg<1>(state_.cpu)), last_elem); + if constexpr (kTestVectorInstructionKind == TestVectorInstructionKind::kFloat) { + expected_result[last_reg].template Set<uint32_t>( + static_cast<uint32_t>(GetFReg<1>(state_.cpu)), last_elem); + } else { + expected_result[last_reg].template Set<uint32_t>( + static_cast<uint32_t>(GetXReg<1>(state_.cpu)), last_elem); + } break; case 8: - expected_result[last_reg].template Set<uint64_t>( - static_cast<uint64_t>(GetXReg<1>(state_.cpu)), last_elem); + if constexpr (kTestVectorInstructionKind == TestVectorInstructionKind::kFloat) { + expected_result[last_reg].template Set<uint64_t>( + static_cast<uint64_t>(GetFReg<1>(state_.cpu)), last_elem); + } else { + expected_result[last_reg].template Set<uint64_t>( + static_cast<uint64_t>(GetXReg<1>(state_.cpu)), last_elem); + } break; default: FAIL() << "Element size is " << kElementSize; @@ -2114,16 +2180,12 @@ class Riscv64InterpreterTest : public ::testing::Test { } }; - // Some instructions don't support use of mask register, but in these instructions bit - // #25 is set. Test it and skip masking tests if so. - Verify(insn_bytes, 0, expected_result_int8, kMaskInt8); - Verify(insn_bytes, 1, expected_result_int16, kMaskInt16); - Verify(insn_bytes, 2, expected_result_int32, kMaskInt32); - Verify(insn_bytes, 3, expected_result_int64, kMaskInt64); - Verify(insn_bytes | (1 << 25), 0, expected_result_int8, kNoMask); - Verify(insn_bytes | (1 << 25), 1, expected_result_int16, kNoMask); - Verify(insn_bytes | (1 << 25), 2, expected_result_int32, kNoMask); - Verify(insn_bytes | (1 << 25), 3, expected_result_int64, kNoMask); + // Test with and without masking enabled. + (Verify( + insn_bytes, BitUtilLog2(sizeof(ElementType)), expected_result, MaskForElem<ElementType>()), + ...); + (Verify(insn_bytes | (1 << 25), BitUtilLog2(sizeof(ElementType)), expected_result, kNoMask), + ...); } protected: @@ -2662,6 +2724,69 @@ TEST_F(Riscv64InterpreterTest, TestRNU) { {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}}, kVectorCalculationsSource); + + TestNarrowingVectorInstruction(0xb900c457, // Vnclipu.wx v8, v16, x1, v0.t + {{32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40}, + {40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48}, + {48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56}, + {56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7}, + {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf}, + {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7}, + {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd00c457, // Vnclip.wx v8, v16, x1, v0.t + {{224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232}, + {232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 240}, + {240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248}, + {248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 0}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}}, + {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7}, + {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf}, + {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7}, + {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xb90c0457, // Vnclipu.wv v8, v16, v24, v0.t + {{255, 255, 255, 255, 69, 35, 9, 2, 255, 255, 255, 255, 153, 39, 10, 2}, + {255, 255, 255, 255, 85, 43, 11, 3, 255, 255, 255, 255, 185, 47, 12, 3}, + {255, 255, 255, 255, 101, 51, 13, 3, 255, 255, 255, 255, 217, 55, 14, 3}, + {255, 255, 255, 255, 117, 59, 15, 4, 255, 255, 255, 255, 249, 63, 16, 4}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x000a}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000c}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000e}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x0010}}, + {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xa726'a525, 0x0057'9757, 0x0000'5b9b, 0x0000'00bf}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xe766'e565, 0x0077'b777, 0x0000'7bbb, 0x0000'00ff}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd0c0457, // Vnclip.wv v8, v16, v24, v0.t + {{128, 128, 128, 128, 197, 227, 249, 254, 128, 128, 128, 128, 153, 231, 250, 254}, + {128, 128, 128, 128, 213, 235, 251, 255, 128, 128, 128, 128, 185, 239, 252, 255}, + {128, 128, 128, 128, 229, 243, 253, 255, 128, 128, 128, 128, 217, 247, 254, 255}, + {128, 128, 128, 158, 245, 251, 255, 0, 128, 128, 128, 222, 249, 255, 0, 0}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfffa}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffc}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffe}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0x0000}}, + {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xa726'a525, 0xffd7'9757, 0xffff'db9b, 0xffff'ffbf}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xe766'e565, 0xfff7'b777, 0xffff'fbbb, 0xffff'ffff}}, + kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestRNE) { @@ -2974,6 +3099,69 @@ TEST_F(Riscv64InterpreterTest, TestRNE) { {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}}, kVectorCalculationsSource); + + TestNarrowingVectorInstruction(0xb900c457, // Vnclipu.wx v8, v16, x1, v0.t + {{32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40}, + {40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48}, + {48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56}, + {56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7}, + {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf}, + {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7}, + {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd00c457, // Vnclip.wx v8, v16, x1, v0.t + {{224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232}, + {232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 240}, + {240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248}, + {248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 0}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}}, + {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7}, + {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf}, + {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7}, + {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xb90c0457, // Vnclipu.wv v8, v16, v24, v0.t + {{255, 255, 255, 255, 69, 35, 9, 2, 255, 255, 255, 255, 153, 39, 10, 2}, + {255, 255, 255, 255, 85, 43, 11, 3, 255, 255, 255, 255, 185, 47, 12, 3}, + {255, 255, 255, 255, 101, 51, 13, 3, 255, 255, 255, 255, 217, 55, 14, 3}, + {255, 255, 255, 255, 117, 59, 15, 4, 255, 255, 255, 255, 249, 63, 16, 4}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x000a}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000c}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000e}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x0010}}, + {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xa726'a525, 0x0057'9757, 0x0000'5b9b, 0x0000'00bf}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xe766'e565, 0x0077'b777, 0x0000'7bbb, 0x0000'00ff}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd0c0457, // Vnclip.wv v8, v16, v24, v0.t + {{128, 128, 128, 128, 197, 227, 249, 254, 128, 128, 128, 128, 153, 231, 250, 254}, + {128, 128, 128, 128, 213, 235, 251, 255, 128, 128, 128, 128, 185, 239, 252, 255}, + {128, 128, 128, 128, 229, 243, 253, 255, 128, 128, 128, 128, 217, 247, 254, 255}, + {128, 128, 128, 158, 245, 251, 255, 0, 128, 128, 128, 222, 249, 255, 0, 0}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfffa}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffc}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffe}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0x0000}}, + {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xa726'a525, 0xffd7'9757, 0xffff'db9b, 0xffff'ffbf}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xe766'e565, 0xfff7'b777, 0xffff'fbbb, 0xffff'ffff}}, + kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestRDN) { @@ -3286,6 +3474,69 @@ TEST_F(Riscv64InterpreterTest, TestRDN) { {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}}, kVectorCalculationsSource); + + TestNarrowingVectorInstruction(0xb900c457, // Vnclipu.wx v8, v16, x1, v0.t + {{32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39}, + {40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47}, + {48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55}, + {56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7}, + {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf}, + {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7}, + {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd00c457, // Vnclip.wx v8, v16, x1, v0.t + {{224, 224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231}, + {232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239}, + {240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247}, + {248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}}, + {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7}, + {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf}, + {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7}, + {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xb90c0457, // Vnclipu.wv v8, v16, v24, v0.t + {{255, 255, 255, 255, 68, 34, 8, 2, 255, 255, 255, 255, 153, 38, 9, 2}, + {255, 255, 255, 255, 84, 42, 10, 2, 255, 255, 255, 255, 185, 46, 11, 2}, + {255, 255, 255, 255, 100, 50, 12, 3, 255, 255, 255, 255, 217, 54, 13, 3}, + {255, 255, 255, 255, 116, 58, 14, 3, 255, 255, 255, 255, 249, 62, 15, 3}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x0009}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000b}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000d}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x000f}}, + {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xa726'a524, 0x0057'9756, 0x0000'5b9b, 0x0000'00bf}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xe766'e564, 0x0077'b776, 0x0000'7bbb, 0x0000'00ff}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd0c0457, // Vnclip.wv v8, v16, v24, v0.t + {{128, 128, 128, 128, 196, 226, 248, 254, 128, 128, 128, 128, 153, 230, 249, 254}, + {128, 128, 128, 128, 212, 234, 250, 254, 128, 128, 128, 128, 185, 238, 251, 254}, + {128, 128, 128, 128, 228, 242, 252, 255, 128, 128, 128, 128, 217, 246, 253, 255}, + {128, 128, 128, 157, 244, 250, 254, 255, 128, 128, 128, 221, 249, 254, 255, 255}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfff9}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffb}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffd}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0xffff}}, + {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xa726'a524, 0xffd7'9756, 0xffff'db9b, 0xffff'ffbf}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xe766'e564, 0xfff7'b776, 0xffff'fbbb, 0xffff'ffff}}, + kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestROD) { @@ -3598,6 +3849,69 @@ TEST_F(Riscv64InterpreterTest, TestROD) { {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}}, kVectorCalculationsSource); + + TestNarrowingVectorInstruction(0xb900c457, // Vnclipu.wx v8, v16, x1, v0.t + {{33, 33, 33, 33, 35, 35, 35, 35, 37, 37, 37, 37, 39, 39, 39, 39}, + {41, 41, 41, 41, 43, 43, 43, 43, 45, 45, 45, 45, 47, 47, 47, 47}, + {49, 49, 49, 49, 51, 51, 51, 51, 53, 53, 53, 53, 55, 55, 55, 55}, + {57, 57, 57, 57, 59, 59, 59, 59, 61, 61, 61, 61, 63, 63, 63, 63}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7}, + {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf}, + {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7}, + {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd00c457, // Vnclip.wx v8, v16, x1, v0.t + {{225, 225, 225, 225, 227, 227, 227, 227, 229, 229, 229, 229, 231, 231, 231, 231}, + {233, 233, 233, 233, 235, 235, 235, 235, 237, 237, 237, 237, 239, 239, 239, 239}, + {241, 241, 241, 241, 243, 243, 243, 243, 245, 245, 245, 245, 247, 247, 247, 247}, + {249, 249, 249, 249, 251, 251, 251, 251, 253, 253, 253, 253, 255, 255, 255, 255}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}}, + {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7}, + {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf}, + {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7}, + {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xb90c0457, // Vnclipu.wv v8, v16, v24, v0.t + {{255, 255, 255, 255, 69, 35, 9, 3, 255, 255, 255, 255, 153, 39, 9, 3}, + {255, 255, 255, 255, 85, 43, 11, 3, 255, 255, 255, 255, 185, 47, 11, 3}, + {255, 255, 255, 255, 101, 51, 13, 3, 255, 255, 255, 255, 217, 55, 13, 3}, + {255, 255, 255, 255, 117, 59, 15, 3, 255, 255, 255, 255, 249, 63, 15, 3}}, + {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x0009}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000b}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000d}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x000f}}, + {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xa726'a525, 0x0057'9757, 0x0000'5b9b, 0x0000'00bf}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}, + {0xe766'e565, 0x0077'b777, 0x0000'7bbb, 0x0000'00ff}}, + kVectorCalculationsSource); + + TestNarrowingVectorInstruction( + 0xbd0c0457, // Vnclip.wv v8, v16, v24, v0.t + {{128, 128, 128, 128, 197, 227, 249, 255, 128, 128, 128, 128, 153, 231, 249, 255}, + {128, 128, 128, 128, 213, 235, 251, 255, 128, 128, 128, 128, 185, 239, 251, 255}, + {128, 128, 128, 128, 229, 243, 253, 255, 128, 128, 128, 128, 217, 247, 253, 255}, + {128, 128, 128, 157, 245, 251, 255, 255, 128, 128, 128, 221, 249, 255, 255, 255}}, + {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfff9}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffb}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffd}, + {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0xffff}}, + {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xa726'a525, 0xffd7'9757, 0xffff'db9b, 0xffff'ffbf}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0xe766'e565, 0xfff7'b777, 0xffff'fbbb, 0xffff'ffff}}, + kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVlXreXX) { @@ -10061,284 +10375,568 @@ TEST_F(Riscv64InterpreterTest, TestVfsgnj) { TEST_F(Riscv64InterpreterTest, TestVredsum) { TestVectorReductionInstruction( - 0x10c2457, // vredsum.vs v8,v16,v24,v0.t + 0x1882457, // vredsum.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {242, 228, 200, 144, /* unused */ 0, 146, 44, 121}, // expected_result_vd0_int16 {0x0172, 0x82e4, 0x88c8, 0xa090, /* unused */ 0, 0x1300, 0xa904, 0xe119}, // expected_result_vd0_int32 - {0xcb44'b932, 0x9407'71e4, 0xa70e'64c8, 0xd312'5090, /* unused */ 0, /* unused */ 0, - 0x1907'1300, 0xb713'ad09}, + {0xcb44'b932, + 0x9407'71e4, + 0xa70e'64c8, + 0xd312'5090, + /* unused */ 0, + /* unused */ 0, + 0x1907'1300, + 0xb713'ad09}, // expected_result_vd0_int64 - {0xb32f'a926'9f1b'9511, 0x1f99'0d88'fb74'e962, 0xb92c'970e'74e8'52c4, 0xef4e'ad14'6aca'2888, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x2513'1f0e'1907'1300}, + {0xb32f'a926'9f1b'9511, + 0x1f99'0d88'fb74'e962, + 0xb92c'970e'74e8'52c4, + 0xef4e'ad14'6aca'2888, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x2513'1f0e'1907'1300}, // expected_result_vd0_with_mask_int8 {39, 248, 142, 27, /* unused */ 0, 0, 154, 210}, // expected_result_vd0_with_mask_int16 {0x5f45, 0xc22f, 0x99d0, 0x98bf, /* unused */ 0, 0x1300, 0x1300, 0x4b15}, // expected_result_vd0_with_mask_int32 - {0x2d38'1f29, 0x99a1'838a, 0x1989'ef5c, 0x9cf4'4aa1, /* unused */ 0, /* unused */ 0, - 0x1907'1300, 0x1907'1300}, + {0x2d38'1f29, + 0x99a1'838a, + 0x1989'ef5c, + 0x9cf4'4aa1, + /* unused */ 0, + /* unused */ 0, + 0x1907'1300, + 0x1907'1300}, // expected_result_vd0_with_mask_int64 - {0x2513'1f0e'1907'1300, 0x917c'8370'7560'6751, 0x4e56'3842'222a'0c13, 0xc833'9e0e'73df'49b5, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x2513'1f0e'1907'1300}, + {0x2513'1f0e'1907'1300, + 0x917c'8370'7560'6751, + 0x4e56'3842'222a'0c13, + 0xc833'9e0e'73df'49b5, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x2513'1f0e'1907'1300}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVfredosum) { - TestVectorReductionInstruction( - 0xd0c1457, // vfredosum.vs v8, v16, v24, v0.t - // expected_result_vd0_int32 - {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9e0c'9a8e}, - // expected_result_vd0_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, - // expected_result_vd0_with_mask_int32 - {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9604'9200}, - // expected_result_vd0_with_mask_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, - kVectorCalculationsSource); + TestVectorReductionInstruction(0xd881457, // vfredosum.vs v8, v24, v16, v0.t + // expected_result_vd0_int32 + {0x9e0c'9a8e, + 0xbe2c'bace, + 0xfe6c'fb4e, + 0x7e6b'fc4d, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9e0c'9a8e}, + // expected_result_vd0_int64 + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xfe6c'fa69'f664'f260, + 0x7eec'5def'0cee'0dee, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, + // expected_result_vd0_with_mask_int32 + {0x9604'929d, + 0xbe2c'ba29, + 0xfe6c'fb4e, + 0x7e6b'fa84, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9604'9200}, + // expected_result_vd0_with_mask_int64 + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xee7c'ea78'e674'e271, + 0x6efc'4e0d'ee0d'ee0f, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, + kVectorCalculationsSource); } // Currently Vfredusum is implemented as Vfredosum (as explicitly permitted by RVV 1.0). // If we would implement some speedups which would change results then we may need to alter tests. TEST_F(Riscv64InterpreterTest, TestVfredusum) { - TestVectorReductionInstruction( - 0x50c1457, // vfredusum.vs v8, v16, v24, v0.t - // expected_result_vd0_int32 - {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9e0c'9a8e}, - // expected_result_vd0_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, - // expected_result_vd0_with_mask_int32 - {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9604'9200}, - // expected_result_vd0_with_mask_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, - kVectorCalculationsSource); + TestVectorReductionInstruction(0x5881457, // vfredusum.vs v8, v24, v16, v0.t + // expected_result_vd0_int32 + {0x9e0c'9a8e, + 0xbe2c'bace, + 0xfe6c'fb4e, + 0x7e6b'fc4d, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9e0c'9a8e}, + // expected_result_vd0_int64 + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xfe6c'fa69'f664'f260, + 0x7eec'5def'0cee'0dee, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, + // expected_result_vd0_with_mask_int32 + {0x9604'929d, + 0xbe2c'ba29, + 0xfe6c'fb4e, + 0x7e6b'fa84, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9604'9200}, + // expected_result_vd0_with_mask_int64 + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xee7c'ea78'e674'e271, + 0x6efc'4e0d'ee0d'ee0f, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, + kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredand) { TestVectorReductionInstruction( - 0x50c2457, // vredand.vs v8,v16,v24,v0.t + 0x5882457, // vredand.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {0, 0, 0, 0, /* unused */ 0, 0, 0, 0}, // expected_result_vd0_int16 {0x8000, 0x8000, 0x8000, 0x0000, /* unused */ 0, 0x8000, 0x8000, 0x8000}, // expected_result_vd0_int32 - {0x8200'8000, 0x8200'8000, 0x8200'8000, 0x0200'0000, /* unused */ 0, /* unused */ 0, - 0x8200'8000, 0x8200'8000}, + {0x8200'8000, + 0x8200'8000, + 0x8200'8000, + 0x0200'0000, + /* unused */ 0, + /* unused */ 0, + 0x8200'8000, + 0x8200'8000}, // expected_result_vd0_int64 - {0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x0604'0000'0200'0000, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8604'8000'8200'8000}, + {0x8604'8000'8200'8000, + 0x8604'8000'8200'8000, + 0x8604'8000'8200'8000, + 0x0604'0000'0200'0000, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8604'8000'8200'8000}, // expected_result_vd0_with_mask_int8 {0, 0, 0, 0, /* unused */ 0, 0, 0, 0}, // expected_result_vd0_with_mask_int16 {0x8000, 0x8000, 0x8000, 0x0000, /* unused */ 0, 0x8000, 0x8000, 0x8000}, // expected_result_vd0_with_mask_int32 - {0x8200'8000, 0x8200'8000, 0x8200'8000, 0x0200'0000, /* unused */ 0, /* unused */ 0, - 0x8200'8000, 0x8200'8000}, + {0x8200'8000, + 0x8200'8000, + 0x8200'8000, + 0x0200'0000, + /* unused */ 0, + /* unused */ 0, + 0x8200'8000, + 0x8200'8000}, // expected_result_vd0_with_mask_int64 - {0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x0604'0000'0200'0000, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8604'8000'8200'8000}, + {0x8604'8000'8200'8000, + 0x8604'8000'8200'8000, + 0x8604'8000'8200'8000, + 0x0604'0000'0200'0000, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8604'8000'8200'8000}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredor) { TestVectorReductionInstruction( - 0x90c2457, // vredor.vs v8,v16,v24,v0.t + 0x9882457, // vredor.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {159, 191, 255, 255, /* unused */ 0, 146, 150, 159}, // expected_result_vd0_int16 {0x9f1d, 0xbf3d, 0xff7d, 0xfffd, /* unused */ 0, 0x9300, 0x9704, 0x9f0d}, // expected_result_vd0_int32 - {0x9f1e'9b19, 0xbf3e'bb39, 0xff7e'fb79, 0xfffe'fbf9, /* unused */ 0, /* unused */ 0, - 0x9706'9300, 0x9f0e'9b09}, + {0x9f1e'9b19, + 0xbf3e'bb39, + 0xff7e'fb79, + 0xfffe'fbf9, + /* unused */ 0, + /* unused */ 0, + 0x9706'9300, + 0x9f0e'9b09}, // expected_result_vd0_int64 - {0x9f1e'9f1d'9716'9311, 0xbf3e'bf3d'b736'b331, 0xff7e'ff7d'f776'f371, 0xfffe'fffd'f7f6'f3f1, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9f0e'9f0d'9706'9300}, + {0x9f1e'9f1d'9716'9311, + 0xbf3e'bf3d'b736'b331, + 0xff7e'ff7d'f776'f371, + 0xfffe'fffd'f7f6'f3f1, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9f0e'9f0d'9706'9300}, // expected_result_vd0_with_mask_int8 {159, 191, 255, 255, /* unused */ 0, 0, 150, 158}, // expected_result_vd0_with_mask_int16 {0x9f1d, 0xbf3d, 0xff7d, 0xfffd, /* unused */ 0, 0x9300, 0x9300, 0x9f0d}, // expected_result_vd0_with_mask_int32 - {0x9f1e'9b19, 0xbf3e'bb39, 0xff7e'fb79, 0xfffe'fbf9, /* unused */ 0, /* unused */ 0, - 0x9706'9300, 0x9706'9300}, + {0x9f1e'9b19, + 0xbf3e'bb39, + 0xff7e'fb79, + 0xfffe'fbf9, + /* unused */ 0, + /* unused */ 0, + 0x9706'9300, + 0x9706'9300}, // expected_result_vd0_with_mask_int64 - {0x9f0e'9f0d'9706'9300, 0xbf3e'bf3d'b736'b331, 0xff7e'ff7d'f776'f371, 0xfffe'fffd'f7f6'f3f1, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9f0e'9f0d'9706'9300}, + {0x9f0e'9f0d'9706'9300, + 0xbf3e'bf3d'b736'b331, + 0xff7e'ff7d'f776'f371, + 0xfffe'fffd'f7f6'f3f1, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9f0e'9f0d'9706'9300}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredxor) { TestVectorReductionInstruction( - 0xd0c2457, // vredxor.vs v8,v16,v24,v0.t + 0xd882457, // vredxor.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {0, 0, 0, 0, /* unused */ 0, 146, 0, 1}, // expected_result_vd0_int16 {0x8100, 0x8100, 0x8100, 0x8100, /* unused */ 0, 0x1300, 0x8504, 0x8101}, // expected_result_vd0_int32 - {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x8302'8100, /* unused */ 0, /* unused */ 0, - 0x1506'1300, 0x8b0a'8909}, + {0x8302'8100, + 0x8302'8100, + 0x8302'8100, + 0x8302'8100, + /* unused */ 0, + /* unused */ 0, + 0x1506'1300, + 0x8b0a'8909}, // expected_result_vd0_int64 - {0x9716'9515'9312'9111, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x190a'1f0d'1506'1300}, + {0x9716'9515'9312'9111, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x190a'1f0d'1506'1300}, // expected_result_vd0_with_mask_int8 {143, 154, 150, 43, /* unused */ 0, 0, 146, 150}, // expected_result_vd0_with_mask_int16 {0x1f0d, 0xbd3d, 0x9514, 0x8d0d, /* unused */ 0, 0x1300, 0x1300, 0x1705}, // expected_result_vd0_with_mask_int32 - {0x1d0e'1b09, 0x0d1e'0b18, 0xfb7a'f978, 0xab2a'a929, /* unused */ 0, /* unused */ 0, - 0x1506'1300, 0x1506'1300}, + {0x1d0e'1b09, + 0x0d1e'0b18, + 0xfb7a'f978, + 0xab2a'a929, + /* unused */ 0, + /* unused */ 0, + 0x1506'1300, + 0x1506'1300}, // expected_result_vd0_with_mask_int64 - {0x190a'1f0d'1506'1300, 0x091a'0f1c'0516'0311, 0x293a'2f3c'2536'2331, 0x77f6'75f5'73f2'71f1, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x190a'1f0d'1506'1300}, + {0x190a'1f0d'1506'1300, + 0x091a'0f1c'0516'0311, + 0x293a'2f3c'2536'2331, + 0x77f6'75f5'73f2'71f1, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x190a'1f0d'1506'1300}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredminu) { TestVectorReductionInstruction( - 0x110c2457, // vredminu.vs v8,v16,v24,v0.t + 0x11882457, // vredminu.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {0, 0, 0, 0, /* unused */ 0, 0, 0, 0}, // expected_result_vd0_int16 {0x8100, 0x8100, 0x8100, 0x0291, /* unused */ 0, 0x8100, 0x8100, 0x8100}, // expected_result_vd0_int32 - {0x83028100, 0x83028100, 0x83028100, 0x06940291, /* unused */ 0, /* unused */ 0, 0x83028100, + {0x83028100, + 0x83028100, + 0x83028100, + 0x06940291, + /* unused */ 0, + /* unused */ 0, + 0x83028100, 0x83028100}, // expected_result_vd0_int64 - {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x0e9c'0a98'0694'0291, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100}, + {0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x0e9c'0a98'0694'0291, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8706'8504'8302'8100}, // expected_result_vd0_with_mask_int8 {0, 0, 0, 0, /* unused */ 0, 0, 0, 0}, // expected_result_vd0_with_mask_int16 {0x8100, 0x8100, 0x8100, 0x0291, /* unused */ 0, 0x8100, 0x8100, 0x8100}, // expected_result_vd0_with_mask_int32 - {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x0e9c'0a98, /* unused */ 0, /* unused */ 0, - 0x8302'8100, 0x8302'8100}, + {0x8302'8100, + 0x8302'8100, + 0x8302'8100, + 0x0e9c'0a98, + /* unused */ 0, + /* unused */ 0, + 0x8302'8100, + 0x8302'8100}, // expected_result_vd0_with_mask_int64 - {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x1e8c'1a89'1684'1280, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100}, + {0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x1e8c'1a89'1684'1280, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8706'8504'8302'8100}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredmin) { TestVectorReductionInstruction( - 0x150c2457, // vredmin.vs v8,v16,v24,v0.t + 0x15882457, // vredmin.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {130, 130, 130, 128, /* unused */ 0, 146, 146, 146}, // expected_result_vd0_int16 {0x8100, 0x8100, 0x8100, 0x8100, /* unused */ 0, 0x8100, 0x8100, 0x8100}, // expected_result_vd0_int32 - {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x8302'8100, /* unused */ 0, /* unused */ 0, - 0x8302'8100, 0x8302'8100}, + {0x8302'8100, + 0x8302'8100, + 0x8302'8100, + 0x8302'8100, + /* unused */ 0, + /* unused */ 0, + 0x8302'8100, + 0x8302'8100}, // expected_result_vd0_int64 - {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100}, + {0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8706'8504'8302'8100}, // expected_result_vd0_with_mask_int8 {138, 138, 138, 128, /* unused */ 0, 0, 150, 150}, // expected_result_vd0_with_mask_int16 {0x8100, 0x8100, 0x8100, 0x8100, /* unused */ 0, 0x8100, 0x8100, 0x8100}, // expected_result_vd0_with_mask_int32 - {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x8302'8100, /* unused */ 0, /* unused */ 0, - 0x8302'8100, 0x8302'8100}, + {0x8302'8100, + 0x8302'8100, + 0x8302'8100, + 0x8302'8100, + /* unused */ 0, + /* unused */ 0, + 0x8302'8100, + 0x8302'8100}, // expected_result_vd0_with_mask_int64 - {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100}, + {0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8706'8504'8302'8100}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVfredmin) { - TestVectorReductionInstruction( - 0x150c1457, // vfredmin.vs v8, v16, v24, v0.t - // expected_result_vd0_int32 - {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9e0c'9a09}, - // expected_result_vd0_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0xfe6c'fa69'f664'f260, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, - // expected_result_vd0_with_mask_int32 - {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9604'9200}, - // expected_result_vd0_with_mask_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0xee7c'ea78'e674'e271, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, - kVectorCalculationsSource); + TestVectorReductionInstruction(0x15881457, // vfredmin.vs v8, v24, v16, v0.t + // expected_result_vd0_int32 + {0x9e0c'9a09, + 0xbe2c'ba29, + 0xfe6c'fa69, + 0xfe6c'fa69, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9e0c'9a09}, + // expected_result_vd0_int64 + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xfe6c'fa69'f664'f260, + 0xfe6c'fa69'f664'f260, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, + // expected_result_vd0_with_mask_int32 + {0x9604'9200, + 0xbe2c'ba29, + 0xfe6c'fa69, + 0xfe6c'fa69, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9604'9200}, + // expected_result_vd0_with_mask_int64 + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xee7c'ea78'e674'e271, + 0xee7c'ea78'e674'e271, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, + kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredmaxu) { TestVectorReductionInstruction( - 0x190c2457, // vredmaxu.vs v8,v16,v24,v0.t + 0x19882457, // vredmaxu.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {158, 190, 254, 254, /* unused */ 0, 146, 150, 158}, // expected_result_vd0_int16 {0x9e0c, 0xbe2c, 0xfe6c, 0xfe6c, /* unused */ 0, 0x9200, 0x9604, 0x9e0c}, // expected_result_vd0_int32 - {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9e0c'9a09}, + {0x9e0c'9a09, + 0xbe2c'ba29, + 0xfe6c'fa69, + 0xfe6c'fa69, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9e0c'9a09}, // expected_result_vd0_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0xfe6c'fa69'f664'f260, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xfe6c'fa69'f664'f260, + 0xfe6c'fa69'f664'f260, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, // expected_result_vd0_with_mask_int8 {158, 186, 254, 254, /* unused */ 0, 0, 150, 158}, // expected_result_vd0_with_mask_int16 {0x9e0c, 0xba29, 0xfe6c, 0xfe6c, /* unused */ 0, 0x9200, 0x9200, 0x9e0c}, // expected_result_vd0_with_mask_int32 - {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9604'9200}, + {0x9604'9200, + 0xbe2c'ba29, + 0xfe6c'fa69, + 0xfe6c'fa69, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9604'9200}, // expected_result_vd0_with_mask_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0xee7c'ea78'e674'e271, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xee7c'ea78'e674'e271, + 0xee7c'ea78'e674'e271, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVredmax) { TestVectorReductionInstruction( - 0x1d0c2457, // vredmax.vs v8,v16,v24,v0.t + 0x1d882457, // vredmax.vs v8,v24,v16,v0.t // expected_result_vd0_int8 {28, 60, 124, 126, /* unused */ 0, 0, 4, 12}, // expected_result_vd0_int16 {0x9e0c, 0xbe2c, 0xfe6c, 0x7eec, /* unused */ 0, 0x9200, 0x9604, 0x9e0c}, // expected_result_vd0_int32 - {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9e0c'9a09}, + {0x9e0c'9a09, + 0xbe2c'ba29, + 0xfe6c'fa69, + 0x7eec'7ae9, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9e0c'9a09}, // expected_result_vd0_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'7ae9'76e4'72e0, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xfe6c'fa69'f664'f260, + 0x7eec'7ae9'76e4'72e0, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, // expected_result_vd0_with_mask_int8 {24, 52, 124, 126, /* unused */ 0, 0, 4, 4}, // expected_result_vd0_with_mask_int16 {0x9e0c, 0xba29, 0xfe6c, 0x7ae9, /* unused */ 0, 0x9200, 0x9200, 0x9e0c}, // expected_result_vd0_with_mask_int32 - {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0, - 0x9604'9200, 0x9604'9200}, + {0x9604'9200, + 0xbe2c'ba29, + 0xfe6c'fa69, + 0x7eec'7ae9, + /* unused */ 0, + /* unused */ 0, + 0x9604'9200, + 0x9604'9200}, // expected_result_vd0_with_mask_int64 - {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'6af8'66f4'62f1, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200}, + {0x9e0c'9a09'9604'9200, + 0xbe2c'ba29'b624'b220, + 0xee7c'ea78'e674'e271, + 0x6efc'6af8'66f4'62f1, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x9e0c'9a09'9604'9200}, kVectorCalculationsSource); } TEST_F(Riscv64InterpreterTest, TestVfredmax) { - TestVectorReductionInstruction( - 0x1d0c1457, // vfredmax.vs v8, v16, v24, v0.t - // expected_result_vd0_int32 - {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0, - 0x8302'8100, 0x8302'8100}, - // expected_result_vd0_int64 - {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x7eec'7ae9'76e4'72e0, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100}, - // expected_result_vd0_with_mask_int32 - {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0, - 0x8302'8100, 0x8302'8100}, - // expected_result_vd0_with_mask_int64 - {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x6efc'6af8'66f4'62f1, - /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100}, - kVectorCalculationsSource); + TestVectorReductionInstruction(0x1d881457, // vfredmax.vs v8, v24, v16, v0.t + // expected_result_vd0_int32 + {0x8302'8100, + 0x8302'8100, + 0x8302'8100, + 0x7eec'7ae9, + /* unused */ 0, + /* unused */ 0, + 0x8302'8100, + 0x8302'8100}, + // expected_result_vd0_int64 + {0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x7eec'7ae9'76e4'72e0, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8706'8504'8302'8100}, + // expected_result_vd0_with_mask_int32 + {0x8302'8100, + 0x8302'8100, + 0x8302'8100, + 0x7eec'7ae9, + /* unused */ 0, + /* unused */ 0, + 0x8302'8100, + 0x8302'8100}, + // expected_result_vd0_with_mask_int64 + {0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x8706'8504'8302'8100, + 0x6efc'6af8'66f4'62f1, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x8706'8504'8302'8100}, + kVectorCalculationsSource); } // Note that the expected test outputs for v[f]merge.vXm are identical to those for v[f]mv.v.X. @@ -12168,6 +12766,155 @@ TEST_F(Riscv64InterpreterTest, TestVslide1down) { /*last_elem_is_x1=*/true); } +TEST_F(Riscv64InterpreterTest, TestVfslide1up) { + TestVectorFloatInstruction(0x3980d457, // vfslide1up.vf v8, v24, f1, v0.t + {{0x40b4'0000, 0x9604'9200, 0x9e0c'9a09, 0x8614'8211}, + {0x8e1c'8a18, 0xb624'b220, 0xbe2c'ba29, 0xa634'a231}, + {0xae3c'aa38, 0xd644'd240, 0xde4c'da49, 0xc654'c251}, + {0xce5c'ca58, 0xf664'f260, 0xfe6c'fa69, 0xe674'e271}, + {0xee7c'ea78, 0x1684'1280, 0x1e8c'1a89, 0x0694'0291}, + {0x0e9c'0a98, 0x36a4'32a0, 0x3eac'3aa9, 0x26b4'22b1}, + {0x2ebc'2ab8, 0x56c4'52c0, 0x5ecc'5ac9, 0x46d4'42d1}, + {0x4edc'4ad8, 0x76e4'72e0, 0x7eec'7ae9, 0x66f4'62f1}}, + {{0x4016'8000'0000'0000, 0x9e0c'9a09'9604'9200}, + {0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220}, + {0xae3c'aa38'a634'a231, 0xde4c'da49'd644'd240}, + {0xce5c'ca58'c654'c251, 0xfe6c'fa69'f664'f260}, + {0xee7c'ea78'e674'e271, 0x1e8c'1a89'1684'1280}, + {0x0e9c'0a98'0694'0291, 0x3eac'3aa9'36a4'32a0}, + {0x2ebc'2ab8'26b4'22b1, 0x5ecc'5ac9'56c4'52c0}, + {0x4edc'4ad8'46d4'42d1, 0x7eec'7ae9'76e4'72e0}}, + kVectorCalculationsSource); +} + +TEST_F(Riscv64InterpreterTest, TestVfslide1down) { + // Where the element at the top gets inserted will depend on VLMUL so we use + // TestVectorFloatPermutationInstruction instead of TestVectorFloatInstruction. + + // VLMUL = 0 + TestVectorFloatPermutationInstruction( + 0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0x40b4'0000}, {}, {}, {}, {}, {}, {}, {}}, + {{0x8e1c'8a18'8614'8211, 0x4016'8000'0000'0000}, {}, {}, {}, {}, {}, {}, {}}, + kVectorCalculationsSource, + /*vlmul=*/0, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 1 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0xb624'b220}, + {0xbe2c'ba29, 0xa634'a231, 0xae3c'aa38, 0x40b4'0000}, + {}, + {}, + {}, + {}, + {}, + {}}, + {{0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220}, + {0xae3c'aa38'a634'a231, 0x4016'8000'0000'0000}, + {}, + {}, + {}, + {}, + {}, + {}}, + kVectorCalculationsSource, + /*vlmul=*/1, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 2 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0xb624'b220}, + {0xbe2c'ba29, 0xa634'a231, 0xae3c'aa38, 0xd644'd240}, + {0xde4c'da49, 0xc654'c251, 0xce5c'ca58, 0xf664'f260}, + {0xfe6c'fa69, 0xe674'e271, 0xee7c'ea78, 0x40b4'0000}, + {}, + {}, + {}, + {}}, + {{0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220}, + {0xae3c'aa38'a634'a231, 0xde4c'da49'd644'd240}, + {0xce5c'ca58'c654'c251, 0xfe6c'fa69'f664'f260}, + {0xee7c'ea78'e674'e271, 0x4016'8000'0000'0000}, + {}, + {}, + {}, + {}}, + kVectorCalculationsSource, + /*vlmul=*/2, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 3 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0xb624'b220}, + {0xbe2c'ba29, 0xa634'a231, 0xae3c'aa38, 0xd644'd240}, + {0xde4c'da49, 0xc654'c251, 0xce5c'ca58, 0xf664'f260}, + {0xfe6c'fa69, 0xe674'e271, 0xee7c'ea78, 0x1684'1280}, + {0x1e8c'1a89, 0x0694'0291, 0x0e9c'0a98, 0x36a4'32a0}, + {0x3eac'3aa9, 0x26b4'22b1, 0x2ebc'2ab8, 0x56c4'52c0}, + {0x5ecc'5ac9, 0x46d4'42d1, 0x4edc'4ad8, 0x76e4'72e0}, + {0x7eec'7ae9, 0x66f4'62f1, 0x6efc'6af8, 0x40b4'0000}}, + {{0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220}, + {0xae3c'aa38'a634'a231, 0xde4c'da49'd644'd240}, + {0xce5c'ca58'c654'c251, 0xfe6c'fa69'f664'f260}, + {0xee7c'ea78'e674'e271, 0x1e8c'1a89'1684'1280}, + {0x0e9c'0a98'0694'0291, 0x3eac'3aa9'36a4'32a0}, + {0x2ebc'2ab8'26b4'22b1, 0x5ecc'5ac9'56c4'52c0}, + {0x4edc'4ad8'46d4'42d1, 0x7eec'7ae9'76e4'72e0}, + {0x6efc'6af8'66f4'62f1, 0x4016'8000'0000'0000}}, + kVectorCalculationsSource, + /*vlmul=*/3, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 4 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{}, {}, {}, {}, {}, {}, {}, {}}, + {{}, {}, {}, {}, {}, {}, {}, {}}, + kVectorCalculationsSource, + /*vlmul=*/4, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 5 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{}, {}, {}, {}, {}, {}, {}, {}}, + {{}, {}, {}, {}, {}, {}, {}, {}}, + kVectorCalculationsSource, + /*vlmul=*/5, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 6 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{0x40b4'0000}, {}, {}, {}, {}, {}, {}, {}}, + {{}, {}, {}, {}, {}, {}, {}, {}}, + kVectorCalculationsSource, + /*vlmul=*/6, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); + + // VLMUL = 7 + TestVectorFloatPermutationInstruction(0x3d80d457, // vfslide1down.vf v8, v24, f1, v0.t + {{0x9e0c'9a09, 0x40b4'0000}, {}, {}, {}, {}, {}, {}, {}}, + {{0x4016'8000'0000'0000}, {}, {}, {}, {}, {}, {}, {}}, + kVectorCalculationsSource, + /*vlmul=*/7, + /*skip=*/0, + /*ignore_vma_for_last=*/true, + /*last_elem_is_f1=*/true); +} + TEST_F(Riscv64InterpreterTest, TestVwadd) { TestWideningVectorInstruction(0xc50c2457, // vwadd.vv v8,v16,v24,v0.t {{0x0000, 0xff13, 0x0006, 0xff19, 0x000d, 0xff1f, 0x0012, 0xff25}, diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h index 27353bf3..2019aa6b 100644 --- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h +++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h @@ -873,6 +873,11 @@ std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \ , (SIMD128Register src1, ElementType src2), (), (src1, src2)) +#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \ + DEFINE_W_ARITHMETIC_INTRINSIC( \ + Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \ + , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2)) + #define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \ DEFINE_W_ARITHMETIC_INTRINSIC( \ Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \ @@ -1103,6 +1108,10 @@ DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{arg (arg1 >> arg2)) DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2)) +DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV( + clip, + WideType<ElementType>{(std::get<0>( + Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))}) DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX( clip, WideType<ElementType>{(std::get<0>( diff --git a/kernel_api/riscv64/open_emulation.cc b/kernel_api/riscv64/open_emulation.cc index dacf77ad..e2257df1 100644 --- a/kernel_api/riscv64/open_emulation.cc +++ b/kernel_api/riscv64/open_emulation.cc @@ -25,10 +25,7 @@ #include "berberis/kernel_api/tracing.h" -#define GUEST_O_DIRECTORY 00040000 -#define GUEST_O_NOFOLLOW 00100000 -#define GUEST_O_DIRECT 00200000 -#define GUEST_O_LARGEFILE 00400000 +#define GUEST_O_LARGEFILE 00100000 namespace berberis { @@ -55,7 +52,7 @@ namespace berberis { static_assert((O_ACCMODE & ~O_SEARCH) == 00000003); -// These flags should have the same value on all architectures. +// These flags should have the same value on guest and host architectures. static_assert(O_CREAT == 00000100); static_assert(O_EXCL == 00000200); static_assert(O_NOCTTY == 00000400); @@ -65,7 +62,10 @@ static_assert(O_NONBLOCK == 00004000); static_assert(O_DSYNC == 00010000); static_assert(FASYNC == 00020000); static_assert(O_NOATIME == 01000000); +static_assert(O_DIRECTORY == 0200000); +static_assert(O_NOFOLLOW == 00400000); static_assert(O_CLOEXEC == 02000000); +static_assert(O_DIRECT == 040000); static_assert(__O_SYNC == 04000000); static_assert(O_SYNC == (O_DSYNC | __O_SYNC)); static_assert(O_PATH == 010000000); @@ -73,14 +73,13 @@ static_assert(O_PATH == 010000000); namespace { const int kCompatibleOpenFlags = O_ACCMODE | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | - O_NONBLOCK | O_DSYNC | FASYNC | O_NOATIME | O_CLOEXEC | __O_SYNC | - O_PATH; + O_NONBLOCK | O_DSYNC | FASYNC | O_NOATIME | O_DIRECTORY | + O_NOFOLLOW | O_CLOEXEC | O_DIRECT | __O_SYNC | O_PATH; } // namespace int ToHostOpenFlags(int guest_flags) { - const int kIncompatibleGuestOpenFlags = - GUEST_O_DIRECTORY | GUEST_O_NOFOLLOW | GUEST_O_DIRECT | GUEST_O_LARGEFILE; + const int kIncompatibleGuestOpenFlags = GUEST_O_LARGEFILE; int unknown_guest_flags = guest_flags & ~(kCompatibleOpenFlags | kIncompatibleGuestOpenFlags); if (unknown_guest_flags) { @@ -91,15 +90,6 @@ int ToHostOpenFlags(int guest_flags) { int host_flags = guest_flags & ~kIncompatibleGuestOpenFlags; - if (guest_flags & GUEST_O_DIRECTORY) { - host_flags |= O_DIRECTORY; - } - if (guest_flags & GUEST_O_NOFOLLOW) { - host_flags |= O_NOFOLLOW; - } - if (guest_flags & GUEST_O_DIRECT) { - host_flags |= O_DIRECT; - } if (guest_flags & GUEST_O_LARGEFILE) { host_flags |= O_LARGEFILE; } @@ -108,7 +98,7 @@ int ToHostOpenFlags(int guest_flags) { } int ToGuestOpenFlags(int host_flags) { - const int kIncompatibleHostOpenFlags = O_DIRECTORY | O_NOFOLLOW | O_DIRECT | O_LARGEFILE; + const int kIncompatibleHostOpenFlags = O_LARGEFILE; int unknown_host_flags = host_flags & ~(kCompatibleOpenFlags | kIncompatibleHostOpenFlags); if (unknown_host_flags) { @@ -119,15 +109,6 @@ int ToGuestOpenFlags(int host_flags) { int guest_flags = host_flags & ~kIncompatibleHostOpenFlags; - if (host_flags & O_DIRECTORY) { - guest_flags |= GUEST_O_DIRECTORY; - } - if (host_flags & O_NOFOLLOW) { - guest_flags |= GUEST_O_NOFOLLOW; - } - if (host_flags & O_DIRECT) { - guest_flags |= GUEST_O_DIRECT; - } if (host_flags & O_LARGEFILE) { guest_flags |= GUEST_O_LARGEFILE; } diff --git a/tests/inline_asm_tests/Android.bp b/tests/inline_asm_tests/Android.bp index bba729f6..90e082e5 100644 --- a/tests/inline_asm_tests/Android.bp +++ b/tests/inline_asm_tests/Android.bp @@ -48,3 +48,18 @@ cc_test { }, static_executable: true, } + +cc_test { + name: "inline_asm_tests_riscv64", + native_bridge_supported: true, + enabled: false, + arch: { + riscv64: { + enabled: true, + srcs: [ + "main_riscv64.cc", + ], + }, + }, + static_executable: true, +} diff --git a/tests/inline_asm_tests/main_riscv64.cc b/tests/inline_asm_tests/main_riscv64.cc new file mode 100644 index 00000000..694909a4 --- /dev/null +++ b/tests/inline_asm_tests/main_riscv64.cc @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" + +#include <cstdint> +#include <tuple> + +namespace { + +template <typename T> +constexpr T BitUtilLog2(T x) { + return __builtin_ctz(x); +} + +// TODO(b/301577077): Maybe use __uint128_t instead. +// Or provide a more versatile wrapper, that one can easily init, copy and compare. +using __v2du = uint64_t[2]; + +constexpr __v2du kVectorCalculationsSource[16] = { + {0x8706'8504'8302'8100, 0x8f0e'8d0c'8b0a'8908}, + {0x9716'9514'9312'9110, 0x9f1e'9d1c'9b1a'9918}, + {0xa726'a524'a322'a120, 0xaf2e'ad2c'ab2a'a928}, + {0xb736'b534'b332'b130, 0xbf3e'bd3c'bb3a'b938}, + {0xc746'c544'c342'c140, 0xcf4e'cd4c'cb4a'c948}, + {0xd756'd554'd352'd150, 0xdf5e'dd5c'db5a'd958}, + {0xe766'e564'e362'e160, 0xef6e'ed6c'eb6a'e968}, + {0xf776'f574'f372'f170, 0xff7e'fd7c'fb7a'f978}, + + {0x9e0c'9a09'9604'9200, 0x8e1c'8a18'8614'8211}, + {0xbe2c'ba29'b624'b220, 0xae3c'aa38'a634'a231}, + {0xde4c'da49'd644'd240, 0xce5c'ca58'c654'c251}, + {0xfe6c'fa69'f664'f260, 0xee7c'ea78'e674'e271}, + {0x1e8c'1a89'1684'1280, 0x0e9c'0a98'0694'0291}, + {0x3eac'3aa9'36a4'32a0, 0x2ebc'2ab8'26b4'22b1}, + {0x5ecc'5ac9'56c4'52c0, 0x4edc'4ad8'46d4'42d1}, + {0x7eec'7ae9'76e4'72e0, 0x6efc'6af8'66f4'62f1}, +}; + +// Easily recognizable bit pattern for target register. +constexpr __v2du kUndisturbedResult = {0x5555'5555'5555'5555, 0x5555'5555'5555'5555}; +constexpr __v2du kAgnosticResult = {~uint64_t{0U}, ~uint64_t{0U}}; + +// Mask in form suitable for storing in v0 and use in v0.t form. +static constexpr __v2du kMask = {0xd5ad'd6b5'ad6b'b5ad, 0x6af7'57bb'deed'7bb5}; + +using ExecInsnFunc = void (*)(); + +void RunTwoVectorArgsOneRes(ExecInsnFunc exec_insn, + const __v2du* src, + __v2du* res, + uint64_t vtype, + uint64_t vlmax) { + uint64_t vstart, vl; + // Mask register is, unconditionally, v0, and we need 8, 16, or 24 to handle full 8-registers + // inputs thus we use v8..v15 for destination and place sources into v16..v23 and v24..v31. + asm( // Load arguments and undisturbed result. + "vsetvli t0, zero, e64, m8, ta, ma\n\t" + "vle64.v v8, (%[res])\n\t" + "vle64.v v16, (%[src])\n\t" + "addi t0, %[src], 128\n\t" + "vle64.v v24, (t0)\n\t" + // Load mask. + "vsetvli t0, zero, e64, m1, ta, ma\n\t" + "vle64.v v0, (%[mask])\n\t" + // Execute tested instruction. + "vsetvl t0, zero, %[vtype]\n\t" + "jalr %[exec_insn]\n\t" + // Save vstart and vl just after insn execution for checks. + "csrr %[vstart], vstart\n\t" + "csrr %[vl], vl\n\t" + // Store the result. + "vsetvli t0, zero, e64, m8, ta, ma\n\t" + "vse64.v v8, (%[res])\n\t" + : [vstart] "=&r"(vstart), [vl] "=&r"(vl) + : [exec_insn] "r"(exec_insn), + [src] "r"(src), + [res] "r"(res), + [vtype] "r"(vtype), + [mask] "r"(&kMask) + : "t0", + "ra", + "v0", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29", + "v30", + "v31", + "memory"); + // Every vector instruction must set vstart to 0, but shouldn't touch vl. + EXPECT_EQ(vstart, 0); + EXPECT_EQ(vl, vlmax); +} + +template <typename... ExpectedResultType> +void TestVectorReductionInstruction( + ExecInsnFunc exec_insn, + ExecInsnFunc exec_masked_insn, + const __v2du (&source)[16], + std::tuple<const ExpectedResultType (&)[8], + const ExpectedResultType (&)[8]>... expected_result) { + // Each expected_result input to this function is the vd[0] value of the reduction, for each + // of the possible vlmul, i.e. expected_result_vd0_int8[n] = vd[0], int8, no mask, vlmul=n. + // + // As vlmul=4 is reserved, expected_result_vd0_*[4] is ignored. + auto Verify = [&source](ExecInsnFunc exec_insn, + uint8_t vsew, + uint8_t vlmul, + const auto& expected_result) { + for (uint8_t vta = 0; vta < 2; ++vta) { + for (uint8_t vma = 0; vma < 2; ++vma) { + uint64_t vtype = (vma << 7) | (vta << 6) | (vsew << 3) | vlmul; + uint64_t vlmax = 0; + asm("vsetvl %0, zero, %1" : "=r"(vlmax) : "r"(vtype)); + if (vlmax == 0) { + continue; + } + + __v2du result[8]; + // Set undisturbed result vector registers. + for (size_t index = 0; index < 8; ++index) { + memcpy(&result[index], &kUndisturbedResult, sizeof(result[index])); + } + + RunTwoVectorArgsOneRes(exec_insn, &kVectorCalculationsSource[0], &result[0], vtype, vlmax); + + // Reduction instructions are unique in that they produce a scalar + // output to a single vector register as opposed to a register group. + // This allows us to take some short-cuts when validating: + // + // - The mask setting is only useful during computation, as the body + // of the destination is always only element 0, which will always be + // written to, regardless of mask setting. + // - The tail is guaranteed to be 1..VLEN/SEW, so the vlmul setting + // does not affect the elements that the tail policy applies to in the + // destination register. + + // Verify that the destination register holds the reduction in the + // first element and the tail policy applies to the remaining. + __uint128_t expected_result_register; + if (vta) { + memcpy(&expected_result_register, &kAgnosticResult, sizeof(expected_result_register)); + } else { + memcpy(&expected_result_register, &kUndisturbedResult, sizeof(expected_result_register)); + } + size_t vsew_bits = 8 << vsew; + expected_result_register = (expected_result_register >> vsew_bits) << vsew_bits; + expected_result_register |= expected_result; + EXPECT_TRUE(memcmp(&result[0], &expected_result_register, sizeof(result[0])) == 0); + + // Verify all non-destination registers are undisturbed. + for (size_t index = 1; index < 8; ++index) { + EXPECT_TRUE(memcmp(&result[index], &kUndisturbedResult, sizeof(result[index])) == 0); + } + } + } + }; + + for (int vlmul = 0; vlmul < 8; vlmul++) { + ((Verify(exec_insn, + BitUtilLog2(sizeof(ExpectedResultType)), + vlmul, + std::get<0>(expected_result)[vlmul]), + Verify(exec_masked_insn, + BitUtilLog2(sizeof(ExpectedResultType)), + vlmul, + std::get<1>(expected_result)[vlmul])), + ...); + } +} + +void TestVectorReductionInstruction(ExecInsnFunc exec_insn, + ExecInsnFunc exec_masked_insn, + const uint8_t (&expected_result_vd0_int8)[8], + const uint16_t (&expected_result_vd0_int16)[8], + const uint32_t (&expected_result_vd0_int32)[8], + const uint64_t (&expected_result_vd0_int64)[8], + const uint8_t (&expected_result_vd0_with_mask_int8)[8], + const uint16_t (&expected_result_vd0_with_mask_int16)[8], + const uint32_t (&expected_result_vd0_with_mask_int32)[8], + const uint64_t (&expected_result_vd0_with_mask_int64)[8], + const __v2du (&source)[16]) { + TestVectorReductionInstruction( + exec_insn, + exec_masked_insn, + source, + std::tuple<const uint8_t(&)[8], const uint8_t(&)[8]>{expected_result_vd0_int8, + expected_result_vd0_with_mask_int8}, + std::tuple<const uint16_t(&)[8], const uint16_t(&)[8]>{expected_result_vd0_int16, + expected_result_vd0_with_mask_int16}, + std::tuple<const uint32_t(&)[8], const uint32_t(&)[8]>{expected_result_vd0_int32, + expected_result_vd0_with_mask_int32}, + std::tuple<const uint64_t(&)[8], const uint64_t(&)[8]>{expected_result_vd0_int64, + expected_result_vd0_with_mask_int64}); +} + +[[gnu::naked]] void ExecVredsum() { + asm("vredsum.vs v8,v16,v24\n\t" + "ret\n\t"); +} + +[[gnu::naked]] void ExecMaskedVredsum() { + asm("vredsum.vs v8,v16,v24,v0.t\n\t" + "ret\n\t"); +} + +TEST(InlineAsmTestRiscv64, TestVredsum) { + TestVectorReductionInstruction( + ExecVredsum, + ExecMaskedVredsum, + // expected_result_vd0_int8 + {242, 228, 200, 144, /* unused */ 0, 146, 44, 121}, + // expected_result_vd0_int16 + {0x0172, 0x82e4, 0x88c8, 0xa090, /* unused */ 0, 0x1300, 0xa904, 0xe119}, + // expected_result_vd0_int32 + {0xcb44'b932, + 0x9407'71e4, + 0xa70e'64c8, + 0xd312'5090, + /* unused */ 0, + /* unused */ 0, + 0x1907'1300, + 0xb713'ad09}, + // expected_result_vd0_int64 + {0xb32f'a926'9f1b'9511, + 0x1f99'0d88'fb74'e962, + 0xb92c'970e'74e8'52c4, + 0xef4e'ad14'6aca'2888, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x2513'1f0e'1907'1300}, + // expected_result_vd0_with_mask_int8 + {39, 248, 142, 27, /* unused */ 0, 0, 154, 210}, + // expected_result_vd0_with_mask_int16 + {0x5f45, 0xc22f, 0x99d0, 0x98bf, /* unused */ 0, 0x1300, 0x1300, 0x4b15}, + // expected_result_vd0_with_mask_int32 + {0x2d38'1f29, + 0x99a1'838a, + 0x1989'ef5c, + 0x9cf4'4aa1, + /* unused */ 0, + /* unused */ 0, + 0x1907'1300, + 0x1907'1300}, + // expected_result_vd0_with_mask_int64 + {0x2513'1f0e'1907'1300, + 0x917c'8370'7560'6751, + 0x4e56'3842'222a'0c13, + 0xc833'9e0e'73df'49b5, + /* unused */ 0, + /* unused */ 0, + /* unused */ 0, + 0x2513'1f0e'1907'1300}, + kVectorCalculationsSource); +} + +} // namespace diff --git a/tests/run_host_tests.mk b/tests/run_host_tests.mk index 29906c6d..ab6bca76 100644 --- a/tests/run_host_tests.mk +++ b/tests/run_host_tests.mk @@ -115,10 +115,29 @@ endef ifeq ($(BUILD_BERBERIS_RISCV64_TO_X86_64),true) -$(eval $(call add_test,berberis_ndk_program_tests,\ +# berberis_ndk_program_tests + +$(eval $(call add_test,berberis_ndk_program_tests_interpret_only,\ + run_test_x86_64_riscv64,\ + $(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\ + BERBERIS_MODE=interpret-only)) + +$(eval $(call add_test,berberis_ndk_program_tests_lite_translate_or_interpret,\ run_test_x86_64_riscv64,\ $(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\ - )) + BERBERIS_MODE=lite-translate-or-interpret)) + +$(eval $(call add_test,berberis_ndk_program_tests_heavy_optimize_or_interpret,\ + run_test_x86_64_riscv64,\ + $(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\ + BERBERIS_MODE=heavy-optimize-or-interpret)) + +$(eval $(call add_test,berberis_ndk_program_tests_two_gear,\ + run_test_x86_64_riscv64,\ + $(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\ + BERBERIS_MODE=two-gear)) + +# berberis_host_tests $(eval $(call add_test,berberis_host_tests,\ run_test,\ |