diff options
author | Victor Khimenko <khim@google.com> | 2024-03-01 21:52:54 +0000 |
---|---|---|
committer | Victor Khimenko <khim@google.com> | 2024-03-02 10:48:41 +0000 |
commit | ab3b04ad98f847abd4110fddadc62180ca898e95 (patch) | |
tree | 0ce238b54b7f4c1b4a0ce037efacc23227443713 | |
parent | dbc3b17a0a8f6c9722a5438dd732d3ba350fd577 (diff) | |
download | binary_translation-ab3b04ad98f847abd4110fddadc62180ca898e95.tar.gz |
Move SetCsr<CsrName::kVstart>(0); to earlier place.
Also redo handling of Vmv<nr>r.v instructions: these instructions
[ab]use immediate to encode number of registers involved and their
handling of vstart is also similar yet different compared to other
instructions.
Test: berberis_all
Change-Id: I94f6b5438f8e309d61660298c396146df7734c1a
-rw-r--r-- | decoder/include/berberis/decoder/riscv64/decoder.h | 2 | ||||
-rw-r--r-- | interpreter/riscv64/interpreter.h | 146 | ||||
-rw-r--r-- | interpreter/riscv64/interpreter_test.cc | 6 |
3 files changed, 83 insertions, 71 deletions
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h index dcb0a259..a150d048 100644 --- a/decoder/include/berberis/decoder/riscv64/decoder.h +++ b/decoder/include/berberis/decoder/riscv64/decoder.h @@ -345,7 +345,7 @@ class Decoder { kVsadduvi = 0b100000, kVsaddvi = 0b100001, kVsllvi = 0b100101, - kVmvvi = 0b100111, + kVmvXrv = 0b100111, kVsrlvi = 0b101000, kVsravi = 0b101001, kVssrlvi = 0b101010, diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h index 8a36a613..1445172f 100644 --- a/interpreter/riscv64/interpreter.h +++ b/interpreter/riscv64/interpreter.h @@ -1352,9 +1352,21 @@ class Interpreter { InactiveProcessing::kUndisturbed>( args.dst, BitCastToUnsigned(SignedType{args.imm}), /*dst_mask=*/args.src); } - case Decoder::VOpIViOpcode::kVmvvi: + case Decoder::VOpIViOpcode::kVmvXrv: + // kVmv<nr>rv instruction if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { - return OpVectorVmvXr<ElementType>(args.dst, args.src, static_cast<uint8_t>(args.imm)); + switch (args.imm) { + case 0: + return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src); + case 1: + return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src); + case 3: + return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src); + case 7: + return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src); + default: + return Unimplemented(); + } } else { return Unimplemented(); } @@ -1986,10 +1998,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -1999,7 +2011,6 @@ class Interpreter { result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <typename ElementType> @@ -2067,24 +2078,19 @@ class Interpreter { void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) { size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); - SIMD128Register arg1(state_->cpu.v[src1]); - SIMD128Register arg2(state_->cpu.v[src2]); - SIMD128Register result; + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } + SIMD128Register arg1(state_->cpu.v[src1]); + SIMD128Register arg2(state_->cpu.v[src2]); + SIMD128Register result; if (vstart > 0) [[unlikely]] { - if (vstart >= vl) [[unlikely]] { - result.Set(state_->cpu.v[dst]); - } else { - const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); - result.Set(state_->cpu.v[dst]); - result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask); - } - SetCsr<CsrName::kVstart>(0); + const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); + result.Set(state_->cpu.v[dst]); + result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask); } else { result = Intrinsic(arg1, arg2); } @@ -2126,40 +2132,36 @@ class Interpreter { state_->cpu.v[dst] = result.Get<__uint128_t>(); } - template <typename ElementType> - void OpVectorVmvXr(uint8_t dst, uint8_t src, uint8_t nf) { - if (!IsPowerOf2(nf + 1)) { - return Unimplemented(); - } - if (((dst | src) & nf) != 0) { + template <typename ElementType, size_t kRegistersInvolved> + void OpVectorVmvXrv(uint8_t dst, uint8_t src) { + if (!IsAligned<kRegistersInvolved>(dst | src)) { return Unimplemented(); } + constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); size_t vstart = GetCsr<CsrName::kVstart>(); + SetCsr<CsrName::kVstart>(0); + // The usual property that no elements are written if vstart >= vl does not apply to these + // instructions. Instead, no elements are written if vstart >= evl. + if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] { + return; + } if (vstart == 0) [[likely]] { - for (size_t index = 0; index <= nf; ++index) { + for (size_t index = 0; index < kRegistersInvolved; ++index) { state_->cpu.v[dst + index] = state_->cpu.v[src + index]; } return; } - constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); - for (size_t index = 0; index <= nf; ++index) { - if (vstart >= kElementsCount) { - vstart -= kElementsCount; - continue; - } - if (vstart == 0) [[likely]] { - state_->cpu.v[dst + index] = state_->cpu.v[src + index]; - } else { - SIMD128Register destination{state_->cpu.v[dst + index]}; - SIMD128Register source{state_->cpu.v[src + index]}; - for (size_t element_index = vstart; element_index < kElementsCount; ++element_index) { - destination.Set(source.Get<ElementType>(element_index), element_index); - } - state_->cpu.v[dst + index] = destination.Get<__uint128_t>(); - vstart = 0; - } + size_t index = vstart / kElementsCount; + SIMD128Register destination{state_->cpu.v[dst + index]}; + SIMD128Register source{state_->cpu.v[src + index]}; + for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount; + ++element_index) { + destination.Set(source.Get<ElementType>(element_index), element_index); + } + state_->cpu.v[dst + index] = destination.Get<__uint128_t>(); + for (index++; index < kRegistersInvolved; ++index) { + state_->cpu.v[dst + index] = state_->cpu.v[src + index]; } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, typename ElementType, VectorRegisterGroupMultiplier vlmul, auto vma> @@ -2176,12 +2178,12 @@ class Interpreter { SIMD128Register original_result(state_->cpu.v[dst]); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); SIMD128Register result_before_vl_masking; // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { result_before_vl_masking = original_result; - SetCsr<CsrName::kVstart>(0); } else { result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>([this, src1, src2](auto index) { @@ -2201,7 +2203,6 @@ class Interpreter { const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); result_before_vl_masking = (original_result & ~start_mask) | (result_before_vl_masking & start_mask); - SetCsr<CsrName::kVstart>(0); } } const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl); @@ -2222,12 +2223,12 @@ class Interpreter { SIMD128Register original_result(state_->cpu.v[dst]); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); SIMD128Register result_before_vl_masking; // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { result_before_vl_masking = original_result; - SetCsr<CsrName::kVstart>(0); } else { result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>([this, src1, arg2](auto index) { @@ -2246,7 +2247,6 @@ class Interpreter { const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); result_before_vl_masking = (original_result & ~start_mask) | (result_before_vl_masking & start_mask); - SetCsr<CsrName::kVstart>(0); } } const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl); @@ -2277,6 +2277,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result{state_->cpu.v[dst + index]}; @@ -2292,7 +2298,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg)), result_mask, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2316,6 +2321,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } int8_t frm = GetCsr<CsrName::kFrm>(); auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { @@ -2325,7 +2336,6 @@ class Interpreter { result, std::get<0>(Intrinsic(frm, arg)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2352,6 +2362,7 @@ class Interpreter { if (vstart != 0) { return Unimplemented(); } + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vl == 0) [[unlikely]] { @@ -2381,7 +2392,6 @@ class Interpreter { result.Set(arg1, 0); result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1)); state_->cpu.v[dst] = result.Get<__uint128_t>(); - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2405,6 +2415,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result{state_->cpu.v[dst + index]}; @@ -2414,7 +2430,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2438,10 +2453,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2453,7 +2468,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2, result)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } // 2*SEW = SEW op SEW @@ -2484,10 +2498,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2507,7 +2521,6 @@ class Interpreter { state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>(); } } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2532,10 +2545,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2546,7 +2559,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } // SEW = 2*SEW op SEW @@ -2580,10 +2592,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2603,7 +2615,6 @@ class Interpreter { orig_result, intrinsic_result, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.template Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } // SEW = 2*SEW op SEW @@ -2638,10 +2649,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2663,7 +2674,6 @@ class Interpreter { orig_result, intrinsic_result, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.template Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2722,6 +2732,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result(state_->cpu.v[dst + index]); @@ -2730,7 +2746,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2, result)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2757,10 +2772,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2777,7 +2792,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -2798,11 +2812,11 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); if (vstart >= vl) [[unlikely]] { // From 16.3: For all of the [slide instructions], if vstart >= vl, the // instruction performs no operation and leaves the destination vector // register unchanged. - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2839,7 +2853,6 @@ class Interpreter { mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -2860,15 +2873,14 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); if (vstart >= vl) [[unlikely]] { // From 16.3: For all of the [slide instructions], if vstart >= vl, the // instruction performs no operation and leaves the destination vector // register unchanged. - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); - for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result(state_->cpu.v[dst + index]); @@ -2890,8 +2902,6 @@ class Interpreter { mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - - SetCsr<CsrName::kVstart>(0); } // Helper function needed to generate bitmak result from non-bitmask inputs. diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc index b6f40839..239c50a0 100644 --- a/interpreter/riscv64/interpreter_test.cc +++ b/interpreter/riscv64/interpreter_test.cc @@ -805,7 +805,7 @@ class Riscv64InterpreterTest : public ::testing::Test { // https://github.com/riscv/riscv-v-spec/pull/872 state_.cpu.vtype = BitUtilLog2(sizeof(ElementType)) << 3; state_.cpu.vl = 0; - constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType)); for (int vstart = 0; vstart <= kElementsCount * kNFfields; ++vstart) { state_.cpu.insn_addr = ToGuestAddr(&insn_bytes); state_.cpu.vstart = vstart; @@ -817,7 +817,9 @@ class Riscv64InterpreterTest : public ::testing::Test { for (int index = 0; index < 8; ++index) { SIMD128Register expected_state{kVectorComparisonSource[index]}; SIMD128Register source_value{kVectorComparisonSource[index + 8]}; - if (index < kNFfields) { + if ((vstart < kElementsCount * kNFfields) && index < kNFfields) { + // The usual property that no elements are written if vstart >= vl does not apply to these + // instructions. Instead, no elements are written if vstart >= evl. for (int element_index = 0; element_index < kElementsCount; ++element_index) { if (element_index + index * kElementsCount >= vstart) { expected_state.Set(source_value.Get<ElementType>(element_index), element_index); |