Diffstat (limited to 'interpreter/riscv64/interpreter.h')
-rw-r--r--  interpreter/riscv64/interpreter.h  3489
1 file changed, 3489 insertions(+), 0 deletions(-)
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
new file mode 100644
index 00000000..6680fc0e
--- /dev/null
+++ b/interpreter/riscv64/interpreter.h
@@ -0,0 +1,3489 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "berberis/interpreter/riscv64/interpreter.h"
+
+#include <atomic>
+#include <cfenv>
+#include <cstdint>
+#include <cstring>
+
+#include "berberis/base/bit_util.h"
+#include "berberis/base/checks.h"
+#include "berberis/base/macros.h"
+#include "berberis/decoder/riscv64/decoder.h"
+#include "berberis/decoder/riscv64/semantics_player.h"
+#include "berberis/guest_state/guest_addr.h"
+#include "berberis/guest_state/guest_state.h"
+#include "berberis/intrinsics/guest_fp_flags.h" // ToHostRoundingMode
+#include "berberis/intrinsics/intrinsics.h"
+#include "berberis/intrinsics/intrinsics_float.h"
+#include "berberis/intrinsics/riscv64/vector_intrinsics.h"
+#include "berberis/intrinsics/simd_register.h"
+#include "berberis/intrinsics/type_traits.h"
+#include "berberis/kernel_api/run_guest_syscall.h"
+#include "berberis/runtime_primitives/interpret_helpers.h"
+#include "berberis/runtime_primitives/memory_region_reservation.h"
+#include "berberis/runtime_primitives/recovery_code.h"
+
+#include "faulty_memory_accesses.h"
+#include "regs.h"
+
+namespace berberis {
+
+inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) {
+ if (aq) {
+ if (rl) {
+ return std::memory_order_acq_rel;
+ } else {
+ return std::memory_order_acquire;
+ }
+ } else {
+ if (rl) {
+ return std::memory_order_release;
+ } else {
+ return std::memory_order_relaxed;
+ }
+ }
+}
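+
+// Illustrative sanity checks for the mapping above (the function is constexpr, so these are
+// verified at compile time and carry no runtime cost):
+static_assert(AqRlToStdMemoryOrder(true, true) == std::memory_order_acq_rel);
+static_assert(AqRlToStdMemoryOrder(true, false) == std::memory_order_acquire);
+static_assert(AqRlToStdMemoryOrder(false, true) == std::memory_order_release);
+static_assert(AqRlToStdMemoryOrder(false, false) == std::memory_order_relaxed);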
+
+class Interpreter {
+ public:
+ using CsrName = berberis::CsrName;
+ using Decoder = Decoder<SemanticsPlayer<Interpreter>>;
+ using Register = uint64_t;
+ using FpRegister = uint64_t;
+ using Float32 = intrinsics::Float32;
+ using Float64 = intrinsics::Float64;
+
+ explicit Interpreter(ThreadState* state)
+ : state_(state), branch_taken_(false), exception_raised_(false) {}
+
+ //
+ // Instruction implementations.
+ //
+
+ Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) {
+ switch (opcode) {
+ case Decoder::CsrOpcode::kCsrrs:
+ return arg | csr;
+ case Decoder::CsrOpcode::kCsrrc:
+ return ~arg & csr;
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
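+
+  // A worked illustration (not exhaustive): with arg = 0b0110 and csr = 0b0011, kCsrrs (set
+  // bits) produces 0b0111 while kCsrrc (clear bits) produces 0b0001.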
+
+ Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr) {
+ return UpdateCsr(static_cast<Decoder::CsrOpcode>(opcode), imm, csr);
+ }
+
+  // Note: we prefer not to use C11/C++ atomic_thread_fence or even the gcc/clang builtin
+  // __atomic_thread_fence because all of these functions rely on the fact that the compiler
+  // never uses non-temporal loads and stores, and they only issue “mfence” when sequentially
+  // consistent ordering is requested. They never issue “lfence” or “sfence”.
+  // Instead we take a page from the Linux kernel's book and map read ordering to “lfence”,
+  // write ordering to “sfence”, and read-write ordering to “mfence”.
+  // This can become important in the future if we start using non-temporal moves in manually
+  // written assembly code.
+  // Ordering that affects I/O devices is not relevant to user-space code, thus we simply ignore
+  // the bits related to device I/O.
+ void Fence(Decoder::FenceOpcode /*opcode*/,
+ Register /*src*/,
+ bool sw,
+ bool sr,
+ bool /*so*/,
+ bool /*si*/,
+ bool pw,
+ bool pr,
+ bool /*po*/,
+ bool /*pi*/) {
+ bool read_fence = sr | pr;
+ bool write_fence = sw | pw;
+    // The two types of fences (total store ordering fence and normal fence) are supposed to be
+    // processed differently, but only in the “read_fence && write_fence” case (otherwise the
+    // total store ordering fence degrades into a normal fence, for “forward compatibility”).
+    // Since x86 doesn't distinguish between these two types of fences, and since we are supposed
+    // to map all not-yet-defined fences to the normal fence (again, for “forward compatibility”),
+    // it's Ok to just ignore the opcode field.
+ if (read_fence) {
+ if (write_fence) {
+ asm volatile("mfence" ::: "memory");
+ } else {
+ asm volatile("lfence" ::: "memory");
+ }
+ } else if (write_fence) {
+ asm volatile("sfence" ::: "memory");
+ }
+ }
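+
+  // For illustration, a few common guest fences and where they land under the mapping above:
+  //   fence r, r   (pr=sr=1)       -> lfence
+  //   fence w, w   (pw=sw=1)       -> sfence
+  //   fence rw, rw (pr=pw=sr=sw=1) -> mfence
+  //   fence.tso    (rw, rw)        -> mfence (the opcode field is ignored, see above)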
+
+ void FenceI(Register /*arg*/, int16_t /*imm*/) {
+    // For interpreter-only mode we don't need to do anything here, but once we have a
+    // translator we will need to flush the instruction cache here.
+ }
+
+ template <typename IntType, bool aq, bool rl>
+ Register Lr(int64_t addr) {
+ static_assert(std::is_integral_v<IntType>, "Lr: IntType must be integral");
+ static_assert(std::is_signed_v<IntType>, "Lr: IntType must be signed");
+ CHECK(!exception_raised_);
+ // Address must be aligned on size of IntType.
+ CHECK((addr % sizeof(IntType)) == 0ULL);
+ return MemoryRegionReservation::Load<IntType>(&state_->cpu, addr, AqRlToStdMemoryOrder(aq, rl));
+ }
+
+ template <typename IntType, bool aq, bool rl>
+ Register Sc(int64_t addr, IntType val) {
+ static_assert(std::is_integral_v<IntType>, "Sc: IntType must be integral");
+ static_assert(std::is_signed_v<IntType>, "Sc: IntType must be signed");
+ CHECK(!exception_raised_);
+ // Address must be aligned on size of IntType.
+ CHECK((addr % sizeof(IntType)) == 0ULL);
+ return static_cast<Register>(MemoryRegionReservation::Store<IntType>(
+ &state_->cpu, addr, val, AqRlToStdMemoryOrder(aq, rl)));
+ }
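+
+  // A typical guest-side use of the two primitives above is the classic LR/SC retry loop from
+  // the RISC-V unprivileged spec (sketch; Sc returns 0 on success and non-zero on failure):
+  //   retry:
+  //     lr.w  t0, (a0)      # t0 = *a0, reservation acquired
+  //     add   t1, t0, a1    # compute the new value
+  //     sc.w  t2, t1, (a0)  # *a0 = t1 if the reservation is still held; t2 = 0 on success
+  //     bnez  t2, retry     # otherwise try again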
+
+ Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2) {
+ switch (opcode) {
+ case Decoder::OpOpcode::kAdd:
+ return Int64(arg1) + Int64(arg2);
+ case Decoder::OpOpcode::kSub:
+ return Int64(arg1) - Int64(arg2);
+ case Decoder::OpOpcode::kAnd:
+ return Int64(arg1) & Int64(arg2);
+ case Decoder::OpOpcode::kOr:
+ return Int64(arg1) | Int64(arg2);
+ case Decoder::OpOpcode::kXor:
+ return Int64(arg1) ^ Int64(arg2);
+ case Decoder::OpOpcode::kSll:
+ return Int64(arg1) << Int64(arg2);
+ case Decoder::OpOpcode::kSrl:
+ return UInt64(arg1) >> Int64(arg2);
+ case Decoder::OpOpcode::kSra:
+ return Int64(arg1) >> Int64(arg2);
+ case Decoder::OpOpcode::kSlt:
+ return Int64(arg1) < Int64(arg2) ? 1 : 0;
+ case Decoder::OpOpcode::kSltu:
+ return UInt64(arg1) < UInt64(arg2) ? 1 : 0;
+ case Decoder::OpOpcode::kMul:
+ return Int64(arg1) * Int64(arg2);
+ case Decoder::OpOpcode::kMulh:
+ return NarrowTopHalf(Widen(Int64(arg1)) * Widen(Int64(arg2)));
+ case Decoder::OpOpcode::kMulhsu:
+ return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2))));
+ case Decoder::OpOpcode::kMulhu:
+ return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2)));
+ case Decoder::OpOpcode::kDiv:
+ return Int64(arg1) / Int64(arg2);
+ case Decoder::OpOpcode::kDivu:
+ return UInt64(arg1) / UInt64(arg2);
+ case Decoder::OpOpcode::kRem:
+ return Int64(arg1) % Int64(arg2);
+ case Decoder::OpOpcode::kRemu:
+ return UInt64(arg1) % UInt64(arg2);
+ case Decoder::OpOpcode::kAndn:
+ return Int64(arg1) & (~Int64(arg2));
+ case Decoder::OpOpcode::kOrn:
+ return Int64(arg1) | (~Int64(arg2));
+ case Decoder::OpOpcode::kXnor:
+ return ~(Int64(arg1) ^ Int64(arg2));
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
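+
+  // A worked example for the multiply-high opcodes above: kMulh returns the upper 64 bits of
+  // the full 128-bit product, so Op(kMulh, uint64_t{1} << 32, uint64_t{1} << 32) yields 1,
+  // since 2^32 * 2^32 = 2^64 and the upper half of 2^64 is exactly 1.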
+
+ Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2) {
+ switch (opcode) {
+ case Decoder::Op32Opcode::kAddw:
+ return Widen(TruncateTo<Int32>(arg1) + TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kSubw:
+ return Widen(TruncateTo<Int32>(arg1) - TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kSllw:
+ return Widen(TruncateTo<Int32>(arg1) << TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kSrlw:
+ return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) >> TruncateTo<Int32>(arg2)));
+ case Decoder::Op32Opcode::kSraw:
+ return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kMulw:
+ return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kDivw:
+ return Widen(TruncateTo<Int32>(arg1) / TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kDivuw:
+ return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) / TruncateTo<UInt32>(arg2)));
+ case Decoder::Op32Opcode::kRemw:
+ return Widen(TruncateTo<Int32>(arg1) % TruncateTo<Int32>(arg2));
+ case Decoder::Op32Opcode::kRemuw:
+ return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) % TruncateTo<UInt32>(arg2)));
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
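+
+  // A worked example of the truncate-then-sign-extend behavior above: kAddw truncates both
+  // operands to 32 bits, adds them, and sign-extends the result, so Op32(kAddw, 0x7fffffff, 1)
+  // overflows into the sign bit and yields 0xffffffff80000000.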
+
+ Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset) {
+ void* ptr = ToHostAddr<void>(arg + offset);
+ switch (operand_type) {
+ case Decoder::LoadOperandType::k8bitUnsigned:
+ return Load<uint8_t>(ptr);
+ case Decoder::LoadOperandType::k16bitUnsigned:
+ return Load<uint16_t>(ptr);
+ case Decoder::LoadOperandType::k32bitUnsigned:
+ return Load<uint32_t>(ptr);
+ case Decoder::LoadOperandType::k64bit:
+ return Load<uint64_t>(ptr);
+ case Decoder::LoadOperandType::k8bitSigned:
+ return Load<int8_t>(ptr);
+ case Decoder::LoadOperandType::k16bitSigned:
+ return Load<int16_t>(ptr);
+ case Decoder::LoadOperandType::k32bitSigned:
+ return Load<int32_t>(ptr);
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
+
+ template <typename DataType>
+ FpRegister LoadFp(Register arg, int16_t offset) {
+ static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
+ CHECK(!exception_raised_);
+ DataType* ptr = ToHostAddr<DataType>(arg + offset);
+ FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
+ if (result.is_fault) {
+ exception_raised_ = true;
+ return {};
+ }
+ return result.value;
+ }
+
+ Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm) {
+ switch (opcode) {
+ case Decoder::OpImmOpcode::kAddi:
+ return arg + int64_t{imm};
+ case Decoder::OpImmOpcode::kSlti:
+ return bit_cast<int64_t>(arg) < int64_t{imm} ? 1 : 0;
+ case Decoder::OpImmOpcode::kSltiu:
+ return arg < bit_cast<uint64_t>(int64_t{imm}) ? 1 : 0;
+ case Decoder::OpImmOpcode::kXori:
+        return arg ^ int64_t{imm};
+ case Decoder::OpImmOpcode::kOri:
+ return arg | int64_t{imm};
+ case Decoder::OpImmOpcode::kAndi:
+ return arg & int64_t{imm};
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
+
+ Register Lui(int32_t imm) { return int64_t{imm}; }
+
+ Register Auipc(int32_t imm) {
+ uint64_t pc = state_->cpu.insn_addr;
+ return pc + int64_t{imm};
+ }
+
+ Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm) {
+ switch (opcode) {
+ case Decoder::OpImm32Opcode::kAddiw:
+ return int32_t(arg) + int32_t{imm};
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
+
+ Register Ecall(Register syscall_nr,
+ Register arg0,
+ Register arg1,
+ Register arg2,
+ Register arg3,
+ Register arg4,
+ Register arg5) {
+ CHECK(!exception_raised_);
+ return RunGuestSyscall(syscall_nr, arg0, arg1, arg2, arg3, arg4, arg5);
+ }
+
+ Register Slli(Register arg, int8_t imm) { return arg << imm; }
+
+ Register Srli(Register arg, int8_t imm) { return arg >> imm; }
+
+ Register Srai(Register arg, int8_t imm) { return bit_cast<int64_t>(arg) >> imm; }
+
+ Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm) {
+ switch (opcode) {
+ case Decoder::ShiftImm32Opcode::kSlliw:
+ return int32_t(arg) << int32_t{imm};
+ case Decoder::ShiftImm32Opcode::kSrliw:
+ return bit_cast<int32_t>(uint32_t(arg) >> uint32_t{imm});
+ case Decoder::ShiftImm32Opcode::kSraiw:
+ return int32_t(arg) >> int32_t{imm};
+ default:
+ Unimplemented();
+ return {};
+ }
+ }
+
+ Register Rori(Register arg, int8_t shamt) {
+ CheckShamtIsValid(shamt);
+    return (uint64_t(arg) >> shamt) | (uint64_t(arg) << (64 - shamt));
+ }
+
+ Register Roriw(Register arg, int8_t shamt) {
+ CheckShamt32IsValid(shamt);
+    return int32_t((uint32_t(arg) >> shamt) | (uint32_t(arg) << (32 - shamt)));
+ }
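+
+  // Worked rotation examples: Rori(0xff, 8) rotates the low byte to the top and yields
+  // 0xff00000000000000, while Roriw(0xff, 8) yields 0xff000000, which is then sign-extended
+  // to 0xffffffffff000000.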
+
+ void Store(Decoder::MemoryDataOperandType operand_type,
+ Register arg,
+ int16_t offset,
+ Register data) {
+ void* ptr = ToHostAddr<void>(arg + offset);
+ switch (operand_type) {
+ case Decoder::MemoryDataOperandType::k8bit:
+ Store<uint8_t>(ptr, data);
+ break;
+ case Decoder::MemoryDataOperandType::k16bit:
+ Store<uint16_t>(ptr, data);
+ break;
+ case Decoder::MemoryDataOperandType::k32bit:
+ Store<uint32_t>(ptr, data);
+ break;
+ case Decoder::MemoryDataOperandType::k64bit:
+ Store<uint64_t>(ptr, data);
+ break;
+ default:
+ return Unimplemented();
+ }
+ }
+
+ template <typename DataType>
+ void StoreFp(Register arg, int16_t offset, FpRegister data) {
+ static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
+ CHECK(!exception_raised_);
+ DataType* ptr = ToHostAddr<DataType>(arg + offset);
+ exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
+ }
+
+ void CompareAndBranch(Decoder::BranchOpcode opcode,
+ Register arg1,
+ Register arg2,
+ int16_t offset) {
+ bool cond_value;
+ switch (opcode) {
+ case Decoder::BranchOpcode::kBeq:
+ cond_value = arg1 == arg2;
+ break;
+ case Decoder::BranchOpcode::kBne:
+ cond_value = arg1 != arg2;
+ break;
+ case Decoder::BranchOpcode::kBltu:
+ cond_value = arg1 < arg2;
+ break;
+ case Decoder::BranchOpcode::kBgeu:
+ cond_value = arg1 >= arg2;
+ break;
+ case Decoder::BranchOpcode::kBlt:
+ cond_value = bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2);
+ break;
+ case Decoder::BranchOpcode::kBge:
+ cond_value = bit_cast<int64_t>(arg1) >= bit_cast<int64_t>(arg2);
+ break;
+ default:
+ return Unimplemented();
+ }
+
+ if (cond_value) {
+ Branch(offset);
+ }
+ }
+
+ void Branch(int32_t offset) {
+ CHECK(!exception_raised_);
+ state_->cpu.insn_addr += offset;
+ branch_taken_ = true;
+ }
+
+ void BranchRegister(Register base, int16_t offset) {
+ CHECK(!exception_raised_);
+ state_->cpu.insn_addr = (base + offset) & ~uint64_t{1};
+ branch_taken_ = true;
+ }
+
+ FpRegister Fmv(FpRegister arg) { return arg; }
+
+ //
+ // V extensions.
+ //
+
+ using TailProcessing = intrinsics::TailProcessing;
+ using InactiveProcessing = intrinsics::InactiveProcessing;
+
+ enum class VectorSelectElementWidth {
+ k8bit = 0b000,
+ k16bit = 0b001,
+ k32bit = 0b010,
+ k64bit = 0b011,
+ kMaxValue = 0b111,
+ };
+
+ enum class VectorRegisterGroupMultiplier {
+ k1register = 0b000,
+ k2registers = 0b001,
+ k4registers = 0b010,
+ k8registers = 0b011,
+ kEigthOfRegister = 0b101,
+ kQuarterOfRegister = 0b110,
+ kHalfOfRegister = 0b111,
+ kMaxValue = 0b111,
+ };
+
+ static constexpr size_t NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul) {
+ switch (vlmul) {
+ case VectorRegisterGroupMultiplier::k2registers:
+ return 2;
+ case VectorRegisterGroupMultiplier::k4registers:
+ return 4;
+ case VectorRegisterGroupMultiplier::k8registers:
+ return 8;
+ default:
+ return 1;
+ }
+ }
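+
+  // E.g. NumberOfRegistersInvolved(k4registers) == 4, while all fractional multipliers
+  // (kEigthOfRegister, kQuarterOfRegister, kHalfOfRegister) fall into the default case and
+  // occupy a single register.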
+
+ static constexpr size_t NumRegistersInvolvedForWideOperand(VectorRegisterGroupMultiplier vlmul) {
+ switch (vlmul) {
+ case VectorRegisterGroupMultiplier::k1register:
+ return 2;
+ case VectorRegisterGroupMultiplier::k2registers:
+ return 4;
+ case VectorRegisterGroupMultiplier::k4registers:
+ return 8;
+ default:
+ return 1;
+ }
+ }
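+
+  // E.g. a wide (2*SEW) operand for vlmul == k2registers occupies 4 registers, while the
+  // fractional multipliers fall into the default case since their doubled group still fits
+  // into a single register.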
+
+ template <typename VOpArgs, typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, ExtraArgs... extra_args) {
+ // Note: whole register instructions are not dependent on vtype and are supposed to work even
+ // if vill is set! Handle them before processing other instructions.
+    // Note: other types of loads and stores are not special and are processed as usual.
+ // TODO(khim): Handle vstart properly.
+ if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs>) {
+ if (args.opcode == Decoder::VLUmOpOpcode::kVlXreXX) {
+ if (!IsPowerOf2(args.nf + 1)) {
+ return Unimplemented();
+ }
+ if ((args.dst & args.nf) != 0) {
+ return Unimplemented();
+ }
+ auto [src] = std::tuple{extra_args...};
+ __uint128_t* ptr = bit_cast<__uint128_t*>(src);
+ for (size_t index = 0; index <= args.nf; index++) {
+ state_->cpu.v[args.dst + index] = ptr[index];
+ }
+ return;
+ }
+ }
+
+ if constexpr (std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
+ if (args.opcode == Decoder::VSUmOpOpcode::kVsX) {
+ if (args.width != Decoder::MemoryDataOperandType::k8bit) {
+ return Unimplemented();
+ }
+ if (!IsPowerOf2(args.nf + 1)) {
+ return Unimplemented();
+ }
+ if ((args.data & args.nf) != 0) {
+ return Unimplemented();
+ }
+ auto [src] = std::tuple{extra_args...};
+ __uint128_t* ptr = bit_cast<__uint128_t*>(src);
+ for (size_t index = 0; index <= args.nf; index++) {
+ ptr[index] = state_->cpu.v[args.data + index];
+ }
+ return;
+ }
+ }
+
+    // The RISC-V V extension uses the 8-bit vtype CSR as an “opcode extension” to make sure the
+    // 32-bit encoding remains usable.
+    //
+    // Great care is taken to ensure that vector code wouldn't need to change the vtype CSR often
+    // (e.g. there are special mask instructions which allow one to manipulate masks without the
+    // need to change the CPU mode).
+    //
+    // Currently we don't have support for multiple CPU modes in Berberis, thus we can only
+    // handle these instructions in the interpreter.
+    //
+    // TODO(b/300690740): develop and implement a strategy which would allow us to support vector
+    // intrinsics not just in the interpreter. Move code from this function to the semantics
+    // player.
+ Register vtype = GetCsr<CsrName::kVtype>();
+ if (static_cast<std::make_signed_t<Register>>(vtype) < 0) {
+ return Unimplemented();
+ }
+ if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
+ switch (args.width) {
+ case Decoder::MemoryDataOperandType::k8bit:
+ return OpVector<UInt8>(args, vtype, extra_args...);
+ case Decoder::MemoryDataOperandType::k16bit:
+ return OpVector<UInt16>(args, vtype, extra_args...);
+ case Decoder::MemoryDataOperandType::k32bit:
+ return OpVector<UInt32>(args, vtype, extra_args...);
+ case Decoder::MemoryDataOperandType::k64bit:
+ return OpVector<UInt64>(args, vtype, extra_args...);
+ default:
+ return Unimplemented();
+ }
+ } else {
+ VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0x7);
+ if constexpr (std::is_same_v<VOpArgs, Decoder::VOpFVfArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VOpFVvArgs>) {
+ switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
+ case VectorSelectElementWidth::k16bit:
+ if constexpr (sizeof...(extra_args) == 0) {
+ return OpVector<intrinsics::Float16>(args, vlmul, vtype);
+ } else {
+ return Unimplemented();
+ }
+ case VectorSelectElementWidth::k32bit:
+ return OpVector<Float32>(
+ args,
+ vlmul,
+ vtype,
+ std::get<0>(intrinsics::UnboxNan<Float32>(bit_cast<Float64>(extra_args)))...);
+ case VectorSelectElementWidth::k64bit:
+ // Note: if arguments are 64bit floats then we don't need to do any unboxing.
+ return OpVector<Float64>(args, vlmul, vtype, bit_cast<Float64>(extra_args)...);
+ default:
+ return Unimplemented();
+ }
+ } else {
+ switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
+ case VectorSelectElementWidth::k8bit:
+ return OpVector<UInt8>(args, vlmul, vtype, extra_args...);
+ case VectorSelectElementWidth::k16bit:
+ return OpVector<UInt16>(args, vlmul, vtype, extra_args...);
+ case VectorSelectElementWidth::k32bit:
+ return OpVector<UInt32>(args, vlmul, vtype, extra_args...);
+ case VectorSelectElementWidth::k64bit:
+ return OpVector<UInt64>(args, vlmul, vtype, extra_args...);
+ default:
+ return Unimplemented();
+ }
+ }
+ }
+ }
+
+ template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
+ auto vemul = Decoder::SignExtend<3>(vtype & 0b111);
+ vemul -= ((vtype >> 3) & 0b111); // Divide by SEW.
+ vemul +=
+ static_cast<std::underlying_type_t<decltype(args.width)>>(args.width); // Multiply by EEW.
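+  // For example, assuming the usual encodings (vlmul = 0 for LMUL = 1, vsew = 2 for SEW = 32,
+  // width = 0 for 8-bit EEW): vemul = 0 - 2 + 0 = -2, i.e. EMUL = 1/4.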
+ if (vemul < -3 || vemul > 3) [[unlikely]] {
+ return Unimplemented();
+ }
+ // Note: whole register loads and stores treat args.nf differently, but they are processed
+ // separately above anyway, because they also ignore vtype and all the information in it!
+    // For other loads and stores the affected number of registers (EMUL * NF) must be 8 or less.
+ if ((vemul > 0) && ((args.nf + 1) * (1 << vemul) > 8)) {
+ return Unimplemented();
+ }
+ return OpVector<ElementType>(
+ args, static_cast<VectorRegisterGroupMultiplier>(vemul & 0b111), vtype, extra_args...);
+ }
+
+ template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
+ void OpVector(const VOpArgs& args,
+ VectorRegisterGroupMultiplier vlmul,
+ Register vtype,
+ ExtraArgs... extra_args) {
+ switch (vlmul) {
+ case VectorRegisterGroupMultiplier::k1register:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::k1register>(
+ args, vtype, extra_args...);
+ case VectorRegisterGroupMultiplier::k2registers:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::k2registers>(
+ args, vtype, extra_args...);
+ case VectorRegisterGroupMultiplier::k4registers:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::k4registers>(
+ args, vtype, extra_args...);
+ case VectorRegisterGroupMultiplier::k8registers:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::k8registers>(
+ args, vtype, extra_args...);
+ case VectorRegisterGroupMultiplier::kEigthOfRegister:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::kEigthOfRegister>(
+ args, vtype, extra_args...);
+ case VectorRegisterGroupMultiplier::kQuarterOfRegister:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::kQuarterOfRegister>(
+ args, vtype, extra_args...);
+ case VectorRegisterGroupMultiplier::kHalfOfRegister:
+ return OpVector<ElementType, VectorRegisterGroupMultiplier::kHalfOfRegister>(
+ args, vtype, extra_args...);
+ default:
+ return Unimplemented();
+ }
+ }
+
+ template <typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ typename VOpArgs,
+ typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
+ if (args.vm) {
+ return OpVector<ElementType, vlmul, intrinsics::NoInactiveProcessing{}>(
+ args, vtype, extra_args...);
+ }
+ if (vtype >> 7) {
+ return OpVector<ElementType, vlmul, InactiveProcessing::kAgnostic>(
+ args, vtype, extra_args...);
+ }
+ return OpVector<ElementType, vlmul, InactiveProcessing::kUndisturbed>(
+ args, vtype, extra_args...);
+ }
+
+ template <typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ auto vma,
+ typename VOpArgs,
+ typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
+ if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
+ constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
+ // Note: whole register loads and stores treat args.nf differently, but they are processed
+ // separately above anyway, because they also ignore vtype and all the information in it!
+ switch (args.nf) {
+ case 0:
+ return OpVector<ElementType, 1, vlmul, vma>(args, vtype, extra_args...);
+ case 1:
+ if constexpr (kRegistersInvolved > 4) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 2, vlmul, vma>(args, vtype, extra_args...);
+ }
+ case 2:
+ if constexpr (kRegistersInvolved > 2) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 3, vlmul, vma>(args, vtype, extra_args...);
+ }
+ case 3:
+ if constexpr (kRegistersInvolved > 2) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 4, vlmul, vma>(args, vtype, extra_args...);
+ }
+ case 4:
+ if constexpr (kRegistersInvolved > 1) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 5, vlmul, vma>(args, vtype, extra_args...);
+ }
+ case 5:
+ if constexpr (kRegistersInvolved > 1) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 6, vlmul, vma>(args, vtype, extra_args...);
+ }
+ case 6:
+ if constexpr (kRegistersInvolved > 1) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 7, vlmul, vma>(args, vtype, extra_args...);
+ }
+ case 7:
+ if constexpr (kRegistersInvolved > 1) {
+ return Unimplemented();
+ } else {
+ return OpVector<ElementType, 8, vlmul, vma>(args, vtype, extra_args...);
+ }
+ }
+ } else {
+ if ((vtype >> 6) & 1) {
+ return OpVector<ElementType, vlmul, TailProcessing::kAgnostic, vma>(args, extra_args...);
+ }
+ return OpVector<ElementType, vlmul, TailProcessing::kUndisturbed, vma>(args, extra_args...);
+ }
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ VectorRegisterGroupMultiplier vlmul,
+ auto vma,
+ typename VOpArgs,
+ typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
+    // Indexed loads and stores have two operands with different ElementTypes and lmul sizes;
+    // pass vtype down to do further selection.
+ if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
+ std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs>) {
+      // Because we know that we are dealing with indexed loads and stores and won't need to
+      // convert elmul to anything else, we can immediately turn it into kIndexRegistersInvolved
+      // here.
+ if ((vtype >> 6) & 1) {
+ return OpVector<kSegmentSize,
+ ElementType,
+ NumberOfRegistersInvolved(vlmul),
+ TailProcessing::kAgnostic,
+ vma>(args, vtype, extra_args...);
+ }
+ return OpVector<kSegmentSize,
+ ElementType,
+ NumberOfRegistersInvolved(vlmul),
+ TailProcessing::kUndisturbed,
+ vma>(args, vtype, extra_args...);
+ } else {
+      // For other instructions we have parsed all the information from vtype and only need to
+      // pass args and extra_args.
+ if ((vtype >> 6) & 1) {
+ return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kAgnostic, vma>(
+ args, extra_args...);
+ }
+ return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kUndisturbed, vma>(
+ args, extra_args...);
+ }
+ }
+
+ template <size_t kSegmentSize,
+ typename IndexElementType,
+ size_t kIndexRegistersInvolved,
+ TailProcessing vta,
+ auto vma,
+ typename VOpArgs,
+ typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
+ VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0b111);
+ switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
+ case VectorSelectElementWidth::k8bit:
+ return OpVector<UInt8, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
+ args, vlmul, extra_args...);
+ case VectorSelectElementWidth::k16bit:
+ return OpVector<UInt16, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
+ args, vlmul, extra_args...);
+ case VectorSelectElementWidth::k32bit:
+ return OpVector<UInt32, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
+ args, vlmul, extra_args...);
+ case VectorSelectElementWidth::k64bit:
+ return OpVector<UInt64, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
+ args, vlmul, extra_args...);
+ default:
+ return Unimplemented();
+ }
+ }
+
+ template <typename DataElementType,
+ size_t kSegmentSize,
+ typename IndexElementType,
+ size_t kIndexRegistersInvolved,
+ TailProcessing vta,
+ auto vma,
+ typename VOpArgs,
+ typename... ExtraArgs>
+ void OpVector(const VOpArgs& args, VectorRegisterGroupMultiplier vlmul, ExtraArgs... extra_args) {
+ switch (vlmul) {
+ case VectorRegisterGroupMultiplier::k1register:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::k1register,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ case VectorRegisterGroupMultiplier::k2registers:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::k2registers,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ case VectorRegisterGroupMultiplier::k4registers:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::k4registers,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ case VectorRegisterGroupMultiplier::k8registers:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::k8registers,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ case VectorRegisterGroupMultiplier::kEigthOfRegister:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::kEigthOfRegister,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ case VectorRegisterGroupMultiplier::kQuarterOfRegister:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::kQuarterOfRegister,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ case VectorRegisterGroupMultiplier::kHalfOfRegister:
+ return OpVector<DataElementType,
+ VectorRegisterGroupMultiplier::kHalfOfRegister,
+ IndexElementType,
+ kSegmentSize,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, extra_args...);
+ default:
+ return Unimplemented();
+ }
+ }
+
+ template <typename DataElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ typename IndexElementType,
+ size_t kSegmentSize,
+ size_t kIndexRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
+ return OpVector<DataElementType,
+ kSegmentSize,
+ NumberOfRegistersInvolved(vlmul),
+ IndexElementType,
+ kIndexRegistersInvolved,
+ vta,
+ vma>(args, src);
+ }
+
+ template <typename DataElementType,
+ size_t kSegmentSize,
+ size_t kNumRegistersInGroup,
+ typename IndexElementType,
+ size_t kIndexRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
+ if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
+ return Unimplemented();
+ }
+ constexpr size_t kElementsCount =
+ static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType));
+ alignas(alignof(SIMD128Register))
+ IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
+ memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
+ return OpVectorLoad<DataElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
+ args.dst, src, [&indexes](size_t index) { return indexes[index]; });
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
+ return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ args, src, stride);
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ size_t kNumRegistersInGroup,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
+ return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
+ args.dst, src, [stride](size_t index) { return stride * index; });
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
+ return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(args,
+ src);
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ size_t kNumRegistersInGroup,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
+ switch (args.opcode) {
+ case Decoder::VLUmOpOpcode::kVleXXff:
+ return OpVectorLoad<ElementType,
+ kSegmentSize,
+ kNumRegistersInGroup,
+ vta,
+ vma,
+ Decoder::VLUmOpOpcode::kVleXXff>(
+ args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
+ case Decoder::VLUmOpOpcode::kVleXX:
+ return OpVectorLoad<ElementType,
+ kSegmentSize,
+ kNumRegistersInGroup,
+ vta,
+ vma,
+ Decoder::VLUmOpOpcode::kVleXX>(
+ args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
+ case Decoder::VLUmOpOpcode::kVlm:
+ if constexpr (kSegmentSize == 1 &&
+ std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return OpVectorLoad<UInt8,
+ 1,
+ 1,
+ TailProcessing::kAgnostic,
+ vma,
+ Decoder::VLUmOpOpcode::kVlm>(
+ args.dst, src, [](size_t index) { return index; });
+ }
+ return Unimplemented();
+ default:
+ return Unimplemented();
+ }
+ }
+
+  // The strided version of a segmented load may sound like something convoluted and complicated
+  // that no one would ever want to use, but it's not rare and can be illustrated with a simple
+  // RGB bitmap.
+  //
+  // Suppose it's laid out in memory like this (doubles are 8 bytes in size as per IEEE 754):
+  // {R: 0.01}{G: 0.11}{B: 0.21} {R: 1.01}{G: 1.11}{B: 1.21} {R: 2.01}{G: 2.11}{B: 2.21}
+  // {R:10.01}{G:10.11}{B:10.21} {R:11.01}{G:11.11}{B:11.21} {R:12.01}{G:12.11}{B:12.21}
+  // {R:20.01}{G:20.11}{B:20.21} {R:21.01}{G:21.11}{B:21.21} {R:22.01}{G:22.11}{B:22.21}
+  // {R:30.01}{G:30.11}{B:30.21} {R:31.01}{G:31.11}{B:31.21} {R:32.01}{G:32.11}{B:32.21}
+  // This is a tiny 3x4 image with 3 components: red, green, blue.
+  //
+  // Let's assume that x1 is loaded with the address of the first element and x2 with 72 (that's
+  // how much one row of this image takes).
+  //
+  // Then we may use the following instruction to load one column from memory (with LMUL = 2,
+  // ELEN = 4):
+  //   vlsseg3e64.v v0, (x1), x2
+  //
+  // The values would be loaded like this:
+  // v0: {R: 0.01}{R:10.01} (first group of 2 registers)
+  // v1: {R:20.01}{R:30.01}
+  // v2: {G: 0.11}{G:10.11} (second group of 2 registers)
+  // v3: {G:20.11}{G:30.11}
+  // v4: {B: 0.21}{B:10.21} (third group of 2 registers)
+  // v5: {B:20.21}{B:30.21}
+  // Now we have loaded a column from memory, and each color component sits in a different
+  // register group for further processing.
+ template <typename ElementType,
+ size_t kSegmentSize,
+ size_t kNumRegistersInGroup,
+ TailProcessing vta,
+ auto vma,
+ typename Decoder::VLUmOpOpcode opcode = typename Decoder::VLUmOpOpcode{},
+ typename GetElementOffsetLambdaType>
+ void OpVectorLoad(uint8_t dst, Register src, GetElementOffsetLambdaType GetElementOffset) {
+ using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
+ if (!IsAligned<kNumRegistersInGroup>(dst)) {
+ return Unimplemented();
+ }
+ if (dst + kNumRegistersInGroup * kSegmentSize >= 32) {
+ return Unimplemented();
+ }
+ constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType));
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) {
+ vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
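+      // E.g. vl = 17 mask bits occupy AlignUp(17, 8) / 8 = 3 bytes of memory.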
+ }
+    // In case of a memory access fault we may set vstart to a non-zero value, so set it to zero
+    // here to simplify the logic below.
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ if constexpr (vta == TailProcessing::kAgnostic) {
+ vstart = std::min(vstart, vl);
+ }
+ // Note: within_group_id is the current register id within a register group. During one
+ // iteration of this loop we compute results for all registers with the current id in all
+ // groups. E.g. for the example above we'd compute v0, v2, v4 during the first iteration (id
+ // within group = 0), and v1, v3, v5 during the second iteration (id within group = 1). This
+    // ensures that memory is always accessed in an ordered fashion.
+ std::array<SIMD128Register, kSegmentSize> result;
+ char* ptr = ToHostAddr<char>(src);
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
+ ++within_group_id) {
+      // No need to continue if we use the kUndisturbed vta strategy.
+ if constexpr (vta == TailProcessing::kUndisturbed) {
+ if (within_group_id * kElementsCount >= vl) {
+ break;
+ }
+ }
+      // If we have elements that won't be overwritten then load those from the registers.
+      // For the interpreter we could have filled all the registers unconditionally, but we'll
+      // want to reuse this code for JITs later.
+ auto register_mask =
+ std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
+ auto full_mask = std::get<0>(intrinsics::FullMaskForRegister<ElementType>(mask));
+ if (vstart ||
+ (vl < (within_group_id + 1) * kElementsCount && vta == TailProcessing::kUndisturbed) ||
+ !(std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing> ||
+ static_cast<InactiveProcessing>(vma) != InactiveProcessing::kUndisturbed ||
+ register_mask == full_mask)) {
+ for (size_t field = 0; field < kSegmentSize; ++field) {
+ result[field].Set(state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup]);
+ }
+ }
+ // Read elements from memory, but only if there are any active ones.
+ for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
+ ++within_register_id) {
+ size_t element_index = kElementsCount * within_group_id + within_register_id;
+ // Stop if we reached the vl limit.
+ if (vl <= element_index) {
+ break;
+ }
+ // Don't touch masked-out elements.
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
+ 1 << within_register_id)}) == MaskType{0}) {
+ continue;
+ }
+ }
+ // Load segment from memory.
+ for (size_t field = 0; field < kSegmentSize; ++field) {
+ FaultyLoadResult mem_access_result =
+ FaultyLoad(ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
+ sizeof(ElementType));
+ if (mem_access_result.is_fault) {
+            // The documentation doesn't tell us what we are supposed to do with the remaining
+            // elements when an access fault happens, but let's trigger an exception and treat
+            // the remaining elements using the vta-specified strategy by simply adjusting vl.
+ vl = element_index;
+ if constexpr (opcode == Decoder::VLUmOpOpcode::kVleXXff) {
+              // A fail-only-first load only triggers an exception for the first element;
+              // otherwise it changes vl to ensure that subsequent operations only process the
+              // successfully loaded elements.
+ if (element_index == 0) [[unlikely]] {
+ exception_raised_ = true;
+ } else {
+ // TODO(b/323994286): Write a test case to verify vl changes correctly.
+ SetCsr<CsrName::kVl>(element_index);
+ }
+ } else {
+              // Most load instructions set vstart to the failing element, which may then be
+              // processed by the exception handler.
+ exception_raised_ = true;
+ SetCsr<CsrName::kVstart>(element_index);
+ }
+ break;
+ }
+ result[field].template Set<ElementType>(static_cast<ElementType>(mem_access_result.value),
+ within_register_id);
+ }
+ }
+      // Lambda to generate the tail mask. We don't want to call MakeBitmaskFromVl eagerly
+      // because it's not needed most of the time, and the compiler couldn't eliminate the
+      // access to mmap-backed memory.
+ auto GetTailMask = [vl, within_group_id] {
+ return std::get<0>(intrinsics::MakeBitmaskFromVl<ElementType>(
+ (vl <= within_group_id * kElementsCount) ? 0 : vl - within_group_id * kElementsCount));
+ };
+      // If the mask has inactive elements and InactiveProcessing::kAgnostic mode is used then
+      // set them to ~0.
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if (register_mask != full_mask) {
+ auto [simd_mask] =
+ intrinsics::BitMaskToSimdMaskForTests<ElementType>(Int64{MaskType{register_mask}});
+ for (size_t field = 0; field < kSegmentSize; ++field) {
+ if constexpr (vma == InactiveProcessing::kAgnostic) {
+            // A non-zero vstart is supposed to be exceptional. From the RISC-V V manual
+            // (page 14):
+            //   The vstart CSR is writable by unprivileged code, but non-zero vstart values
+            //   may cause vector instructions to run substantially slower on some
+            //   implementations, so vstart should not be used by application programmers. A
+            //   few vector instructions cannot be executed with a non-zero vstart value and
+            //   will raise an illegal instruction exception as defined below.
+            // TODO(b/300690740): decide whether to merge the two cases once support for
+            // vectors in the heavy optimizer is implemented.
+ if (vstart) [[unlikely]] {
+ SIMD128Register vstart_mask = std::get<0>(
+ intrinsics::MakeBitmaskFromVl<ElementType>(vstart % kElementsCount));
+ if constexpr (vta == TailProcessing::kAgnostic) {
+ result[field] |= vstart_mask & ~simd_mask;
+ } else if (vl < (within_group_id + 1) * kElementsCount) {
+ result[field] |= vstart_mask & ~simd_mask & ~GetTailMask();
+ } else {
+ result[field] |= vstart_mask & ~simd_mask;
+ }
+ } else if constexpr (vta == TailProcessing::kAgnostic) {
+ result[field] |= ~simd_mask;
+ } else {
+ if (vl < (within_group_id + 1) * kElementsCount) {
+ result[field] |= ~simd_mask & ~GetTailMask();
+ } else {
+ result[field] |= ~simd_mask;
+ }
+ }
+ }
+ }
+ }
+ }
+ // If we have tail elements and TailProcessing::kAgnostic mode then set them to ~0.
+ if constexpr (vta == TailProcessing::kAgnostic) {
+ for (size_t field = 0; field < kSegmentSize; ++field) {
+ if (vl < (within_group_id + 1) * kElementsCount) {
+ result[field] |= GetTailMask();
+ }
+ }
+ }
+ // Put values back into register file.
+ for (size_t field = 0; field < kSegmentSize; ++field) {
+ state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup] =
+ result[field].template Get<__uint128_t>();
+ }
+ // Next group should be fully processed.
+ vstart = 0;
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
+ switch (args.opcode) {
+ case Decoder::VOpFVfOpcode::kVfmvsf:
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return Unimplemented();
+ }
+ if (args.src1 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorVmvsx<ElementType, vta>(args.dst, arg2);
+ case Decoder::VOpFVfOpcode::kVfmergevf:
+ if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if (args.src1 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(args.dst,
+ arg2);
+ } else {
+ return OpVectorx<intrinsics::Vcopyx<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ // Always use "undisturbed" value from source register.
+ InactiveProcessing::kUndisturbed>(
+ args.dst, arg2, /*dst_mask=*/args.src1);
+ }
+ case Decoder::VOpFVfOpcode::kVfmaxvf:
+ return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfminvf:
+ return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
+ default:
+ return Unimplemented();
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpFVvArgs& args) {
+ using SignedType = std::make_signed_t<typename TypeTraits<ElementType>::Int>;
+ using UnsignedType = std::make_unsigned_t<typename TypeTraits<ElementType>::Int>;
+ // We currently don't support Float16 operations, but conversion routines that deal with
+ // double-width floats use these encodings to produce regular Float32 types.
+ if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
+ using WideElementType = typename TypeTraits<ElementType>::Wide;
+ switch (args.opcode) {
+ case Decoder::VOpFVvOpcode::kVFUnary0:
+ switch (args.vfunary0_opcode) {
+ case Decoder::VFUnary0Opcode::kVfwcvtfxuv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideElementType, UnsignedType>(FPFlags::DYN, frm, src);
+ },
+ WideElementType,
+ UnsignedType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfwcvtfxv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideElementType, SignedType>(FPFlags::DYN, frm, src);
+ },
+ WideElementType,
+ SignedType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtxufw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src);
+ },
+ UnsignedType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtxfw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src);
+ },
+ SignedType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtrtzxufw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src);
+ },
+ UnsignedType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtrtzxfw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src);
+ },
+ SignedType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ default:
+ break; // Make compiler happy.
+ }
+ break;
+ default:
+ break; // Make compiler happy.
+ }
+ }
+  // Widening and narrowing operations which take a floating-point “narrow” operand may only
+  // work correctly with Float32 input: Float16 is not supported yet, while Float64 input
+  // would produce a 128bit output which is currently reserved in RISC-V V.
+ if constexpr (sizeof(ElementType) == sizeof(Float32)) {
+ using WideElementType = typename TypeTraits<ElementType>::Wide;
+ using WideSignedType = typename TypeTraits<SignedType>::Wide;
+ using WideUnsignedType = typename TypeTraits<UnsignedType>::Wide;
+ switch (args.opcode) {
+ case Decoder::VOpFVvOpcode::kVFUnary0:
+ switch (args.vfunary0_opcode) {
+ case Decoder::VFUnary0Opcode::kVfwcvtxufv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::DYN, frm, src);
+ },
+ WideUnsignedType,
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfwcvtxfv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::DYN, frm, src);
+ },
+ WideSignedType,
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfwcvtffv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideElementType, ElementType>(FPFlags::DYN, frm, src);
+ },
+ WideElementType,
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfwcvtrtzxufv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::RTZ, frm, src);
+ },
+ WideUnsignedType,
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfwcvtrtzxfv:
+ return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::RTZ, frm, src);
+ },
+ WideSignedType,
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtfxuw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtffw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfncvtfxw:
+ return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ default:
+ break; // Make compiler happy.
+ }
+ break;
+ default:
+ break; // Make compiler happy.
+ }
+ }
+  // If our ElementType is Float16 then “straight” operations are unsupported and we shouldn't
+  // try to instantiate any functions, since this would lead to a compile-time error.
+ if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
+ switch (args.opcode) {
+ case Decoder::VOpFVvOpcode::kVfmvfs:
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return Unimplemented();
+ }
+ if (args.src2 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorVmvfs<ElementType>(args.dst, args.src1);
+ case Decoder::VOpFVvOpcode::kVFUnary0:
+ switch (args.vfunary0_opcode) {
+ case Decoder::VFUnary0Opcode::kVfcvtxufv:
+ return OpVectorv<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::DYN, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ CsrName::kFrm>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfcvtxfv:
+ return OpVectorv<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::DYN, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ CsrName::kFrm>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfcvtfxuv:
+ return OpVectorv<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<ElementType, UnsignedType>(FPFlags::DYN, frm, src);
+ },
+ UnsignedType,
+ vlmul,
+ vta,
+ vma,
+ CsrName::kFrm>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfcvtfxv:
+ return OpVectorv<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<ElementType, SignedType>(FPFlags::DYN, frm, src);
+ },
+ SignedType,
+ vlmul,
+ vta,
+ vma,
+ CsrName::kFrm>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfcvtrtzxufv:
+ return OpVectorv<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::RTZ, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ CsrName::kFrm>(args.dst, args.src1);
+ case Decoder::VFUnary0Opcode::kVfcvtrtzxfv:
+ return OpVectorv<[](int8_t frm, SIMD128Register src) {
+ return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::RTZ, frm, src);
+ },
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ CsrName::kFrm>(args.dst, args.src1);
+ default:
+ break; // Make compiler happy.
+ }
+ break;
+ default:
+ break; // Make compiler happy.
+ }
+ }
+ return Unimplemented();
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpIViArgs& args) {
+ using SignedType = berberis::SignedType<ElementType>;
+ using UnsignedType = berberis::UnsignedType<ElementType>;
+ switch (args.opcode) {
+ case Decoder::VOpIViOpcode::kVaddvi:
+ return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVrsubvi:
+ return OpVectorvx<intrinsics::Vrsubvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVandvi:
+ return OpVectorvx<intrinsics::Vandvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVorvi:
+ return OpVectorvx<intrinsics::Vorvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVxorvi:
+ return OpVectorvx<intrinsics::Vxorvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVmseqvi:
+ return OpVectormvx<intrinsics::Vseqvx<ElementType>, ElementType, vlmul, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVmsnevi:
+ return OpVectormvx<intrinsics::Vsnevx<ElementType>, ElementType, vlmul, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVmsleuvi:
+ return OpVectormvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVmslevi:
+ return OpVectormvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
+ args.dst, args.src, SignedType{args.imm});
+ case Decoder::VOpIViOpcode::kVmsgtuvi:
+ return OpVectormvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVmsgtvi:
+ return OpVectormvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
+ args.dst, args.src, SignedType{args.imm});
+ case Decoder::VOpIViOpcode::kVsllvi:
+ return OpVectorvx<intrinsics::Vslvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVsrlvi:
+ return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVsravi:
+ return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src, SignedType{args.imm});
+ case Decoder::VOpIViOpcode::kVmergevi:
+ if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if (args.src != 0) {
+ return Unimplemented();
+ }
+ return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, BitCastToUnsigned(SignedType{args.imm}));
+ } else {
+ return OpVectorx<intrinsics::Vcopyx<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ // Always use "undisturbed" value from source register.
+ InactiveProcessing::kUndisturbed>(
+ args.dst, BitCastToUnsigned(SignedType{args.imm}), /*dst_mask=*/args.src);
+ }
+ case Decoder::VOpIViOpcode::kVmvXrv:
+ // kVmv<nr>rv instruction
+ if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ switch (args.imm) {
+ case 0:
+ return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src);
+ case 1:
+ return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src);
+ case 3:
+ return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src);
+ case 7:
+ return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src);
+ default:
+ return Unimplemented();
+ }
+ } else {
+ return Unimplemented();
+ }
+ case Decoder::VOpIViOpcode::kVnsrawi:
+ return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src, SignedType{args.imm});
+ case Decoder::VOpIViOpcode::kVnsrlwi:
+ return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVslideupvi:
+ return OpVectorslideup<ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ case Decoder::VOpIViOpcode::kVslidedownvi:
+ return OpVectorslidedown<ElementType, vlmul, vta, vma>(
+ args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
+ default:
+ Unimplemented();
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpIVvArgs& args) {
+ using SignedType = berberis::SignedType<ElementType>;
+ using UnsignedType = berberis::UnsignedType<ElementType>;
+ switch (args.opcode) {
+ case Decoder::VOpIVvOpcode::kVaddvv:
+ return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVsubvv:
+ return OpVectorvv<intrinsics::Vsubvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVandvv:
+ return OpVectorvv<intrinsics::Vandvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVorvv:
+ return OpVectorvv<intrinsics::Vorvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVxorvv:
+ return OpVectorvv<intrinsics::Vxorvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmseqvv:
+ return OpVectormvv<intrinsics::Vseqvv<ElementType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmsnevv:
+ return OpVectormvv<intrinsics::Vsnevv<ElementType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmsltuvv:
+ return OpVectormvv<intrinsics::Vsltvv<UnsignedType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmsltvv:
+ return OpVectormvv<intrinsics::Vsltvv<SignedType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmsleuvv:
+ return OpVectormvv<intrinsics::Vslevv<UnsignedType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmslevv:
+ return OpVectormvv<intrinsics::Vslevv<SignedType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVsllvv:
+ return OpVectorvv<intrinsics::Vslvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVsrlvv:
+ return OpVectorvv<intrinsics::Vsrvv<UnsignedType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVsravv:
+ return OpVectorvv<intrinsics::Vsrvv<SignedType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVminuvv:
+ return OpVectorvv<intrinsics::Vminvv<UnsignedType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVminvv:
+ return OpVectorvv<intrinsics::Vminvv<SignedType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmaxuvv:
+ return OpVectorvv<intrinsics::Vmaxvv<UnsignedType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmaxvv:
+ return OpVectorvv<intrinsics::Vmaxvv<SignedType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVmergevv:
+ if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if (args.src1 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorv<intrinsics::Vcopyv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src2);
+ } else {
+ return OpVectorv<intrinsics::Vcopyv<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ // Always use "undisturbed" value from source register.
+ InactiveProcessing::kUndisturbed>(
+ args.dst, args.src2, /*dst_mask=*/args.src1);
+ }
+ case Decoder::VOpIVvOpcode::kVnsrawv:
+ return OpVectorNarrowwv<intrinsics::Vnsrwv<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpIVvOpcode::kVnsrlwv:
+ return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ default:
+ Unimplemented();
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpMVvArgs& args) {
+ using SignedType = berberis::SignedType<ElementType>;
+ using UnsignedType = berberis::UnsignedType<ElementType>;
+ if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ switch (args.opcode) {
+ case Decoder::VOpMVvOpcode::kVmandnmm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmandmm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & rhs; }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmormm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | rhs; }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmxormm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs ^ rhs; }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmornmm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | ~rhs; }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmnandmm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs & rhs); }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmnormm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs | rhs); }>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmxnormm:
+ return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs ^ rhs); }>(
+ args.dst, args.src1, args.src2);
+ default:; // Do nothing: handled in next switch.
+ }
+ }
+ switch (args.opcode) {
+ case Decoder::VOpMVvOpcode::kVredsumvs:
+ return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredandvs:
+ return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredorvs:
+ return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredxorvs:
+ return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredminuvs:
+ return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredminvs:
+ return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredmaxuvs:
+ return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVredmaxvs:
+ return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVWXUnary0:
+ switch (args.vwxunary0_opcode) {
+ case Decoder::VWXUnary0Opcode::kVmvxs:
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return Unimplemented();
+ }
+ return OpVectorVmvxs<SignedType>(args.dst, args.src1);
+ case Decoder::VWXUnary0Opcode::kVcpopm:
+ return OpVectorVWXUnary0<intrinsics::Vcpopm<Int128>, vma>(args.dst, args.src1);
+ case Decoder::VWXUnary0Opcode::kVfirstm:
+ return OpVectorVWXUnary0<intrinsics::Vfirstm<Int128>, vma>(args.dst, args.src1);
+ default:
+ return Unimplemented();
+ }
+      case Decoder::VOpMVvOpcode::kVXUnary0:
+ switch (args.vxunary0_opcode) {
+ case Decoder::VXUnary0Opcode::kVzextvf2m:
+ if constexpr (sizeof(UnsignedType) >= 2) {
+ return OpVectorVXUnary0<intrinsics::Vextf2<UnsignedType>,
+ UnsignedType,
+ 2,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ }
+ break;
+ case Decoder::VXUnary0Opcode::kVsextvf2m:
+ if constexpr (sizeof(SignedType) >= 2) {
+ return OpVectorVXUnary0<intrinsics::Vextf2<SignedType>,
+ SignedType,
+ 2,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ }
+ break;
+ case Decoder::VXUnary0Opcode::kVzextvf4m:
+ if constexpr (sizeof(UnsignedType) >= 4) {
+ return OpVectorVXUnary0<intrinsics::Vextf4<UnsignedType>,
+ UnsignedType,
+ 4,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ }
+ break;
+ case Decoder::VXUnary0Opcode::kVsextvf4m:
+ if constexpr (sizeof(SignedType) >= 4) {
+ return OpVectorVXUnary0<intrinsics::Vextf4<SignedType>,
+ SignedType,
+ 4,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ }
+ break;
+ case Decoder::VXUnary0Opcode::kVzextvf8m:
+ if constexpr (sizeof(UnsignedType) >= 8) {
+ return OpVectorVXUnary0<intrinsics::Vextf8<UnsignedType>,
+ UnsignedType,
+ 8,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ }
+ break;
+ case Decoder::VXUnary0Opcode::kVsextvf8m:
+ if constexpr (sizeof(SignedType) >= 8) {
+ return OpVectorVXUnary0<intrinsics::Vextf8<SignedType>,
+ SignedType,
+ 8,
+ vlmul,
+ vta,
+ vma>(args.dst, args.src1);
+ }
+ break;
+ default:
+ return Unimplemented();
+ }
+ return Unimplemented();
+ case Decoder::VOpMVvOpcode::kVMUnary0:
+ switch (args.vmunary0_opcode) {
+ case Decoder::VMUnary0Opcode::kVmsbfm:
+ return OpVectorVMUnary0<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1);
+ case Decoder::VMUnary0Opcode::kVmsofm:
+ return OpVectorVMUnary0<intrinsics::Vmsofm<>, vma>(args.dst, args.src1);
+ case Decoder::VMUnary0Opcode::kVmsifm:
+ return OpVectorVMUnary0<intrinsics::Vmsifm<>, vma>(args.dst, args.src1);
+ case Decoder::VMUnary0Opcode::kVidv:
+          if (args.src1 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst);
+ default:
+ return Unimplemented();
+ }
+ case Decoder::VOpMVvOpcode::kVmaddvv:
+ return OpVectorvvv<intrinsics::Vmaddvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVnmsubvv:
+ return OpVectorvvv<intrinsics::Vnmsubvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmaccvv:
+ return OpVectorvvv<intrinsics::Vmaccvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVnmsacvv:
+ return OpVectorvvv<intrinsics::Vnmsacvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmulhuvv:
+ return OpVectorvv<intrinsics::Vmulhvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmulvv:
+ return OpVectorvv<intrinsics::Vmulvv<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmulhsuvv:
+ return OpVectorvv<intrinsics::Vmulhsuvv<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVmulhvv:
+ return OpVectorvv<intrinsics::Vmulhvv<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpMVvOpcode::kVwaddvv:
+ if constexpr (sizeof(ElementType) == sizeof(Int64) ||
+ vlmul == VectorRegisterGroupMultiplier::k8registers) {
+ return Unimplemented();
+ } else {
+ return OpVectorWidenvv<intrinsics::Vwaddvv<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ }
+ case Decoder::VOpMVvOpcode::kVwadduvv:
+ if constexpr (sizeof(ElementType) == sizeof(Int64) ||
+ vlmul == VectorRegisterGroupMultiplier::k8registers) {
+ return Unimplemented();
+ } else {
+ return OpVectorWidenvv<intrinsics::Vwaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ }
+ case Decoder::VOpMVvOpcode::kVwsubuvv:
+ if constexpr (sizeof(ElementType) == sizeof(Int64) ||
+ vlmul == VectorRegisterGroupMultiplier::k8registers) {
+ return Unimplemented();
+ } else {
+ return OpVectorWidenvv<intrinsics::Vwsubvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ }
+ default:
+ Unimplemented();
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpIVxArgs& args, Register arg2) {
+ using SignedType = berberis::SignedType<ElementType>;
+ using UnsignedType = berberis::UnsignedType<ElementType>;
+ switch (args.opcode) {
+ case Decoder::VOpIVxOpcode::kVaddvx:
+ return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVsubvx:
+ return OpVectorvx<intrinsics::Vsubvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVrsubvx:
+ return OpVectorvx<intrinsics::Vrsubvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVandvx:
+ return OpVectorvx<intrinsics::Vandvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVorvx:
+ return OpVectorvx<intrinsics::Vorvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVxorvx:
+ return OpVectorvx<intrinsics::Vxorvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmseqvx:
+ return OpVectormvx<intrinsics::Vseqvx<ElementType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmsnevx:
+ return OpVectormvx<intrinsics::Vsnevx<ElementType>, ElementType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmsltuvx:
+ return OpVectormvx<intrinsics::Vsltvx<UnsignedType>, UnsignedType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmsltvx:
+ return OpVectormvx<intrinsics::Vsltvx<SignedType>, SignedType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmsleuvx:
+ return OpVectormvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmslevx:
+ return OpVectormvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmsgtuvx:
+ return OpVectormvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmsgtvx:
+ return OpVectormvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVsllvx:
+ return OpVectorvx<intrinsics::Vslvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpIVxOpcode::kVsrlvx:
+ return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVsravx:
+ return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVminuvx:
+ return OpVectorvx<intrinsics::Vminvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVminvx:
+ return OpVectorvx<intrinsics::Vminvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmaxuvx:
+ return OpVectorvx<intrinsics::Vmaxvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmaxvx:
+ return OpVectorvx<intrinsics::Vmaxvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVmergevx:
+ if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if (args.src1 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, MaybeTruncateTo<ElementType>(arg2));
+ } else {
+ return OpVectorx<intrinsics::Vcopyx<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ // Always use "undisturbed" value from source register.
+ InactiveProcessing::kUndisturbed>(
+ args.dst, MaybeTruncateTo<ElementType>(arg2), /*dst_mask=*/args.src1);
+ }
+ case Decoder::VOpIVxOpcode::kVnsrawx:
+ return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVnsrlwx:
+ return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVslideupvx:
+ return OpVectorslideup<ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpIVxOpcode::kVslidedownvx:
+ return OpVectorslidedown<ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ default:
+ Unimplemented();
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
+ using SignedType = berberis::SignedType<ElementType>;
+ using UnsignedType = berberis::UnsignedType<ElementType>;
+ switch (args.opcode) {
+ case Decoder::VOpMVxOpcode::kVRXUnary0:
+ switch (args.vrxunary0_opcode) {
+ case Decoder::VRXUnary0Opcode::kVmvsx:
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return Unimplemented();
+ }
+ return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2));
+ default:
+ return Unimplemented();
+ }
+ case Decoder::VOpMVxOpcode::kVmaddvx:
+ return OpVectorvxv<intrinsics::Vmaddvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpMVxOpcode::kVnmsubvx:
+ return OpVectorvxv<intrinsics::Vnmsubvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpMVxOpcode::kVmaccvx:
+ return OpVectorvxv<intrinsics::Vmaccvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpMVxOpcode::kVnmsacvx:
+ return OpVectorvxv<intrinsics::Vnmsacvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
+ case Decoder::VOpMVxOpcode::kVmulhuvx:
+ return OpVectorvx<intrinsics::Vmulhvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+ case Decoder::VOpMVxOpcode::kVmulvx:
+ return OpVectorvx<intrinsics::Vmulvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpMVxOpcode::kVmulhsuvx:
+ return OpVectorvx<intrinsics::Vmulhsuvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ case Decoder::VOpMVxOpcode::kVmulhvx:
+ return OpVectorvx<intrinsics::Vmulhvx<SignedType>, SignedType, vlmul, vta, vma>(
+ args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
+ default:
+ Unimplemented();
+ }
+ }
+
+ template <typename DataElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ typename IndexElementType,
+ size_t kSegmentSize,
+ size_t kIndexRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
+ return OpVector<DataElementType,
+ kSegmentSize,
+ NumberOfRegistersInvolved(vlmul),
+ IndexElementType,
+ kIndexRegistersInvolved,
+ !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(args, src);
+ }
+
+ template <typename DataElementType,
+ size_t kSegmentSize,
+ size_t kNumRegistersInGroup,
+ typename IndexElementType,
+ size_t kIndexRegistersInvolved,
+ bool kUseMasking>
+ void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
+ if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
+ return Unimplemented();
+ }
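+    // Copy all index registers into a flat array up front so that the store loop below can
+    // simply look up the byte offset for each element.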
+    constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
+ alignas(alignof(SIMD128Register))
+ IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
+ memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
+ return OpVectorStore<DataElementType, kSegmentSize, kNumRegistersInGroup, kUseMasking>(
+ args.data, src, [&indexes](size_t index) { return indexes[index]; });
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VStoreStrideArgs& args, Register src, Register stride) {
+ return OpVectorStore<ElementType,
+ kSegmentSize,
+ NumberOfRegistersInvolved(vlmul),
+ !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(
+ args.data, src, [stride](size_t index) { return stride * index; });
+ }
+
+ template <typename ElementType,
+ size_t kSegmentSize,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVector(const Decoder::VStoreUnitStrideArgs& args, Register src) {
+ switch (args.opcode) {
+ case Decoder::VSUmOpOpcode::kVseXX:
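+        // For unit-stride segment stores consecutive segments are contiguous in memory: e.g.
+        // with kSegmentSize == 2 and 4-byte elements, element index lands at byte offset
+        // 8 * index, and OpVectorStore adds field * sizeof(ElementType) for the second field.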
+ return OpVectorStore<ElementType,
+ kSegmentSize,
+ NumberOfRegistersInvolved(vlmul),
+ !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
+ Decoder::VSUmOpOpcode::kVseXX>(args.data, src, [](size_t index) {
+ return kSegmentSize * sizeof(ElementType) * index;
+ });
+ case Decoder::VSUmOpOpcode::kVsm:
+ if constexpr (kSegmentSize == 1 &&
+ std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return OpVectorStore<UInt8,
+ 1,
+ 1,
+ /*kUseMasking=*/false,
+ Decoder::VSUmOpOpcode::kVsm>(
+ args.data, src, [](size_t index) { return index; });
+ }
+ return Unimplemented();
+ default:
+ return Unimplemented();
+ }
+ }
+
+  // See the VLoadStrideArgs overload for an explanation of the semantics: VStoreStrideArgs is
+  // almost symmetric, except that it ignores the vta and vma modes and never alters inactive
+  // elements in memory.
+ template <typename ElementType,
+ size_t kSegmentSize,
+ size_t kNumRegistersInGroup,
+ bool kUseMasking,
+ typename Decoder::VSUmOpOpcode opcode = typename Decoder::VSUmOpOpcode{},
+ typename GetElementOffsetLambdaType>
+ void OpVectorStore(uint8_t data, Register src, GetElementOffsetLambdaType GetElementOffset) {
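+    // A single 128-bit register holds 16 / sizeof(ElementType) elements, so 8-bit elements need
+    // 16 mask bits per register (hence UInt16); every wider element type fits in UInt8.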
+ using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
+ if (!IsAligned<kNumRegistersInGroup>(data)) {
+ return Unimplemented();
+ }
+ if (data + kNumRegistersInGroup * kSegmentSize > 32) {
+ return Unimplemented();
+ }
+    constexpr size_t kElementsCount = 16 / sizeof(ElementType);
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) {
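+      // vsm.v stores ceil(vl / 8) bytes of the mask register, i.e. it operates with an
+      // effective vl of ceil(vl / 8) bytes.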
+ vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
+ }
+    // On a memory access fault we may set vstart to a non-zero value (see below), so set it to
+    // zero here to simplify the logic that follows.
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+      // Technically this is not needed since stores never touch tail elements, but it makes it
+      // easier to reason about the rest of the function.
+ return;
+ }
+ char* ptr = ToHostAddr<char>(src);
+    // Note: within_group_id is the current register id within a register group. During one
+    // iteration of this loop we store results for all registers with the current id in all
+    // groups. E.g. with three segment fields of two registers each (v0-v1, v2-v3, v4-v5) we'd
+    // store data from v0, v2, v4 during the first iteration (id within group = 0), and from
+    // v1, v3, v5 during the second iteration (id within group = 1). This ensures that memory is
+    // always accessed in order.
+ auto mask = GetMaskForVectorOperationsIfNeeded<kUseMasking>();
+ for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
+ ++within_group_id) {
+ // No need to continue if we no longer have elements to store.
+ if (within_group_id * kElementsCount >= vl) {
+ break;
+ }
+ auto register_mask =
+ std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
+ // Store elements to memory, but only if there are any active ones.
+ for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
+ ++within_register_id) {
+ size_t element_index = kElementsCount * within_group_id + within_register_id;
+ // Stop if we reached the vl limit.
+ if (vl <= element_index) {
+ break;
+ }
+ // Don't touch masked-out elements.
+ if constexpr (kUseMasking) {
+ if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
+ 1 << within_register_id)}) == MaskType{0}) {
+ continue;
+ }
+ }
+ // Store segment to memory.
+ for (size_t field = 0; field < kSegmentSize; ++field) {
+ bool exception_raised = FaultyStore(
+ ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
+ sizeof(ElementType),
+ SIMD128Register{state_->cpu.v[data + within_group_id + field * kNumRegistersInGroup]}
+ .Get<ElementType>(within_register_id));
+        // Stop processing if memory is inaccessible. It's also the only case where we have to
+        // set vstart to a non-zero value!
+ if (exception_raised) {
+ SetCsr<CsrName::kVstart>(element_index);
+ return;
+ }
+ }
+ }
+ // Next group should be fully processed.
+ vstart = 0;
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVectorVidv(uint8_t dst) {
+ return OpVectorVidv<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst);
+ }
+
+ template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
+ void OpVectorVidv(uint8_t dst) {
+ if (!IsAligned<kRegistersInvolved>(dst)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result{state_->cpu.v[dst + index]};
+ result = VectorMasking<ElementType, vta, vma>(
+ result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <typename ElementType>
+ void OpVectorVmvfs(uint8_t dst, uint8_t src) {
+    // Note: intrinsics::NanBox always receives a Float64 argument, even when it processes a
+    // Float32 value, to avoid recursion in intrinsics handling.
+    // NanBox in the interpreter takes an FpRegister and returns an FpRegister, which is probably
+    // the cleanest way of processing that data (at least on x86-64 this produces code that's
+    // close to optimal).
+ NanBoxAndSetFpReg<ElementType>(dst, SIMD128Register{state_->cpu.v[src]}.Get<FpRegister>(0));
+ SetCsr<CsrName::kVstart>(0);
+ }
+
+ template <typename ElementType, TailProcessing vta>
+ void OpVectorVmvsx(uint8_t dst, ElementType element) {
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+    // The documentation doesn't specify what happens when vstart is non-zero but less than vl,
+    // but at least one hardware implementation treats it as a no-op:
+    //   https://github.com/riscv/riscv-v-spec/issues/937
+    // We do the same here.
+ if (vstart == 0 && vl != 0) [[likely]] {
+ SIMD128Register result;
+ if constexpr (vta == intrinsics::TailProcessing::kAgnostic) {
+ result = ~SIMD128Register{};
+ } else {
+ result.Set(state_->cpu.v[dst]);
+ }
+ result.Set(element, 0);
+ state_->cpu.v[dst] = result.Get<Int128>();
+ }
+ SetCsr<CsrName::kVstart>(0);
+ }
+
+ template <typename ElementType>
+ void OpVectorVmvxs(uint8_t dst, uint8_t src1) {
+ static_assert(ElementType::kIsSigned);
+ // Conversion to Int64 would perform sign-extension if source element is signed.
+ Register element = Int64{SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0)};
+ SetRegOrIgnore(dst, element);
+ SetCsr<CsrName::kVstart>(0);
+ }
+
+ template <auto Intrinsic, auto vma>
+ void OpVectorVWXUnary0(uint8_t dst, uint8_t src1) {
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ if (vstart != 0) [[unlikely]] {
+ return Unimplemented();
+ }
+    // Note: vcpop.m and vfirst.m are an explicit exception to the rule that vstart >= vl
+    // performs no operation: they are defined to perform a write even when vl == 0.
+ SIMD128Register arg1(state_->cpu.v[src1]);
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ SIMD128Register mask(state_->cpu.v[0]);
+ arg1 &= mask;
+ }
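+    // MakeBitmaskFromVl is assumed to return a mask with bits [vl, 128) set, so clearing those
+    // bits leaves only the first vl mask bits for the intrinsic to examine.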
+ const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
+ arg1 &= ~tail_mask;
+ SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
+ SetRegOrIgnore(dst, TruncateTo<UInt64>(BitCastToUnsigned(result.Get<Int128>())));
+ }
+
+ template <auto Intrinsic>
+ void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) {
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ SIMD128Register arg1(state_->cpu.v[src1]);
+ SIMD128Register arg2(state_->cpu.v[src2]);
+ SIMD128Register result;
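+    // Bits below vstart are prestart bits and must keep their previous value; start_mask is
+    // assumed to have bits [vstart, 128) set, so it selects the freshly computed bits.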
+ if (vstart > 0) [[unlikely]] {
+ const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
+ result.Set(state_->cpu.v[dst]);
+ result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask);
+ } else {
+ result = Intrinsic(arg1, arg2);
+ }
+ const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
+ result = result | tail_mask;
+ state_->cpu.v[dst] = result.Get<__uint128_t>();
+ }
+
+ template <auto Intrinsic, auto vma>
+ void OpVectorVMUnary0(uint8_t dst, uint8_t src1) {
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ if (vstart != 0) {
+ return Unimplemented();
+ }
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vl == 0) [[unlikely]] {
+ return;
+ }
+ SIMD128Register arg1(state_->cpu.v[src1]);
+ SIMD128Register mask;
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ mask.Set<__uint128_t>(state_->cpu.v[0]);
+ arg1 &= mask;
+ }
+ const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
+ arg1 &= ~tail_mask;
+ SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if (vma == InactiveProcessing::kUndisturbed) {
+ result = (result & mask) | (SIMD128Register(state_->cpu.v[dst]) & ~mask);
+ } else {
+ result |= ~mask;
+ }
+ }
+ result |= tail_mask;
+ state_->cpu.v[dst] = result.Get<__uint128_t>();
+ }
+
+ template <typename ElementType, size_t kRegistersInvolved>
+ void OpVectorVmvXrv(uint8_t dst, uint8_t src) {
+ if (!IsAligned<kRegistersInvolved>(dst | src)) {
+ return Unimplemented();
+ }
+    constexpr size_t kElementsCount = 16 / sizeof(ElementType);
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ SetCsr<CsrName::kVstart>(0);
+ // The usual property that no elements are written if vstart >= vl does not apply to these
+ // instructions. Instead, no elements are written if vstart >= evl.
+ if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] {
+ return;
+ }
+ if (vstart == 0) [[likely]] {
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ state_->cpu.v[dst + index] = state_->cpu.v[src + index];
+ }
+ return;
+ }
+ size_t index = vstart / kElementsCount;
+ SIMD128Register destination{state_->cpu.v[dst + index]};
+ SIMD128Register source{state_->cpu.v[src + index]};
+ for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount;
+ ++element_index) {
+ destination.Set(source.Get<ElementType>(element_index), element_index);
+ }
+ state_->cpu.v[dst + index] = destination.Get<__uint128_t>();
+ for (index++; index < kRegistersInvolved; ++index) {
+ state_->cpu.v[dst + index] = state_->cpu.v[src + index];
+ }
+ }
+
+ template <auto Intrinsic, typename ElementType, VectorRegisterGroupMultiplier vlmul, auto vma>
+ void OpVectormvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ return OpVectormvv<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vma>(
+ dst, src1, src2);
+ }
+
+ template <auto Intrinsic, typename ElementType, size_t kRegistersInvolved, auto vma>
+ void OpVectormvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ if (!IsAligned<kRegistersInvolved>(src1 | src2)) {
+ return Unimplemented();
+ }
+ SIMD128Register original_result(state_->cpu.v[dst]);
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ SIMD128Register result_before_vl_masking;
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ result_before_vl_masking = original_result;
+ } else {
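+      // CollectBitmaskResult evaluates the intrinsic for each register in the group and is
+      // expected to pack the per-element comparison bits into a single mask register.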
+ result_before_vl_masking =
+ CollectBitmaskResult<ElementType, kRegistersInvolved>([this, src1, src2](auto index) {
+ SIMD128Register arg1(state_->cpu.v[src1 + static_cast<size_t>(index)]);
+ SIMD128Register arg2(state_->cpu.v[src2 + static_cast<size_t>(index)]);
+ return Intrinsic(arg1, arg2);
+ });
+      if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+        SIMD128Register mask(state_->cpu.v[0]);
+ if constexpr (vma == InactiveProcessing::kAgnostic) {
+ result_before_vl_masking |= ~mask;
+ } else {
+ result_before_vl_masking = (mask & result_before_vl_masking) | (original_result & ~mask);
+ }
+ }
+ if (vstart > 0) [[unlikely]] {
+ const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
+ result_before_vl_masking =
+ (original_result & ~start_mask) | (result_before_vl_masking & start_mask);
+ }
+ }
+ const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
+ state_->cpu.v[dst] = (result_before_vl_masking | tail_mask).Get<__uint128_t>();
+ }
+
+ template <auto Intrinsic, typename ElementType, VectorRegisterGroupMultiplier vlmul, auto vma>
+ void OpVectormvx(uint8_t dst, uint8_t src1, ElementType arg2) {
+ return OpVectormvx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vma>(
+ dst, src1, arg2);
+ }
+
+ template <auto Intrinsic, typename ElementType, size_t kRegistersInvolved, auto vma>
+ void OpVectormvx(uint8_t dst, uint8_t src1, ElementType arg2) {
+ if (!IsAligned<kRegistersInvolved>(src1)) {
+ return Unimplemented();
+ }
+ SIMD128Register original_result(state_->cpu.v[dst]);
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ SIMD128Register result_before_vl_masking;
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ result_before_vl_masking = original_result;
+ } else {
+ result_before_vl_masking =
+ CollectBitmaskResult<ElementType, kRegistersInvolved>([this, src1, arg2](auto index) {
+ SIMD128Register arg1(state_->cpu.v[src1 + static_cast<size_t>(index)]);
+ return Intrinsic(arg1, arg2);
+ });
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ SIMD128Register mask(state_->cpu.v[0]);
+ if constexpr (vma == InactiveProcessing::kAgnostic) {
+ result_before_vl_masking |= ~mask;
+ } else {
+ result_before_vl_masking = (mask & result_before_vl_masking) | (original_result & ~mask);
+ }
+ }
+ if (vstart > 0) [[unlikely]] {
+ const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
+ result_before_vl_masking =
+ (original_result & ~start_mask) | (result_before_vl_masking & start_mask);
+ }
+ }
+ const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
+ state_->cpu.v[dst] = (result_before_vl_masking | tail_mask).Get<__uint128_t>();
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma,
+ CsrName... kExtraCsrs,
+ typename... DstMaskType>
+ void OpVectorv(uint8_t dst, uint8_t src1, DstMaskType... dst_mask) {
+ return OpVectorv<Intrinsic,
+ ElementType,
+ NumberOfRegistersInvolved(vlmul),
+ vta,
+ vma,
+ kExtraCsrs...>(dst, src1, dst_mask...);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma,
+ CsrName... kExtraCsrs,
+ typename... DstMaskType>
+ void OpVectorv(uint8_t dst, uint8_t src, DstMaskType... dst_mask) {
+ static_assert(sizeof...(dst_mask) <= 1);
+ if (!IsAligned<kRegistersInvolved>(dst | src | (dst_mask | ... | 0))) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result{state_->cpu.v[dst + index]};
+ SIMD128Register result_mask;
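+      // When no explicit dst_mask is supplied the destination register itself provides the
+      // values used for undisturbed elements.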
+ if constexpr (sizeof...(DstMaskType) == 0) {
+ result_mask.Set(state_->cpu.v[dst + index]);
+ } else {
+ uint8_t dst_mask_unpacked[1] = {dst_mask...};
+ result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
+ }
+ SIMD128Register arg{state_->cpu.v[src + index]};
+ result =
+ VectorMasking<ElementType, vta, vma>(result,
+ std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg)),
+ result_mask,
+ vstart,
+ vl,
+ index,
+ mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvs(uint8_t dst, uint8_t src1, uint8_t src2) {
+ return OpVectorvs<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src1, src2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvs(uint8_t dst, uint8_t src1, uint8_t src2) {
+ if (!IsAligned<kRegistersInvolved>(dst | src1 | src2)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ if (vstart != 0) {
+ return Unimplemented();
+ }
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vl == 0) [[unlikely]] {
+ return;
+ }
+ SIMD128Register result;
+ auto mask = GetMaskForVectorOperations<vma>();
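+    // Reductions fold sequentially: arg1 starts as element 0 of vs1 (the scalar accumulator)
+    // and is combined with every active element of vs2 in element order.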
+ ElementType arg1 = SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0);
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
+      constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
+      // Clamp the per-register element count so that elements past vl never enter the reduction.
+      const MaskType element_count{static_cast<typename MaskType::BaseType>(
+          std::min(kElementsPerRegister, vl - std::min(vl, index * kElementsPerRegister)))};
+ auto mask_bits = std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index));
+ SIMD128Register arg2(state_->cpu.v[src2 + index]);
+ for (MaskType element_index = MaskType{0}; element_index < element_count;
+ element_index += MaskType{1}) {
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ if ((MaskType{mask_bits} & (MaskType{1} << element_index)) == MaskType{0}) {
+ continue;
+ }
+ }
+ result = std::get<0>(Intrinsic(arg1, arg2.Get<ElementType>(element_index)));
+ arg1 = result.Get<ElementType>(0);
+ }
+ }
+ result.Set(state_->cpu.v[dst]);
+ result.Set(arg1, 0);
+ result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1));
+ state_->cpu.v[dst] = result.Get<__uint128_t>();
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ return OpVectorvv<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src1, src2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ if (!IsAligned<kRegistersInvolved>(dst | src1 | src2)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result{state_->cpu.v[dst + index]};
+ SIMD128Register arg1{state_->cpu.v[src1 + index]};
+ SIMD128Register arg2{state_->cpu.v[src2 + index]};
+ result = VectorMasking<ElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ return OpVectorvvv<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src1, src2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ if (!IsAligned<kRegistersInvolved>(dst | src1 | src2)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + index]);
+ SIMD128Register arg1(state_->cpu.v[src1 + index]);
+ SIMD128Register arg2(state_->cpu.v[src2 + index]);
+ result = VectorMasking<ElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(arg1, arg2, result)), vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <auto Intrinsic,
+ typename TargetElementType,
+ typename SourceElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorWidenvr(uint8_t dst, uint8_t src) {
+ return OpVectorWidenvr<Intrinsic,
+ TargetElementType,
+ SourceElementType,
+ NumRegistersInvolvedForWideOperand(vlmul),
+ NumberOfRegistersInvolved(vlmul),
+ vta,
+ vma>(dst, src);
+ }
+
+ template <auto Intrinsic,
+ typename TargetElementType,
+ typename SourceElementType,
+ size_t kDestRegistersInvolved,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorWidenvr(uint8_t dst, uint8_t src) {
+ if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kRegistersInvolved>(src)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ int8_t frm = GetCsr<CsrName::kFrm>();
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + 2 * index]);
+ SIMD128Register arg(state_->cpu.v[src + index]);
+ result = VectorMasking<TargetElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(frm, arg)), vstart, vl, 2 * index, mask);
+ state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>();
+ if constexpr (kDestRegistersInvolved > 1) { // if lmul is one full register or more
+ result.Set(state_->cpu.v[dst + 2 * index + 1]);
+ std::tie(arg) = intrinsics::VMovTopHalfToBottom<SourceElementType>(arg);
+ result = VectorMasking<TargetElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(frm, arg)), vstart, vl, 2 * index + 1, mask);
+ state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>();
+ }
+ }
+ }
+
+ // 2*SEW = SEW op SEW
+  // Attention: do not confuse this with the yet-to-be-implemented OpVectorWidenwv, where
+  // 2*SEW = 2*SEW op SEW.
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorWidenvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ return OpVectorWidenvv<Intrinsic,
+ ElementType,
+ NumRegistersInvolvedForWideOperand(vlmul),
+ NumberOfRegistersInvolved(vlmul),
+ vta,
+ vma>(dst, src1, src2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kDestRegistersInvolved,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorWidenvv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kRegistersInvolved>(src1 | src2)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
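+    // Every iteration widens one source register into two destination registers: the low halves
+    // of the sources fill register dst + 2 * index, then VMovTopHalfToBottom moves the upper
+    // halves down so the same intrinsic can fill register dst + 2 * index + 1 (skipped for
+    // fractional LMUL, where a single destination register suffices).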
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + 2 * index]);
+ SIMD128Register arg1(state_->cpu.v[src1 + index]);
+ SIMD128Register arg2(state_->cpu.v[src2 + index]);
+ result = VectorMasking<decltype(Widen(ElementType{})), vta, vma>(
+ result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, 2 * index, mask);
+ state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>();
+ if constexpr (kDestRegistersInvolved > 1) { // if lmul is one full register or more
+ result.Set(state_->cpu.v[dst + 2 * index + 1]);
+ std::tie(arg1) = intrinsics::VMovTopHalfToBottom<ElementType>(arg1);
+ std::tie(arg2) = intrinsics::VMovTopHalfToBottom<ElementType>(arg2);
+ result = VectorMasking<decltype(Widen(ElementType{})), vta, vma>(
+ result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, 2 * index + 1, mask);
+ state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>();
+ }
+ }
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma,
+ typename... DstMaskType>
+ void OpVectorvx(uint8_t dst, uint8_t src1, ElementType arg2, DstMaskType... dst_mask) {
+ return OpVectorvx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src1, arg2, dst_mask...);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvx(uint8_t dst, uint8_t src1, ElementType arg2) {
+ if (!IsAligned<kRegistersInvolved>(dst | src1)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + index]);
+ SIMD128Register arg1(state_->cpu.v[src1 + index]);
+ result = VectorMasking<ElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <auto Intrinsic,
+ typename TargetElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorNarrowwr(uint8_t dst, uint8_t src) {
+ return OpVectorNarrowwr<Intrinsic,
+ TargetElementType,
+ NumberOfRegistersInvolved(vlmul),
+ NumRegistersInvolvedForWideOperand(vlmul),
+ vta,
+ vma>(dst, src);
+ }
+
+ template <auto Intrinsic,
+ typename TargetElementType,
+ size_t kDestRegistersInvolved,
+ size_t kSrcRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorNarrowwr(uint8_t dst, uint8_t src) {
+ if constexpr (kDestRegistersInvolved == kSrcRegistersInvolved) {
+ if (!IsAligned<kDestRegistersInvolved>(dst | src)) {
+ return Unimplemented();
+ }
+ } else if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSrcRegistersInvolved>(src)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ int8_t frm = GetCsr<CsrName::kFrm>();
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kDestRegistersInvolved; index++) {
+ SIMD128Register orig_result(state_->cpu.v[dst + index]);
+ SIMD128Register arg_low(state_->cpu.v[src + 2 * index]);
+ SIMD128Register intrinsic_result = std::get<0>(Intrinsic(frm, arg_low));
+ if constexpr (kSrcRegistersInvolved > 1) {
+ SIMD128Register arg_high(state_->cpu.v[src + 2 * index + 1]);
+ SIMD128Register result_high = std::get<0>(Intrinsic(frm, arg_high));
+ intrinsic_result = std::get<0>(
+ intrinsics::VMergeBottomHalfToTop<TargetElementType>(intrinsic_result, result_high));
+ }
+ auto result = VectorMasking<TargetElementType, vta, vma>(
+ orig_result, intrinsic_result, vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
+ }
+ }
+
+ // SEW = 2*SEW op SEW
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) {
+ return OpVectorNarrowwx<Intrinsic,
+ ElementType,
+ NumberOfRegistersInvolved(vlmul),
+ NumRegistersInvolvedForWideOperand(vlmul),
+ vta,
+ vma>(dst, src1, arg2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kDestRegistersInvolved,
+ size_t kSrcRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) {
+ if constexpr (kDestRegistersInvolved == kSrcRegistersInvolved) {
+ if (!IsAligned<kDestRegistersInvolved>(dst | src1)) {
+ return Unimplemented();
+ }
+ } else if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSrcRegistersInvolved>(src1)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kDestRegistersInvolved; index++) {
+ SIMD128Register orig_result(state_->cpu.v[dst + index]);
+ SIMD128Register arg1_low(state_->cpu.v[src1 + 2 * index]);
+ SIMD128Register intrinsic_result = std::get<0>(Intrinsic(arg1_low, arg2));
+
+ if constexpr (kSrcRegistersInvolved > 1) {
+ SIMD128Register arg1_high(state_->cpu.v[src1 + 2 * index + 1]);
+ SIMD128Register result_high = std::get<0>(Intrinsic(arg1_high, arg2));
+ intrinsic_result = std::get<0>(
+ intrinsics::VMergeBottomHalfToTop<ElementType>(intrinsic_result, result_high));
+ }
+
+ auto result = VectorMasking<ElementType, vta, vma>(
+ orig_result, intrinsic_result, vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
+ }
+ }
+
+ // SEW = 2*SEW op SEW
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ return OpVectorNarrowwv<Intrinsic,
+ ElementType,
+ NumberOfRegistersInvolved(vlmul),
+ NumRegistersInvolvedForWideOperand(vlmul),
+ vta,
+ vma>(dst, src1, src2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ size_t kFirstSrcRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) {
+ if constexpr (kRegistersInvolved == kFirstSrcRegistersInvolved) {
+ if (!IsAligned<kRegistersInvolved>(dst | src1 | src2)) {
+ return Unimplemented();
+ }
+ } else if (!IsAligned<kRegistersInvolved>(dst | src2) ||
+ !IsAligned<kFirstSrcRegistersInvolved>(src1)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; index++) {
+ SIMD128Register orig_result(state_->cpu.v[dst + index]);
+ SIMD128Register arg1_low(state_->cpu.v[src1 + 2 * index]);
+ SIMD128Register arg2_low(state_->cpu.v[src2 + index]);
+ SIMD128Register intrinsic_result = std::get<0>(Intrinsic(arg1_low, arg2_low));
+
+ if constexpr (kFirstSrcRegistersInvolved > 1) {
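+        // The wide operand spans two registers per destination register, while the narrow
+        // operand spans one: shift its upper half down to pair with the second wide register.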
+ SIMD128Register arg1_high(state_->cpu.v[src1 + 2 * index + 1]);
+ SIMD128Register arg2_high(state_->cpu.v[src2 + index] >> 64);
+ SIMD128Register result_high = std::get<0>(Intrinsic(arg1_high, arg2_high));
+ intrinsic_result = std::get<0>(
+ intrinsics::VMergeBottomHalfToTop<ElementType>(intrinsic_result, result_high));
+ }
+
+ auto result = VectorMasking<ElementType, vta, vma>(
+ orig_result, intrinsic_result, vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
+ }
+ }
+
+ template <auto Intrinsic,
+ typename DestElementType,
+ const uint8_t kFactor,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorVXUnary0(uint8_t dst, uint8_t src) {
+ static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8);
+ constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul);
+    constexpr size_t kSourceRegistersInvolved =
+        std::max<size_t>(kDestRegistersInvolved / kFactor, 1);
+ if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
+ return Unimplemented();
+ }
+    size_t vstart = GetCsr<CsrName::kVstart>();
+    size_t vl = GetCsr<CsrName::kVl>();
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ SetCsr<CsrName::kVstart>(0);
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) {
+ size_t src_index = dst_index / kFactor;
+ size_t src_elem = dst_index % kFactor;
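+      // Each destination register consumes a 128 / kFactor bit slice of the source register:
+      // shift the matching slice down to the bottom before widening.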
+ SIMD128Register result{state_->cpu.v[dst + dst_index]};
+ SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)};
+
+ result = VectorMasking<DestElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(arg)), vstart, vl, dst_index, mask);
+ state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>();
+ }
+ SetCsr<CsrName::kVstart>(0);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvxv(uint8_t dst, uint8_t src1, ElementType arg2) {
+ return OpVectorvxv<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src1, arg2);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma>
+ void OpVectorvxv(uint8_t dst, uint8_t src1, ElementType arg2) {
+ if (!IsAligned<kRegistersInvolved>(dst | src1)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
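+ // Note: unlike the other element-wise helpers, the intrinsic here also receives the
+ // current destination value, which destructive forms (e.g. multiply-accumulate style
+ // operations) can fold into the result.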
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + index]);
+ SIMD128Register arg1(state_->cpu.v[src1 + index]);
+ result = VectorMasking<ElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(arg1, arg2, result)), vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ VectorRegisterGroupMultiplier vlmul,
+ TailProcessing vta,
+ auto vma,
+ typename... DstMaskType>
+ void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
+ return OpVectorx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, arg2, dst_mask...);
+ }
+
+ template <auto Intrinsic,
+ typename ElementType,
+ size_t kRegistersInvolved,
+ TailProcessing vta,
+ auto vma,
+ typename... DstMaskType>
+ void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
+ static_assert(sizeof...(dst_mask) <= 1);
+ if (!IsAligned<kRegistersInvolved>(dst | (dst_mask | ... | 0))) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+ // vector register group, including that no tail elements are updated with agnostic values.
+ if (vstart >= vl) [[unlikely]] {
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + index]);
+ SIMD128Register result_mask;
+ if constexpr (sizeof...(DstMaskType) == 0) {
+ result_mask.Set(state_->cpu.v[dst + index]);
+ } else {
+ uint8_t dst_mask_unpacked[1] = {dst_mask...};
+ result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
+ }
+ result = VectorMasking<ElementType, vta, vma>(
+ result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
+ return OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src, offset);
+ }
+
+ template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
+ void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
+ constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
+ if (!IsAligned<kRegistersInvolved>(dst | src)) {
+ return Unimplemented();
+ }
+ // Source and destination must not intersect.
+ if (dst < (src + kRegistersInvolved) && src < (dst + kRegistersInvolved)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ if (vstart >= vl) [[unlikely]] {
+ // From 16.3: For all of the [slide instructions], if vstart >= vl, the
+ // instruction performs no operation and leaves the destination vector
+ // register unchanged.
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ // The slideup operation leaves elements 0 through MAX(vstart, OFFSET) - 1 unchanged.
+ const size_t start_elem_index = std::max<Register>(vstart, offset);
+
+ // From 16.3.1: Destination elements OFFSET through vl-1 are written if
+ // unmasked and if OFFSET < vl.
+ // However if OFFSET > vl, we still need to apply the tail policy (as
+ // clarified in https://github.com/riscv/riscv-v-spec/issues/263). Given
+ // that OFFSET could be well past vl we start at vl rather than OFFSET in
+ // that case.
+ for (size_t index = std::min(start_elem_index, vl) / kElementsPerRegister;
+ index < kRegistersInvolved;
+ ++index) {
+ SIMD128Register result(state_->cpu.v[dst + index]);
+
+ // Source registers that fall before the input group correspond to the first
+ // offset-amount result elements, which must remain undisturbed. We zero-initialize
+ // such arguments here, but their values are ultimately ignored thanks to the vstart
+ // masking inside VectorMasking.
+ ssize_t first_arg_disp = index - 1 - offset / kElementsPerRegister;
+ SIMD128Register arg1 =
+ (first_arg_disp < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp];
+ SIMD128Register arg2 =
+ (first_arg_disp + 1 < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp + 1];
+
+ result =
+ VectorMasking<ElementType, vta, vma>(result,
+ std::get<0>(intrinsics::VectorSlideUp<ElementType>(
+ offset % kElementsPerRegister, arg1, arg2)),
+ start_elem_index,
+ vl,
+ index,
+ mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
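+
+ // Worked example for OpVectorslideup above (an illustration, assuming VectorSlideUp
+ // shifts the concatenation of its two register arguments, as its name suggests): with
+ // 32-bit elements (kElementsPerRegister == 4) and offset == 5, destination register 2
+ // covers element indices 8..11 and must read source elements 3..6. Indeed,
+ // first_arg_disp == 2 - 1 - 5 / 4 == 0, so arg1 == v[src] (supplying element 3) and
+ // arg2 == v[src + 1] (supplying elements 4..6), slid up by offset % 4 == 1 element.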
+
+ template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
+ void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
+ return OpVectorslidedown<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
+ dst, src, offset);
+ }
+
+ template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
+ void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
+ constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
+ if (!IsAligned<kRegistersInvolved>(dst | src)) {
+ return Unimplemented();
+ }
+ // Source and destination must not intersect.
+ if (dst < (src + kRegistersInvolved) && src < (dst + kRegistersInvolved)) {
+ return Unimplemented();
+ }
+ size_t vstart = GetCsr<CsrName::kVstart>();
+ size_t vl = GetCsr<CsrName::kVl>();
+ SetCsr<CsrName::kVstart>(0);
+ if (vstart >= vl) [[unlikely]] {
+ // From 16.3: For all of the [slide instructions], if vstart >= vl, the
+ // instruction performs no operation and leaves the destination vector
+ // register unchanged.
+ return;
+ }
+ auto mask = GetMaskForVectorOperations<vma>();
+ for (size_t index = 0; index < kRegistersInvolved; ++index) {
+ SIMD128Register result(state_->cpu.v[dst + index]);
+
+ size_t first_arg_disp = index + offset / kElementsPerRegister;
+ SIMD128Register arg1 = (first_arg_disp >= kRegistersInvolved)
+ ? SIMD128Register{0}
+ : state_->cpu.v[src + first_arg_disp];
+ SIMD128Register arg2 = (first_arg_disp + 1 >= kRegistersInvolved)
+ ? SIMD128Register{0}
+ : state_->cpu.v[src + first_arg_disp + 1];
+
+ result =
+ VectorMasking<ElementType, vta, vma>(result,
+ std::get<0>(intrinsics::VectorSlideDown<ElementType>(
+ offset % kElementsPerRegister, arg1, arg2)),
+ vstart,
+ vl,
+ index,
+ mask);
+ state_->cpu.v[dst + index] = result.Get<__uint128_t>();
+ }
+ }
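+
+ // Worked example for OpVectorslidedown above (an illustration): with 32-bit elements
+ // (kElementsPerRegister == 4) and offset == 5, destination register 0 covers element
+ // indices 0..3 and must read source elements 5..8. Indeed,
+ // first_arg_disp == 0 + 5 / 4 == 1, so arg1 == v[src + 1] (supplying elements 5..7)
+ // and arg2 == v[src + 2] (supplying element 8), slid down by offset % 4 == 1 element.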
+
+ // Helper function needed to generate a bitmask result from non-bitmask inputs.
+ // We process between 1 and 8 registers here, and each register produces between 2 bits
+ // (for 64-bit elements) and 16 bits (for 8-bit elements) of the bitmask; these pieces
+ // are then combined into the final result (between 2 and 128 bits long).
+ // Note that we are not handling the tail here! Those bits remain undefined and should
+ // be handled later.
+ // TODO(b/317757595): Add separate tests to verify the logic.
+ template <typename ElementType, size_t kRegistersInvolved, typename Intrinsic>
+ SIMD128Register CollectBitmaskResult(Intrinsic intrinsic) {
+ // We employ two distinct tactics to handle all possibilities:
+ // 1. For 8-bit/16-bit elements we get a full UInt8/UInt16 result and can thus use
+ //    SIMD128Register::Set.
+ // 2. For 32-bit/64-bit elements each call yields only 2 or 4 bits, so we have to use
+ //    shifts to accumulate the result.
+ // Since each of the (up to 8) partial results is at most 4 bits, the total bitmask
+ // fits into 32 bits.
+ std::conditional_t<sizeof(ElementType) < sizeof(UInt32), SIMD128Register, UInt32>
+ bitmask_result{};
+ for (UInt32 index = UInt32{0}; index < UInt32(kRegistersInvolved); index += UInt32{1}) {
+ const auto [raw_result] =
+ intrinsics::SimdMaskToBitMask<ElementType>(std::get<0>(intrinsic(index)));
+ if constexpr (sizeof(ElementType) < sizeof(UInt32)) {
+ bitmask_result.Set(raw_result, index);
+ } else {
+ constexpr UInt32 kElemNum =
+ UInt32{static_cast<uint32_t>((sizeof(SIMD128Register) / sizeof(ElementType)))};
+ bitmask_result |= UInt32(UInt8(raw_result)) << (index * kElemNum);
+ }
+ }
+ return SIMD128Register(bitmask_result);
+ }
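+
+ // Worked example for CollectBitmaskResult above (an illustration): with 64-bit
+ // elements each register contributes kElemNum == 2 bits, so for
+ // kRegistersInvolved == 8 the bits from register index 3 land at bit positions 6..7
+ // and the accumulated bitmask occupies the low 16 bits of the result.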
+
+ void Nop() {}
+
+ void Unimplemented() {
+ UndefinedInsn(GetInsnAddr());
+ // If there is a guest handler registered for SIGILL we'll delay its processing until the next
+ // sync point (likely the main dispatching loop) due to enabled pending signals. Thus we must
+ // ensure that insn_addr isn't automatically advanced in FinalizeInsn.
+ exception_raised_ = true;
+ }
+
+ //
+ // Guest state getters/setters.
+ //
+
+ Register GetReg(uint8_t reg) const {
+ CheckRegIsValid(reg);
+ return state_->cpu.x[reg];
+ }
+
+ Register GetRegOrZero(uint8_t reg) { return reg == 0 ? 0 : GetReg(reg); }
+
+ void SetReg(uint8_t reg, Register value) {
+ if (exception_raised_) {
+ // Do not produce side effects.
+ return;
+ }
+ CheckRegIsValid(reg);
+ state_->cpu.x[reg] = value;
+ }
+
+ void SetRegOrIgnore(uint8_t reg, Register value) {
+ if (reg != 0) {
+ SetReg(reg, value);
+ }
+ }
+
+ FpRegister GetFpReg(uint8_t reg) const {
+ CheckFpRegIsValid(reg);
+ return state_->cpu.f[reg];
+ }
+
+ template <typename FloatType>
+ FpRegister GetFRegAndUnboxNan(uint8_t reg);
+
+ template <typename FloatType>
+ void NanBoxAndSetFpReg(uint8_t reg, FpRegister value);
+
+ //
+ // Various helper methods.
+ //
+
+ template <CsrName kName>
+ [[nodiscard]] Register GetCsr() const {
+ return state_->cpu.*CsrFieldAddr<kName>;
+ }
+
+ template <CsrName kName>
+ void SetCsr(Register arg) {
+ if (exception_raised_) {
+ return;
+ }
+ state_->cpu.*CsrFieldAddr<kName> = arg & kCsrMask<kName>;
+ }
+
+ [[nodiscard]] uint64_t GetImm(uint64_t imm) const { return imm; }
+
+ [[nodiscard]] Register Copy(Register value) const { return value; }
+
+ [[nodiscard]] GuestAddr GetInsnAddr() const { return state_->cpu.insn_addr; }
+
+ void FinalizeInsn(uint8_t insn_len) {
+ if (!branch_taken_ && !exception_raised_) {
+ state_->cpu.insn_addr += insn_len;
+ }
+ }
+
+#include "berberis/intrinsics/interpreter_intrinsics_hooks-inl.h"
+
+ private:
+ template <typename DataType>
+ Register Load(const void* ptr) {
+ static_assert(std::is_integral_v<DataType>);
+ CHECK(!exception_raised_);
+ FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
+ if (result.is_fault) {
+ exception_raised_ = true;
+ return {};
+ }
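+ // The cast to DataType truncates the raw value to the access width; for a signed
+ // DataType the implicit conversion back to the 64-bit Register then sign-extends it,
+ // matching RISC-V load semantics.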
+ return static_cast<DataType>(result.value);
+ }
+
+ template <typename DataType>
+ void Store(void* ptr, uint64_t data) {
+ static_assert(std::is_integral_v<DataType>);
+ CHECK(!exception_raised_);
+ exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
+ }
+
+ void CheckShamtIsValid(int8_t shamt) const {
+ CHECK_GE(shamt, 0);
+ CHECK_LT(shamt, 64);
+ }
+
+ void CheckShamt32IsValid(int8_t shamt) const {
+ CHECK_GE(shamt, 0);
+ CHECK_LT(shamt, 32);
+ }
+
+ void CheckRegIsValid(uint8_t reg) const {
+ CHECK_GT(reg, 0u);
+ CHECK_LT(reg, std::size(state_->cpu.x));
+ }
+
+ void CheckFpRegIsValid(uint8_t reg) const { CHECK_LT(reg, std::size(state_->cpu.f)); }
+
+ template <bool kUseMasking>
+ std::conditional_t<kUseMasking, SIMD128Register, intrinsics::NoInactiveProcessing>
+ GetMaskForVectorOperationsIfNeeded() {
+ if constexpr (kUseMasking) {
+ return {state_->cpu.v[0]};
+ } else {
+ return intrinsics::NoInactiveProcessing{};
+ }
+ }
+
+ template <auto vma>
+ std::conditional_t<std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
+ intrinsics::NoInactiveProcessing,
+ SIMD128Register>
+ GetMaskForVectorOperations() {
+ return GetMaskForVectorOperationsIfNeeded<
+ !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>();
+ }
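+
+ // In other words: for an unmasked operation (vma == intrinsics::NoInactiveProcessing{})
+ // no mask is materialized at all; otherwise the mask register v0 is read once and then
+ // reused for every register in the group.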
+
+ template <typename ElementType,
+ TailProcessing vta,
+ auto vma = intrinsics::NoInactiveProcessing{},
+ typename MaskType = intrinsics::NoInactiveProcessing>
+ SIMD128Register VectorMasking(SIMD128Register dest,
+ SIMD128Register result,
+ size_t vstart,
+ size_t vl,
+ size_t index,
+ MaskType mask = intrinsics::NoInactiveProcessing{}) {
+ return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
+ dest,
+ result,
+ vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
+ vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
+ std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
+ }
+
+ template <typename ElementType,
+ TailProcessing vta,
+ auto vma = intrinsics::NoInactiveProcessing{},
+ typename MaskType = intrinsics::NoInactiveProcessing>
+ SIMD128Register VectorMasking(SIMD128Register dest,
+ SIMD128Register result,
+ SIMD128Register result_mask,
+ size_t vstart,
+ size_t vl,
+ size_t index,
+ MaskType mask = intrinsics::NoInactiveProcessing{}) {
+ return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
+ dest,
+ result,
+ result_mask,
+ vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
+ vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
+ std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
+ }
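+
+ // Worked example for the VectorMasking wrappers above (an illustration): with 32-bit
+ // elements (4 per register), vstart == 2 and vl == 10, the register at index 2 sees
+ // the rebased vl of 10 - 8 == 2, so its elements 0..1 are body elements and 2..3 are
+ // tail elements; the rebased vstart (2 - 8) falls before the register, meaning no
+ // leading elements are skipped (the subtraction presumably being treated as signed
+ // further down the line).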
+
+ ThreadState* state_;
+ bool branch_taken_;
+ // This flag is set by illegal instructions and faulted memory accesses. The former must always
+ // stop the playback of the current instruction, so we don't need to do anything special. The
+ // latter may result in having more operations with side effects called before the end of the
+ // current instruction:
+ // Load (faulted) -> SetReg
+ // LoadFp (faulted) -> NanBoxAndSetFpReg
+ // If an exception is raised before these operations, we skip them. For all other operations with
+ // side-effects we check that this flag is never raised.
+ bool exception_raised_;
+};
+
+template <>
+[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFCsr>() const {
+ return FeGetExceptions() | (state_->cpu.frm << 5);
+}
+
+template <>
+[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFFlags>() const {
+ return FeGetExceptions();
+}
+
+template <>
+[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVlenb>() const {
+ return 16;
+}
+
+template <>
+[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxrm>() const {
+ return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11;
+}
+
+template <>
+[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxsat>() const {
+ return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2;
+}
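+
+// Layout note: vcsr packs vxrm in bits [1:0] and vxsat in bit 2, which is why the
+// accessors above and below mask with 0b11 and shift by 2.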
+
+template <>
+void inline Interpreter::SetCsr<CsrName::kFCsr>(Register arg) {
+ CHECK(!exception_raised_);
+ FeSetExceptions(arg & 0b1'1111);
+ arg = (arg >> 5) & kCsrMask<CsrName::kFrm>;
+ state_->cpu.frm = arg;
+ FeSetRound(arg);
+}
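+
+// Layout note: fcsr packs the accrued exception flags (fflags) in bits [4:0] and the
+// dynamic rounding mode (frm) in bits [7:5]; the fcsr accessors split and recombine
+// these two fields accordingly.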
+
+template <>
+void inline Interpreter::SetCsr<CsrName::kFFlags>(Register arg) {
+ CHECK(!exception_raised_);
+ FeSetExceptions(arg & 0b1'1111);
+}
+
+template <>
+void inline Interpreter::SetCsr<CsrName::kFrm>(Register arg) {
+ CHECK(!exception_raised_);
+ arg &= kCsrMask<CsrName::kFrm>;
+ state_->cpu.frm = arg;
+ FeSetRound(arg);
+}
+
+template <>
+void inline Interpreter::SetCsr<CsrName::kVxrm>(Register arg) {
+ CHECK(!exception_raised_);
+ state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
+ (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11);
+}
+
+template <>
+void inline Interpreter::SetCsr<CsrName::kVxsat>(Register arg) {
+ CHECK(!exception_raised_);
+ state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
+ (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2);
+}
+
+template <>
+[[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(
+ uint8_t reg) {
+ CheckFpRegIsValid(reg);
+ FpRegister value = state_->cpu.f[reg];
+ return UnboxNan<Float32>(value);
+}
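+
+// NaN-boxing note: a properly boxed Float32 value occupies the low 32 bits of the
+// 64-bit register with the upper 32 bits all ones; per the RISC-V NaN-boxing rule,
+// UnboxNan presumably yields the canonical NaN when the upper bits do not match that
+// pattern, while NanBox<Float32> (re)establishes it.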
+
+template <>
+[[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(
+ uint8_t reg) {
+ CheckFpRegIsValid(reg);
+ return state_->cpu.f[reg];
+}
+
+template <>
+void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) {
+ if (exception_raised_) {
+ // Do not produce side effects.
+ return;
+ }
+ CheckFpRegIsValid(reg);
+ state_->cpu.f[reg] = NanBox<Float32>(value);
+}
+
+template <>
+void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) {
+ if (exception_raised_) {
+ // Do not produce side effects.
+ return;
+ }
+ CheckFpRegIsValid(reg);
+ state_->cpu.f[reg] = value;
+}
+
+#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args);
+template <>
+extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args);
+#endif
+
+} // namespace berberis