diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2024-03-04 22:09:57 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2024-03-04 22:09:57 +0000 |
commit | cb71633fbed62d26757f32be25ebecc388032c40 (patch) | |
tree | 3bce53cd9c8a5d969d4feecee79c597663702a70 | |
parent | cd384f706cd4c14148da0a9d2509dc49bde7ccd7 (diff) | |
parent | c44a48ec868518cc6947476cc69cb368e1b9b594 (diff) | |
download | binary_translation-simpleperf-release.tar.gz |
Snap for 11526323 from c44a48ec868518cc6947476cc69cb368e1b9b594 to simpleperf-releasesimpleperf-release
Change-Id: I8ee6b3c97f6dc6eef08c5cbb0ffa69ad63f5cdce
24 files changed, 1626 insertions, 348 deletions
diff --git a/base/bit_util_test.cc b/base/bit_util_test.cc index 79fde7a4..9ebf1e1f 100644 --- a/base/bit_util_test.cc +++ b/base/bit_util_test.cc @@ -153,11 +153,44 @@ static_assert(std::is_same_v<Int16, UInt16::SignedType>); static_assert(std::is_same_v<UInt16, Int16::UnsignedType>); static_assert(std::is_same_v<UInt16, UInt16::UnsignedType>); +static_assert(std::is_same_v<Int16, SignedType<RawInt16>>); +static_assert(std::is_same_v<Int16, SignedType<Int16>>); +static_assert(std::is_same_v<Int16, SignedType<UInt16>>); +static_assert(std::is_same_v<UInt16, UnsignedType<RawInt16>>); +static_assert(std::is_same_v<UInt16, UnsignedType<Int16>>); +static_assert(std::is_same_v<UInt16, UnsignedType<UInt16>>); + +static_assert(std::is_same_v<Int16, WrappingType<Int16>>); +static_assert(std::is_same_v<UInt16, WrappingType<UInt16>>); +static_assert(std::is_same_v<Int16, WrappingType<SatInt16>>); +static_assert(std::is_same_v<UInt16, WrappingType<SatUInt16>>); + static_assert(std::is_same_v<SatInt16, SatInt16::SignedType>); static_assert(std::is_same_v<SatInt16, SatUInt16::SignedType>); static_assert(std::is_same_v<SatUInt16, SatInt16::UnsignedType>); static_assert(std::is_same_v<SatUInt16, SatUInt16::UnsignedType>); +static_assert(std::is_same_v<SatInt16, SignedType<SatInt16>>); +static_assert(std::is_same_v<SatInt16, SignedType<SatUInt16>>); +static_assert(std::is_same_v<SatUInt16, UnsignedType<SatInt16>>); +static_assert(std::is_same_v<SatUInt16, UnsignedType<SatUInt16>>); + +static_assert(std::is_same_v<SatInt16, SaturatingType<Int16>>); +static_assert(std::is_same_v<SatUInt16, SaturatingType<UInt16>>); +static_assert(std::is_same_v<SatInt16, SaturatingType<SatInt16>>); +static_assert(std::is_same_v<SatUInt16, SaturatingType<SatUInt16>>); + +static_assert(std::is_same_v<SatInt16, SignedType<SatInt16>>); +static_assert(std::is_same_v<SatInt16, SignedType<SatUInt16>>); +static_assert(std::is_same_v<SatUInt16, UnsignedType<SatInt16>>); 
+static_assert(std::is_same_v<SatUInt16, UnsignedType<SatUInt16>>); + +static_assert(std::is_same_v<RawInt16, RawType<RawInt16>>); +static_assert(std::is_same_v<RawInt16, RawType<Int16>>); +static_assert(std::is_same_v<RawInt16, RawType<UInt16>>); +static_assert(std::is_same_v<RawInt16, RawType<SatInt16>>); +static_assert(std::is_same_v<RawInt16, RawType<SatUInt16>>); + } // namespace } // namespace berberis diff --git a/base/include/berberis/base/bit_util.h b/base/include/berberis/base/bit_util.h index ddf4f91c..b3d287ea 100644 --- a/base/include/berberis/base/bit_util.h +++ b/base/include/berberis/base/bit_util.h @@ -271,6 +271,9 @@ class WrappedFloatType; } // namespace intrinsics +template <typename T> +struct TypeTraits; + // Raw integers. Used to carry payload, which may be be EXPLICITLY converted to Saturating // integer, Wrapping integer, or WrappedFloatType. // @@ -697,6 +700,12 @@ using UInt128 = Wrapping<unsigned __int128>; #endif template <typename IntType> +[[nodiscard]] auto constexpr BitCastToSigned(Raw<IntType> src) -> + typename Wrapping<IntType>::SignedType { + return {static_cast<std::make_signed_t<IntType>>(src.value)}; +} + +template <typename IntType> [[nodiscard]] auto constexpr BitCastToSigned(Saturating<IntType> src) -> typename Saturating<IntType>::SignedType { return {static_cast<std::make_signed_t<IntType>>(src.value)}; @@ -712,6 +721,12 @@ template <typename T> using SignedType = decltype(BitCastToSigned(std::declval<T>())); template <typename IntType> +[[nodiscard]] auto constexpr BitCastToUnsigned(Raw<IntType> src) -> + typename Wrapping<IntType>::UnsignedType { + return {static_cast<std::make_unsigned_t<IntType>>(src.value)}; +} + +template <typename IntType> [[nodiscard]] auto constexpr BitCastToUnsigned(Saturating<IntType> src) -> typename Saturating<IntType>::UnsignedType { return {static_cast<std::make_unsigned_t<IntType>>(src.value)}; @@ -726,6 +741,86 @@ template <typename IntType> template <typename T> using UnsignedType = 
decltype(BitCastToUnsigned(std::declval<T>())); +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToSaturating(Saturating<IntType> src) -> Saturating<IntType> { + return src; +} + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToSaturating(Wrapping<IntType> src) -> Saturating<IntType> { + return {src.value}; +} + +template <typename T> +using SaturatingType = decltype(BitCastToSaturating(std::declval<T>())); + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToWrapping(Saturating<IntType> src) -> Wrapping<IntType> { + return {src.value}; +} + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToWrapping(Wrapping<IntType> src) -> Wrapping<IntType> { + return src; +} + +template <typename T> +using WrappingType = decltype(BitCastToWrapping(std::declval<T>())); + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToRaw(Raw<IntType> src) -> Raw<IntType> { + return src; +} + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToRaw(Saturating<IntType> src) + -> Raw<std::make_unsigned_t<IntType>> { + return {static_cast<std::make_unsigned_t<IntType>>(src.value)}; +} + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToRaw(Wrapping<IntType> src) + -> Raw<std::make_unsigned_t<IntType>> { + return {static_cast<std::make_unsigned_t<IntType>>(src.value)}; +} + +template <typename BaseType> +[[nodiscard]] constexpr auto BitCastToRaw(intrinsics::WrappedFloatType<BaseType> src) + -> Raw<std::make_unsigned_t<typename TypeTraits<intrinsics::WrappedFloatType<BaseType>>::Int>> { + return {bit_cast< + std::make_unsigned_t<typename TypeTraits<intrinsics::WrappedFloatType<BaseType>>::Int>>(src)}; +} + +template <typename T> +using RawType = decltype(BitCastToRaw(std::declval<T>())); + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToFloat(Raw<IntType> src) -> + typename TypeTraits<IntType>::Float { + return bit_cast<typename 
TypeTraits<IntType>::Float>(src.value); +} + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToFloat(Saturating<IntType> src) -> + typename TypeTraits<IntType>::Float { + return bit_cast<typename TypeTraits<IntType>::Float>(src.value); +} + +template <typename IntType> +[[nodiscard]] auto constexpr BitCastToFloat(Wrapping<IntType> src) -> + typename TypeTraits<IntType>::Float { + return bit_cast<typename TypeTraits<IntType>::Float>(src.value); +} + +template <typename BaseType> +[[nodiscard]] constexpr auto BitCastToFloat(intrinsics::WrappedFloatType<BaseType> src) + -> intrinsics::WrappedFloatType<BaseType> { + return src; +} + +template <typename T> +using FloatType = decltype(BitCastToFloat(std::declval<T>())); + template <typename ResultType, typename IntType> [[nodiscard]] auto constexpr MaybeTruncateTo(IntType src) -> std::enable_if_t<std::is_integral_v<IntType> && @@ -774,9 +869,6 @@ template <typename ResultType, typename IntType> return ResultType{static_cast<ResultType::BaseType>(src.value)}; } -template <typename T> -struct TypeTraits; - template <typename BaseType> [[nodiscard]] constexpr auto Widen(Saturating<BaseType> source) -> Saturating<typename TypeTraits<BaseType>::Wide> { @@ -789,6 +881,12 @@ template <typename BaseType> return {source.value}; } +template <typename BaseType> +[[nodiscard]] constexpr auto Widen(intrinsics::WrappedFloatType<BaseType> source) + -> Wrapping<typename TypeTraits<intrinsics::WrappedFloatType<BaseType>>::Wide> { + return {source.value}; +} + template <typename T> using WideType = decltype(Widen(std::declval<T>())); @@ -812,6 +910,12 @@ template <typename BaseType> return {static_cast<typename TypeTraits<BaseType>::Narrow>(source.value)}; } +template <typename BaseType> +[[nodiscard]] constexpr auto Narrow(intrinsics::WrappedFloatType<BaseType> source) + -> Wrapping<typename TypeTraits<intrinsics::WrappedFloatType<BaseType>>::Narrow> { + return {source.value}; +} + template <typename T> using NarrowType 
= decltype(Narrow(std::declval<T>())); diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h index c9cabf68..a150d048 100644 --- a/decoder/include/berberis/decoder/riscv64/decoder.h +++ b/decoder/include/berberis/decoder/riscv64/decoder.h @@ -234,7 +234,7 @@ class Decoder { kEbreak = 0b000000000001'00000'000'00000, }; - enum class VLoadUnitStrideOpcode : uint8_t { + enum class VLUmOpOpcode : uint8_t { kVleXX = 0b00000, kVlXreXX = 0b01000, kVleXXff = 0b10000, @@ -295,8 +295,8 @@ class Decoder { kVfsgnjnvv = 0b001001, kVfsgnjxvv = 0b001010, kVfmvfs = 0b010000, - kVfcvtXX = 0b010010, - kVXXXv = 0b010011, // Vfsqrt.v/Vfrsqrt7.v/Vfrec7.v/Vfclass.v + kVFUnary0 = 0b010010, + kVFUnary1 = 0b010011, kVmfeqvv = 0b011000, kVmflevv = 0b011001, kVmfltvv = 0b011011, @@ -345,7 +345,7 @@ class Decoder { kVsadduvi = 0b100000, kVsaddvi = 0b100001, kVsllvi = 0b100101, - kVmvvi = 0b100111, + kVmvXrv = 0b100111, kVsrlvi = 0b101000, kVsravi = 0b101001, kVssrlvi = 0b101010, @@ -414,9 +414,9 @@ class Decoder { kVmnandmm = 0b011101, kVmnormm = 0b011110, kVmxnormm = 0b011111, - kVXmXXs = 0b010000, - kVmsXf = 0b010100, - kVxunary0 = 0b010010, + kVWXUnary0 = 0b010000, + kVMUnary0 = 0b010100, + kVFUnary0 = 0b010010, kVmulhuvv = 0b100100, kVmulvv = 0b100101, kVmulhsuvv = 0b100110, @@ -475,7 +475,7 @@ class Decoder { }; enum class VOpMVxOpcode : uint8_t { - kVXmXXx = 0b010000, + kVRXUnary0 = 0b010000, kVmulhuvx = 0b100100, kVmulvx = 0b100101, kVmulhsuvx = 0b100110, @@ -486,30 +486,54 @@ class Decoder { kVnmsacvx = 0b101111, }; - enum class VStoreUnitStrideOpcode : uint8_t { + enum class VSUmOpOpcode : uint8_t { kVseXX = 0b00000, kVsX = 0b01000, kVsm = 0b01011, }; - enum class VXmXXxOpcode : uint8_t { + enum class VFUnary0Opcode : uint8_t { + kVfcvtxufv = 0b00000, + kVfcvtxfv = 0b00001, + kVfcvtfxuv = 0b00010, + kVfcvtfxv = 0b00011, + kVfcvtrtzxufv = 0b00110, + kVfcvtrtzxfv = 0b00111, + kVfwcvtxufv = 0b01000, + kVfwcvtxfv = 0b01001, + 
kVfwcvtfxuv = 0b01010, + kVfwcvtfxv = 0b01011, + kVfwcvtffv = 0b01100, + kVfwcvtrtzxufv = 0b01110, + kVfwcvtrtzxfv = 0b01111, + kVfncvtxufw = 0b10000, + kVfncvtxfw = 0b10001, + kVfncvtfxuw = 0b10010, + kVfncvtfxw = 0b10011, + kVfncvtffw = 0b10100, + kVfncvtrodffw = 0b10101, + kVfncvtrtzxufw = 0b10110, + kVfncvtrtzxfw = 0b10111, + }; + + enum class VRXUnary0Opcode : uint8_t { kVmvsx = 0b00000, }; - enum class VXmXXsOpcode : uint8_t { + enum class VWXUnary0Opcode : uint8_t { kVmvxs = 0b00000, kVcpopm = 0b10000, kVfirstm = 0b10001, }; - enum class VmsXfOpcode : uint8_t { + enum class VMUnary0Opcode : uint8_t { kVmsbfm = 0b00001, kVmsofm = 0b00010, kVmsifm = 0b00011, kVidv = 0b10001, }; - enum class Vxunary0Opcode : uint8_t { + enum class VXUnary0Opcode : uint8_t { kVzextvf8m = 0b00010, kVsextvf8m = 0b00011, kVzextvf4m = 0b00100, @@ -774,7 +798,7 @@ class Decoder { }; struct VLoadUnitStrideArgs { - VLoadUnitStrideOpcode opcode; + VLUmOpOpcode opcode; MemoryDataOperandType width; bool vm; uint8_t nf; @@ -795,7 +819,10 @@ class Decoder { bool vm; uint8_t dst; uint8_t src1; - uint8_t src2; + union { + VFUnary0Opcode vfunary0_opcode; + uint8_t src2; + }; }; struct VOpIViArgs { @@ -820,9 +847,9 @@ class Decoder { uint8_t dst; uint8_t src1; union { - VXmXXsOpcode vXmXXs_opcode; - VmsXfOpcode vmsXf_opcode; - Vxunary0Opcode vxunary0_opcode; + VWXUnary0Opcode vwxunary0_opcode; + VMUnary0Opcode vmunary0_opcode; + VXUnary0Opcode vxunary0_opcode; uint8_t src2; }; }; @@ -840,7 +867,7 @@ class Decoder { bool vm; uint8_t dst; union { - VXmXXxOpcode vXmXXx_opcode; + VRXUnary0Opcode vrxunary0_opcode; uint8_t src1; }; uint8_t src2; @@ -885,7 +912,7 @@ class Decoder { }; struct VStoreUnitStrideArgs { - VStoreUnitStrideOpcode opcode; + VSUmOpOpcode opcode; MemoryDataOperandType width; bool vm; uint8_t nf; @@ -1637,7 +1664,7 @@ class Decoder { switch (GetBits<26, 2>()) { case 0b00: { const VLoadUnitStrideArgs args = { - .opcode = VLoadUnitStrideOpcode{GetBits<20, 5>()}, + .opcode = 
VLUmOpOpcode{GetBits<20, 5>()}, .width = decoded_operand_type.eew, .vm = GetBits<25, 1>(), .nf = GetBits<29, 3>(), @@ -1701,7 +1728,7 @@ class Decoder { switch (GetBits<26, 2>()) { case 0b00: { const VStoreUnitStrideArgs args = { - .opcode = VStoreUnitStrideOpcode{GetBits<20, 5>()}, + .opcode = VSUmOpOpcode{GetBits<20, 5>()}, .width = decoded_operand_type.eew, .vm = GetBits<25, 1>(), .nf = GetBits<29, 3>(), diff --git a/guest_state/Android.bp b/guest_state/Android.bp index 2a8b2a21..26ee4c21 100644 --- a/guest_state/Android.bp +++ b/guest_state/Android.bp @@ -24,9 +24,11 @@ cc_library_headers { export_include_dirs: ["include"], header_libs: [ "libberberis_base_headers", + "native_bridge_guest_state_headers", ], export_header_lib_headers: [ "libberberis_base_headers", + "native_bridge_guest_state_headers", ], } diff --git a/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h b/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h index 9c7f9044..6f87af8b 100644 --- a/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h +++ b/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h @@ -27,11 +27,10 @@ #include "berberis/base/macros.h" #include "berberis/guest_state/guest_addr.h" #include "berberis/guest_state/guest_state_opaque.h" +#include "native_bridge_support/riscv64/guest_state/guest_state_cpu_state.h" namespace berberis { -using Reservation = uint64_t; - enum class CsrName { kFFlags = 0b00'00'0000'0001, kFrm = 0b00'00'0000'0010, @@ -69,48 +68,6 @@ enum class CsrName { BERBERIS_RISV64_PROCESS_NOSTORAGE_CSR(Vxsat), BERBERIS_RISV64_PROCESS_NOSTORAGE_CSR(Vxrm), \ BERBERIS_RISV64_PROCESS_NOSTORAGE_CSR(Vlenb) -struct CPUState { - // x0 to x31. - uint64_t x[32]; - // f0 to f31. We are using uint64_t because C++ may change values of NaN when they are passed from - // or to function and RISC-V uses NaN-boxing which would make things problematic. - uint64_t f[32]; - // v0 to v32. 
We only support 128bit vectors for now. - alignas(16) __uint128_t v[32]; - - GuestAddr insn_addr; - - GuestAddr reservation_address; - Reservation reservation_value; - - // Technically only 9 bits are defined: sign bit and 8 low bits. - // But for performance reason it's easier to keep full 64bits in this variable. - uint64_t vtype; - // This register usually contains zero and each vector instruction would reset it to zero. - // But it's allowed to change it and if that happens we are supposed to support it. - uint8_t vstart; - // This register is usually set to process full 128 bits set of SIMD data. - // But it's allowed to change it and if that happens we are supposed to support it. - uint8_t vl; - // Only 3 bits are defined but we allocate full byte to simplify implementation. - uint8_t vcsr; - // RISC-V has five rounding modes, while x86-64 has only four. - // - // Extra rounding mode (RMM in RISC-V documentation) is emulated but requires the use of - // FE_TOWARDZERO mode for correct work. - // - // Additionally RISC-V implementation is supposed to support three “illegal” rounding modes and - // when they are selected all instructions which use rounding mode trigger “undefined instruction” - // exception. - // - // For simplicity we always keep full rounding mode (3 bits) in the frm field and set host - // rounding mode to appropriate one. - // - // Exceptions, on the other hand, couldn't be stored here efficiently, instead we rely on the fact - // that x86-64 implements all five exceptions that RISC-V needs (and more). 
- uint8_t frm; -}; - -static_assert(std::is_standard_layout_v<CPUState>); constexpr uint32_t kNumGuestRegs = std::size(CPUState{}.x); diff --git a/interpreter/Android.bp b/interpreter/Android.bp index 98aabc65..ed9ca278 100644 --- a/interpreter/Android.bp +++ b/interpreter/Android.bp @@ -28,6 +28,7 @@ cc_library_static { name: "libberberis_interpreter_riscv64", defaults: ["berberis_defaults_64"], host_supported: true, + cflags: ["-DBERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS"], header_libs: [ "libberberis_base_headers", "libberberis_decoder_riscv64_headers", @@ -44,7 +45,22 @@ cc_library_static { srcs: ["riscv64/faulty_memory_accesses_x86_64.cc"], }, }, - srcs: ["riscv64/interpreter.cc"], + srcs: [ + "riscv64/interpreter-main.cc", + "riscv64/interpreter-VLoadIndexedArgs.cc", + "riscv64/interpreter-VLoadStrideArgs.cc", + "riscv64/interpreter-VLoadUnitStrideArgs.cc", + "riscv64/interpreter-VOpFVfArgs.cc", + "riscv64/interpreter-VOpFVvArgs.cc", + "riscv64/interpreter-VOpIViArgs.cc", + "riscv64/interpreter-VOpIVvArgs.cc", + "riscv64/interpreter-VOpIVxArgs.cc", + "riscv64/interpreter-VOpMVvArgs.cc", + "riscv64/interpreter-VOpMVxArgs.cc", + "riscv64/interpreter-VStoreIndexedArgs.cc", + "riscv64/interpreter-VStoreStrideArgs.cc", + "riscv64/interpreter-VStoreUnitStrideArgs.cc", + ], } cc_test_library { diff --git a/interpreter/riscv64/interpreter-VLoadIndexedArgs.cc b/interpreter/riscv64/interpreter-VLoadIndexedArgs.cc new file mode 100644 index 00000000..6dd0f19d --- /dev/null +++ b/interpreter/riscv64/interpreter-VLoadIndexedArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VLoadStrideArgs.cc b/interpreter/riscv64/interpreter-VLoadStrideArgs.cc new file mode 100644 index 00000000..ab8d78c5 --- /dev/null +++ b/interpreter/riscv64/interpreter-VLoadStrideArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VLoadUnitStrideArgs.cc b/interpreter/riscv64/interpreter-VLoadUnitStrideArgs.cc new file mode 100644 index 00000000..63168fbd --- /dev/null +++ b/interpreter/riscv64/interpreter-VLoadUnitStrideArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpFVfArgs.cc b/interpreter/riscv64/interpreter-VOpFVfArgs.cc new file mode 100644 index 00000000..4701cef8 --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpFVfArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpFVvArgs.cc b/interpreter/riscv64/interpreter-VOpFVvArgs.cc new file mode 100644 index 00000000..7b99809c --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpFVvArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpIViArgs.cc b/interpreter/riscv64/interpreter-VOpIViArgs.cc new file mode 100644 index 00000000..f6ff4df1 --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpIViArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpIVvArgs.cc b/interpreter/riscv64/interpreter-VOpIVvArgs.cc new file mode 100644 index 00000000..5fe0e03c --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpIVvArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpIVxArgs.cc b/interpreter/riscv64/interpreter-VOpIVxArgs.cc new file mode 100644 index 00000000..f8cbecf7 --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpIVxArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpMVvArgs.cc b/interpreter/riscv64/interpreter-VOpMVvArgs.cc new file mode 100644 index 00000000..d3bd73d6 --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpMVvArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VOpMVxArgs.cc b/interpreter/riscv64/interpreter-VOpMVxArgs.cc new file mode 100644 index 00000000..4dc7d6dc --- /dev/null +++ b/interpreter/riscv64/interpreter-VOpMVxArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VStoreIndexedArgs.cc b/interpreter/riscv64/interpreter-VStoreIndexedArgs.cc new file mode 100644 index 00000000..1d8eb0ee --- /dev/null +++ b/interpreter/riscv64/interpreter-VStoreIndexedArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VStoreStrideArgs.cc b/interpreter/riscv64/interpreter-VStoreStrideArgs.cc new file mode 100644 index 00000000..9e32bb75 --- /dev/null +++ b/interpreter/riscv64/interpreter-VStoreStrideArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-VStoreUnitStrideArgs.cc b/interpreter/riscv64/interpreter-VStoreUnitStrideArgs.cc new file mode 100644 index 00000000..d12c16e8 --- /dev/null +++ b/interpreter/riscv64/interpreter-VStoreUnitStrideArgs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#undef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +#include "interpreter.h" + +namespace berberis { + +template void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args); + +} // namespace berberis +#endif diff --git a/interpreter/riscv64/interpreter-main.cc b/interpreter/riscv64/interpreter-main.cc new file mode 100644 index 00000000..07834b5f --- /dev/null +++ b/interpreter/riscv64/interpreter-main.cc @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file excenaupt in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "berberis/interpreter/riscv64/interpreter.h" + +#include "berberis/decoder/riscv64/decoder.h" +#include "berberis/decoder/riscv64/semantics_player.h" +#include "berberis/guest_state/guest_addr.h" +#include "berberis/guest_state/guest_state.h" + +#include "faulty_memory_accesses.h" +#include "interpreter.h" + +namespace berberis { + +void InitInterpreter() { + AddFaultyMemoryAccessRecoveryCode(); +} + +void InterpretInsn(ThreadState* state) { + GuestAddr pc = state->cpu.insn_addr; + + Interpreter interpreter(state); + SemanticsPlayer sem_player(&interpreter); + Decoder decoder(&sem_player); + uint8_t insn_len = decoder.Decode(ToHostAddr<const uint16_t>(pc)); + interpreter.FinalizeInsn(insn_len); +} + +} // namespace berberis diff --git a/interpreter/riscv64/interpreter.cc b/interpreter/riscv64/interpreter.h index e39cbbb7..6680fc0e 100644 --- a/interpreter/riscv64/interpreter.cc +++ b/interpreter/riscv64/interpreter.h @@ -44,8 +44,6 @@ namespace berberis { -namespace { - inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) { if (aq) { if (rl) { @@ -489,7 +487,7 @@ class Interpreter { // Note: other tupes of loads and store are not special and would be processed as usual. // TODO(khim): Handle vstart properly. if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs>) { - if (args.opcode == Decoder::VLoadUnitStrideOpcode::kVlXreXX) { + if (args.opcode == Decoder::VLUmOpOpcode::kVlXreXX) { if (!IsPowerOf2(args.nf + 1)) { return Unimplemented(); } @@ -506,7 +504,7 @@ class Interpreter { } if constexpr (std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) { - if (args.opcode == Decoder::VStoreUnitStrideOpcode::kVsX) { + if (args.opcode == Decoder::VSUmOpOpcode::kVsX) { if (args.width != Decoder::MemoryDataOperandType::k8bit) { return Unimplemented(); } @@ -601,9 +599,10 @@ class Interpreter { template <typename ElementType, typename VOpArgs, typename... 
ExtraArgs> void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) { - int vemul = Decoder::SignExtend<3>(vtype & 0b111); + auto vemul = Decoder::SignExtend<3>(vtype & 0b111); vemul -= ((vtype >> 3) & 0b111); // Divide by SEW. - vemul += static_cast<int>(args.width); // Multiply by EEW. + vemul += + static_cast<std::underlying_type_t<decltype(args.width)>>(args.width); // Multiply by EEW. if (vemul < -3 || vemul > 3) [[unlikely]] { return Unimplemented(); } @@ -736,7 +735,7 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, VectorRegisterGroupMultiplier vlmul, auto vma, typename VOpArgs, @@ -773,7 +772,7 @@ class Interpreter { } } - template <int kSegmentSize, + template <size_t kSegmentSize, typename IndexElementType, size_t kIndexRegistersInvolved, TailProcessing vta, @@ -801,7 +800,7 @@ class Interpreter { } template <typename DataElementType, - int kSegmentSize, + size_t kSegmentSize, typename IndexElementType, size_t kIndexRegistersInvolved, TailProcessing vta, @@ -874,7 +873,7 @@ class Interpreter { template <typename DataElementType, VectorRegisterGroupMultiplier vlmul, typename IndexElementType, - int kSegmentSize, + size_t kSegmentSize, size_t kIndexRegistersInvolved, TailProcessing vta, auto vma> @@ -889,7 +888,7 @@ class Interpreter { } template <typename DataElementType, - int kSegmentSize, + size_t kSegmentSize, size_t kNumRegistersInGroup, typename IndexElementType, size_t kIndexRegistersInvolved, @@ -899,7 +898,7 @@ class Interpreter { if (!IsAligned<kIndexRegistersInvolved>(args.idx)) { return Unimplemented(); } - constexpr int kElementsCount = + constexpr size_t kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType)); alignas(alignof(SIMD128Register)) IndexElementType indexes[kElementsCount * kIndexRegistersInvolved]; @@ -909,7 +908,7 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, 
VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -919,7 +918,7 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, size_t kNumRegistersInGroup, TailProcessing vta, auto vma> @@ -929,7 +928,7 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -939,29 +938,29 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, size_t kNumRegistersInGroup, TailProcessing vta, auto vma> void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) { switch (args.opcode) { - case Decoder::VLoadUnitStrideOpcode::kVleXXff: + case Decoder::VLUmOpOpcode::kVleXXff: return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma, - Decoder::VLoadUnitStrideOpcode::kVleXXff>( + Decoder::VLUmOpOpcode::kVleXXff>( args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; }); - case Decoder::VLoadUnitStrideOpcode::kVleXX: + case Decoder::VLUmOpOpcode::kVleXX: return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma, - Decoder::VLoadUnitStrideOpcode::kVleXX>( + Decoder::VLUmOpOpcode::kVleXX>( args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; }); - case Decoder::VLoadUnitStrideOpcode::kVlm: + case Decoder::VLUmOpOpcode::kVlm: if constexpr (kSegmentSize == 1 && std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { return OpVectorLoad<UInt8, @@ -969,7 +968,7 @@ class Interpreter { 1, TailProcessing::kAgnostic, vma, - Decoder::VLoadUnitStrideOpcode::kVlm>( + Decoder::VLUmOpOpcode::kVlm>( args.dst, src, [](size_t index) { return index; }); } return Unimplemented(); @@ -1004,14 +1003,13 @@ class Interpreter { // v5: {B:20.21}{B:30.21} // Now we have loaded a column from memory and all three colors are put into a different register // groups for further 
processing. - template < - typename ElementType, - int kSegmentSize, - size_t kNumRegistersInGroup, - TailProcessing vta, - auto vma, - typename Decoder::VLoadUnitStrideOpcode opcode = typename Decoder::VLoadUnitStrideOpcode{}, - typename GetElementOffsetLambdaType> + template <typename ElementType, + size_t kSegmentSize, + size_t kNumRegistersInGroup, + TailProcessing vta, + auto vma, + typename Decoder::VLUmOpOpcode opcode = typename Decoder::VLUmOpOpcode{}, + typename GetElementOffsetLambdaType> void OpVectorLoad(uint8_t dst, Register src, GetElementOffsetLambdaType GetElementOffset) { using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>; if (!IsAligned<kNumRegistersInGroup>(dst)) { @@ -1020,10 +1018,10 @@ class Interpreter { if (dst + kNumRegistersInGroup * kSegmentSize >= 32) { return Unimplemented(); } - constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); - if constexpr (opcode == Decoder::VLoadUnitStrideOpcode::kVlm) { + if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) { vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT; } // In case of memory access fault we may set vstart to non-zero value, set it to zero here to @@ -1064,7 +1062,7 @@ class Interpreter { !(std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing> || static_cast<InactiveProcessing>(vma) != InactiveProcessing::kUndisturbed || register_mask == full_mask)) { - for (int field = 0; field < kSegmentSize; ++field) { + for (size_t field = 0; field < kSegmentSize; ++field) { result[field].Set(state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup]); } } @@ -1084,7 +1082,7 @@ class Interpreter { } } // Load segment from memory. 
- for (int field = 0; field < kSegmentSize; ++field) { + for (size_t field = 0; field < kSegmentSize; ++field) { FaultyLoadResult mem_access_result = FaultyLoad(ptr + field * sizeof(ElementType) + GetElementOffset(element_index), sizeof(ElementType)); @@ -1093,7 +1091,7 @@ class Interpreter { // access fault happens but let's trigger an exception and treat the remaining elements // using vta-specified strategy by simply just adjusting the vl. vl = element_index; - if constexpr (opcode == Decoder::VLoadUnitStrideOpcode::kVleXXff) { + if constexpr (opcode == Decoder::VLUmOpOpcode::kVleXXff) { // Fail-first load only triggers exceptions for the first element, otherwise it // changes vl to ensure that other operations would only process elements that are // successfully loaded. @@ -1127,7 +1125,7 @@ class Interpreter { if (register_mask != full_mask) { auto [simd_mask] = intrinsics::BitMaskToSimdMaskForTests<ElementType>(Int64{MaskType{register_mask}}); - for (int field = 0; field < kSegmentSize; ++field) { + for (size_t field = 0; field < kSegmentSize; ++field) { if constexpr (vma == InactiveProcessing::kAgnostic) { // vstart equal to zero is supposed to be exceptional. From RISV-V V manual (page 14): // The vstart CSR is writable by unprivileged code, but non-zero vstart values may @@ -1162,14 +1160,14 @@ class Interpreter { } // If we have tail elements and TailProcessing::kAgnostic mode then set them to ~0. if constexpr (vta == TailProcessing::kAgnostic) { - for (int field = 0; field < kSegmentSize; ++field) { + for (size_t field = 0; field < kSegmentSize; ++field) { if (vl < (within_group_id + 1) * kElementsCount) { result[field] |= GetTailMask(); } } } // Put values back into register file. 
- for (int field = 0; field < kSegmentSize; ++field) { + for (size_t field = 0; field < kSegmentSize; ++field) { state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup] = result[field].template Get<__uint128_t>(); } @@ -1205,6 +1203,12 @@ class Interpreter { InactiveProcessing::kUndisturbed>( args.dst, arg2, /*dst_mask=*/args.src1); } + case Decoder::VOpFVfOpcode::kVfmaxvf: + return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>( + args.dst, args.src1, arg2); + case Decoder::VOpFVfOpcode::kVfminvf: + return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>( + args.dst, args.src1, arg2); default: return Unimplemented(); } @@ -1212,15 +1216,163 @@ class Interpreter { template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> void OpVector(const Decoder::VOpFVvArgs& args) { - // We currently don't support Float32 operations, but conversion routines that deal with + using SignedType = std::make_signed_t<typename TypeTraits<ElementType>::Int>; + using UnsignedType = std::make_unsigned_t<typename TypeTraits<ElementType>::Int>; + // We currently don't support Float16 operations, but conversion routines that deal with // double-width floats use these encodings to produce regular Float32 types. - // That's why we need to call these routines twice: one here and one in the large switch below. 
- if constexpr (sizeof(ElementType) < sizeof(Float32)) { + if constexpr (sizeof(ElementType) <= sizeof(Float32)) { + using WideElementType = typename TypeTraits<ElementType>::Wide; switch (args.opcode) { + case Decoder::VOpFVvOpcode::kVFUnary0: + switch (args.vfunary0_opcode) { + case Decoder::VFUnary0Opcode::kVfwcvtfxuv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideElementType, UnsignedType>(FPFlags::DYN, frm, src); + }, + WideElementType, + UnsignedType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfwcvtfxv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideElementType, SignedType>(FPFlags::DYN, frm, src); + }, + WideElementType, + SignedType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfncvtxufw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src); + }, + UnsignedType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfncvtxfw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src); + }, + SignedType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfncvtrtzxufw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src); + }, + UnsignedType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfncvtrtzxfw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src); + }, + SignedType, + vlmul, + vta, + vma>(args.dst, args.src1); + default: + break; // Make compiler happy. + } + break; default: - return Unimplemented(); + break; // Make compiler happy. 
} - } else { + } + // Widening and narrowing opeation which take floating point “narrow” operand may only work + // correctly with Float32 input: Float16 is not supported yet, while Float64 input would produce + // 128bit output which is currently reserver in RISC-V V. + if constexpr (sizeof(ElementType) == sizeof(Float32)) { + using WideElementType = typename TypeTraits<ElementType>::Wide; + using WideSignedType = typename TypeTraits<SignedType>::Wide; + using WideUnsignedType = typename TypeTraits<UnsignedType>::Wide; + switch (args.opcode) { + case Decoder::VOpFVvOpcode::kVFUnary0: + switch (args.vfunary0_opcode) { + case Decoder::VFUnary0Opcode::kVfwcvtxufv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::DYN, frm, src); + }, + WideUnsignedType, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfwcvtxfv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::DYN, frm, src); + }, + WideSignedType, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfwcvtffv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideElementType, ElementType>(FPFlags::DYN, frm, src); + }, + WideElementType, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfwcvtrtzxufv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::RTZ, frm, src); + }, + WideUnsignedType, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfwcvtrtzxfv: + return OpVectorWidenvr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::RTZ, frm, src); + }, + WideSignedType, + ElementType, + vlmul, + vta, + vma>(args.dst, 
args.src1); + case Decoder::VFUnary0Opcode::kVfncvtfxuw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src); + }, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfncvtffw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src); + }, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfncvtfxw: + return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src); + }, + ElementType, + vlmul, + vta, + vma>(args.dst, args.src1); + default: + break; // Make compiler happy. + } + break; + default: + break; // Make compiler happy. + } + } + // If our ElementType is Float16 then “straight” operations are unsupported and we whouldn't try + // instantiate any functions since this would lead to compilke-time error. 
+ if constexpr (sizeof(ElementType) >= sizeof(Float32)) { switch (args.opcode) { case Decoder::VOpFVvOpcode::kVfmvfs: if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { @@ -1230,10 +1382,71 @@ class Interpreter { return Unimplemented(); } return OpVectorVmvfs<ElementType>(args.dst, args.src1); + case Decoder::VOpFVvOpcode::kVFUnary0: + switch (args.vfunary0_opcode) { + case Decoder::VFUnary0Opcode::kVfcvtxufv: + return OpVectorv<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::DYN, frm, src); + }, + ElementType, + vlmul, + vta, + vma, + CsrName::kFrm>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfcvtxfv: + return OpVectorv<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::DYN, frm, src); + }, + ElementType, + vlmul, + vta, + vma, + CsrName::kFrm>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfcvtfxuv: + return OpVectorv<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<ElementType, UnsignedType>(FPFlags::DYN, frm, src); + }, + UnsignedType, + vlmul, + vta, + vma, + CsrName::kFrm>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfcvtfxv: + return OpVectorv<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<ElementType, SignedType>(FPFlags::DYN, frm, src); + }, + SignedType, + vlmul, + vta, + vma, + CsrName::kFrm>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfcvtrtzxufv: + return OpVectorv<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::RTZ, frm, src); + }, + ElementType, + vlmul, + vta, + vma, + CsrName::kFrm>(args.dst, args.src1); + case Decoder::VFUnary0Opcode::kVfcvtrtzxfv: + return OpVectorv<[](int8_t frm, SIMD128Register src) { + return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::RTZ, frm, src); + }, + ElementType, + vlmul, + vta, + vma, + CsrName::kFrm>(args.dst, args.src1); + default: + break; // 
Make compiler happy. + } + break; default: - return Unimplemented(); + break; // Make compiler happy. } } + return Unimplemented(); } template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -1299,9 +1512,21 @@ class Interpreter { InactiveProcessing::kUndisturbed>( args.dst, BitCastToUnsigned(SignedType{args.imm}), /*dst_mask=*/args.src); } - case Decoder::VOpIViOpcode::kVmvvi: + case Decoder::VOpIViOpcode::kVmvXrv: + // kVmv<nr>rv instruction if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { - return OpVectorVmvXr<ElementType>(args.dst, args.src, static_cast<uint8_t>(args.imm)); + switch (args.imm) { + case 0: + return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src); + case 1: + return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src); + case 3: + return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src); + case 7: + return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src); + default: + return Unimplemented(); + } } else { return Unimplemented(); } @@ -1466,89 +1691,101 @@ class Interpreter { case Decoder::VOpMVvOpcode::kVredmaxvs: return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>( args.dst, args.src1, args.src2); - case Decoder::VOpMVvOpcode::kVXmXXs: - switch (args.vXmXXs_opcode) { - case Decoder::VXmXXsOpcode::kVmvxs: + case Decoder::VOpMVvOpcode::kVWXUnary0: + switch (args.vwxunary0_opcode) { + case Decoder::VWXUnary0Opcode::kVmvxs: if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { return Unimplemented(); } return OpVectorVmvxs<SignedType>(args.dst, args.src1); - case Decoder::VXmXXsOpcode::kVcpopm: - return OpVectorVXmXXs<intrinsics::Vcpopm<Int128>, vma>(args.dst, args.src1); - case Decoder::VXmXXsOpcode::kVfirstm: - return OpVectorVXmXXs<intrinsics::Vfirstm<Int128>, vma>(args.dst, args.src1); + case Decoder::VWXUnary0Opcode::kVcpopm: + return OpVectorVWXUnary0<intrinsics::Vcpopm<Int128>, vma>(args.dst, args.src1); + 
case Decoder::VWXUnary0Opcode::kVfirstm: + return OpVectorVWXUnary0<intrinsics::Vfirstm<Int128>, vma>(args.dst, args.src1); default: - return Unimplemented(); + return Unimplemented(); } - case Decoder::VOpMVvOpcode::kVxunary0: + case Decoder::VOpMVvOpcode::kVFUnary0: switch (args.vxunary0_opcode) { - case Decoder::Vxunary0Opcode::kVzextvf2m: - if constexpr (sizeof(UnsignedType) >= 2) { - return OpVectorExtend<intrinsics::Vextf2<UnsignedType>, - UnsignedType, - 2, - vlmul, - vta, - vma>(args.dst, args.src1); - } - break; - case Decoder::Vxunary0Opcode::kVsextvf2m: - if constexpr (sizeof(SignedType) >= 2) { - return OpVectorExtend<intrinsics::Vextf2<SignedType>, SignedType, 2, vlmul, vta, vma>( - args.dst, args.src1); - } - break; - case Decoder::Vxunary0Opcode::kVzextvf4m: - if constexpr (sizeof(UnsignedType) >= 4) { - return OpVectorExtend<intrinsics::Vextf4<UnsignedType>, - UnsignedType, - 4, - vlmul, - vta, - vma>(args.dst, args.src1); - } - break; - case Decoder::Vxunary0Opcode::kVsextvf4m: - if constexpr (sizeof(SignedType) >= 4) { - return OpVectorExtend<intrinsics::Vextf4<SignedType>, SignedType, 4, vlmul, vta, vma>( - args.dst, args.src1); - } - break; - case Decoder::Vxunary0Opcode::kVzextvf8m: - if constexpr (sizeof(UnsignedType) >= 8) { - return OpVectorExtend<intrinsics::Vextf8<UnsignedType>, - UnsignedType, - 8, - vlmul, - vta, - vma>(args.dst, args.src1); - } - break; - case Decoder::Vxunary0Opcode::kVsextvf8m: - if constexpr (sizeof(SignedType) >= 8) { - return OpVectorExtend<intrinsics::Vextf8<SignedType>, SignedType, 8, vlmul, vta, vma>( - args.dst, args.src1); - } - break; + case Decoder::VXUnary0Opcode::kVzextvf2m: + if constexpr (sizeof(UnsignedType) >= 2) { + return OpVectorVXUnary0<intrinsics::Vextf2<UnsignedType>, + UnsignedType, + 2, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::VXUnary0Opcode::kVsextvf2m: + if constexpr (sizeof(SignedType) >= 2) { + return OpVectorVXUnary0<intrinsics::Vextf2<SignedType>, + 
SignedType, + 2, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::VXUnary0Opcode::kVzextvf4m: + if constexpr (sizeof(UnsignedType) >= 4) { + return OpVectorVXUnary0<intrinsics::Vextf4<UnsignedType>, + UnsignedType, + 4, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::VXUnary0Opcode::kVsextvf4m: + if constexpr (sizeof(SignedType) >= 4) { + return OpVectorVXUnary0<intrinsics::Vextf4<SignedType>, + SignedType, + 4, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::VXUnary0Opcode::kVzextvf8m: + if constexpr (sizeof(UnsignedType) >= 8) { + return OpVectorVXUnary0<intrinsics::Vextf8<UnsignedType>, + UnsignedType, + 8, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::VXUnary0Opcode::kVsextvf8m: + if constexpr (sizeof(SignedType) >= 8) { + return OpVectorVXUnary0<intrinsics::Vextf8<SignedType>, + SignedType, + 8, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; default: - return Unimplemented(); + return Unimplemented(); } return Unimplemented(); - case Decoder::VOpMVvOpcode::kVmsXf: - switch (args.vmsXf_opcode) { - case Decoder::VmsXfOpcode::kVmsbfm: - return OpVectorVmsXf<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1); - case Decoder::VmsXfOpcode::kVmsofm: - return OpVectorVmsXf<intrinsics::Vmsofm<>, vma>(args.dst, args.src1); - case Decoder::VmsXfOpcode::kVmsifm: - return OpVectorVmsXf<intrinsics::Vmsifm<>, vma>(args.dst, args.src1); - case Decoder::VmsXfOpcode::kVidv: - if (args.src1) { - return Unimplemented(); - } - return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst); - default: + case Decoder::VOpMVvOpcode::kVMUnary0: + switch (args.vmunary0_opcode) { + case Decoder::VMUnary0Opcode::kVmsbfm: + return OpVectorVMUnary0<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1); + case Decoder::VMUnary0Opcode::kVmsofm: + return OpVectorVMUnary0<intrinsics::Vmsofm<>, vma>(args.dst, args.src1); + case Decoder::VMUnary0Opcode::kVmsifm: + return 
OpVectorVMUnary0<intrinsics::Vmsifm<>, vma>(args.dst, args.src1); + case Decoder::VMUnary0Opcode::kVidv: + if (args.src1) { return Unimplemented(); + } + return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst); + default: + return Unimplemented(); } case Decoder::VOpMVvOpcode::kVmaddvv: return OpVectorvvv<intrinsics::Vmaddvv<ElementType>, ElementType, vlmul, vta, vma>( @@ -1709,15 +1946,15 @@ class Interpreter { using SignedType = berberis::SignedType<ElementType>; using UnsignedType = berberis::UnsignedType<ElementType>; switch (args.opcode) { - case Decoder::VOpMVxOpcode::kVXmXXx: - switch (args.vXmXXx_opcode) { - case Decoder::VXmXXxOpcode::kVmvsx: - if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { - return Unimplemented(); - } - return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2)); - default: + case Decoder::VOpMVxOpcode::kVRXUnary0: + switch (args.vrxunary0_opcode) { + case Decoder::VRXUnary0Opcode::kVmvsx: + if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { return Unimplemented(); + } + return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2)); + default: + return Unimplemented(); } case Decoder::VOpMVxOpcode::kVmaddvx: return OpVectorvxv<intrinsics::Vmaddvx<ElementType>, ElementType, vlmul, vta, vma>( @@ -1751,7 +1988,7 @@ class Interpreter { template <typename DataElementType, VectorRegisterGroupMultiplier vlmul, typename IndexElementType, - int kSegmentSize, + size_t kSegmentSize, size_t kIndexRegistersInvolved, TailProcessing vta, auto vma> @@ -1765,7 +2002,7 @@ class Interpreter { } template <typename DataElementType, - int kSegmentSize, + size_t kSegmentSize, size_t kNumRegistersInGroup, typename IndexElementType, size_t kIndexRegistersInvolved, @@ -1774,7 +2011,7 @@ class Interpreter { if (!IsAligned<kIndexRegistersInvolved>(args.idx)) { return Unimplemented(); } - constexpr int kElementsCount = + constexpr size_t kElementsCount = 
static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType)); alignas(alignof(SIMD128Register)) IndexElementType indexes[kElementsCount * kIndexRegistersInvolved]; @@ -1784,7 +2021,7 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -1797,29 +2034,28 @@ class Interpreter { } template <typename ElementType, - int kSegmentSize, + size_t kSegmentSize, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> void OpVector(const Decoder::VStoreUnitStrideArgs& args, Register src) { switch (args.opcode) { - case Decoder::VStoreUnitStrideOpcode::kVseXX: + case Decoder::VSUmOpOpcode::kVseXX: return OpVectorStore<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>, - Decoder::VStoreUnitStrideOpcode::kVseXX>( - args.data, src, [](size_t index) { - return kSegmentSize * sizeof(ElementType) * index; - }); - case Decoder::VStoreUnitStrideOpcode::kVsm: + Decoder::VSUmOpOpcode::kVseXX>(args.data, src, [](size_t index) { + return kSegmentSize * sizeof(ElementType) * index; + }); + case Decoder::VSUmOpOpcode::kVsm: if constexpr (kSegmentSize == 1 && std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { return OpVectorStore<UInt8, 1, 1, /*kUseMasking=*/false, - Decoder::VStoreUnitStrideOpcode::kVsm>( + Decoder::VSUmOpOpcode::kVsm>( args.data, src, [](size_t index) { return index; }); } return Unimplemented(); @@ -1830,13 +2066,12 @@ class Interpreter { // Look for VLoadStrideArgs for explanation about semantics: VStoreStrideArgs is almost symmetric, // except it ignores vta and vma modes and never alters inactive elements in memory. 
- template < - typename ElementType, - int kSegmentSize, - size_t kNumRegistersInGroup, - bool kUseMasking, - typename Decoder::VStoreUnitStrideOpcode opcode = typename Decoder::VStoreUnitStrideOpcode{}, - typename GetElementOffsetLambdaType> + template <typename ElementType, + size_t kSegmentSize, + size_t kNumRegistersInGroup, + bool kUseMasking, + typename Decoder::VSUmOpOpcode opcode = typename Decoder::VSUmOpOpcode{}, + typename GetElementOffsetLambdaType> void OpVectorStore(uint8_t data, Register src, GetElementOffsetLambdaType GetElementOffset) { using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>; if (!IsAligned<kNumRegistersInGroup>(data)) { @@ -1845,10 +2080,10 @@ class Interpreter { if (data + kNumRegistersInGroup * kSegmentSize > 32) { return Unimplemented(); } - constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); - if constexpr (opcode == Decoder::VStoreUnitStrideOpcode::kVsm) { + if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) { vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT; } // In case of memory access fault we may set vstart to non-zero value, set it to zero here to @@ -1892,7 +2127,7 @@ class Interpreter { } } // Store segment to memory. - for (int field = 0; field < kSegmentSize; ++field) { + for (size_t field = 0; field < kSegmentSize; ++field) { bool exception_raised = FaultyStore( ptr + field * sizeof(ElementType) + GetElementOffset(element_index), sizeof(ElementType), @@ -1923,10 +2158,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. 
if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -1936,7 +2171,6 @@ class Interpreter { result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <typename ElementType> @@ -1981,7 +2215,7 @@ class Interpreter { } template <auto Intrinsic, auto vma> - void OpVectorVXmXXs(uint8_t dst, uint8_t src1) { + void OpVectorVWXUnary0(uint8_t dst, uint8_t src1) { size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); if (vstart != 0) [[unlikely]] { @@ -2004,24 +2238,19 @@ class Interpreter { void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) { size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); - SIMD128Register arg1(state_->cpu.v[src1]); - SIMD128Register arg2(state_->cpu.v[src2]); - SIMD128Register result; + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. 
if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } + SIMD128Register arg1(state_->cpu.v[src1]); + SIMD128Register arg2(state_->cpu.v[src2]); + SIMD128Register result; if (vstart > 0) [[unlikely]] { - if (vstart >= vl) [[unlikely]] { - result.Set(state_->cpu.v[dst]); - } else { - const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); - result.Set(state_->cpu.v[dst]); - result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask); - } - SetCsr<CsrName::kVstart>(0); + const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); + result.Set(state_->cpu.v[dst]); + result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask); } else { result = Intrinsic(arg1, arg2); } @@ -2031,7 +2260,7 @@ class Interpreter { } template <auto Intrinsic, auto vma> - void OpVectorVmsXf(uint8_t dst, uint8_t src1) { + void OpVectorVMUnary0(uint8_t dst, uint8_t src1) { size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); if (vstart != 0) { @@ -2063,40 +2292,36 @@ class Interpreter { state_->cpu.v[dst] = result.Get<__uint128_t>(); } - template <typename ElementType> - void OpVectorVmvXr(uint8_t dst, uint8_t src, uint8_t nf) { - if (!IsPowerOf2(nf + 1)) { - return Unimplemented(); - } - if (((dst | src) & nf) != 0) { + template <typename ElementType, size_t kRegistersInvolved> + void OpVectorVmvXrv(uint8_t dst, uint8_t src) { + if (!IsAligned<kRegistersInvolved>(dst | src)) { return Unimplemented(); } + constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType)); size_t vstart = GetCsr<CsrName::kVstart>(); + SetCsr<CsrName::kVstart>(0); + // The usual property that no elements are written if vstart >= vl does not apply to these + // instructions. Instead, no elements are written if vstart >= evl. 
+ if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] { + return; + } if (vstart == 0) [[likely]] { - for (int index = 0; index <= nf; ++index) { + for (size_t index = 0; index < kRegistersInvolved; ++index) { state_->cpu.v[dst + index] = state_->cpu.v[src + index]; } return; } - constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType)); - for (int index = 0; index <= nf; ++index) { - if (vstart >= kElementsCount) { - vstart -= kElementsCount; - continue; - } - if (vstart == 0) [[likely]] { - state_->cpu.v[dst + index] = state_->cpu.v[src + index]; - } else { - SIMD128Register destination{state_->cpu.v[dst + index]}; - SIMD128Register source{state_->cpu.v[src + index]}; - for (int element_index = vstart; element_index < kElementsCount; ++element_index) { - destination.Set(source.Get<ElementType>(element_index), element_index); - } - state_->cpu.v[dst + index] = destination.Get<__uint128_t>(); - vstart = 0; - } + size_t index = vstart / kElementsCount; + SIMD128Register destination{state_->cpu.v[dst + index]}; + SIMD128Register source{state_->cpu.v[src + index]}; + for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount; + ++element_index) { + destination.Set(source.Get<ElementType>(element_index), element_index); + } + state_->cpu.v[dst + index] = destination.Get<__uint128_t>(); + for (index++; index < kRegistersInvolved; ++index) { + state_->cpu.v[dst + index] = state_->cpu.v[src + index]; } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, typename ElementType, VectorRegisterGroupMultiplier vlmul, auto vma> @@ -2113,12 +2338,12 @@ class Interpreter { SIMD128Register original_result(state_->cpu.v[dst]); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); SIMD128Register result_before_vl_masking; // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail 
elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { result_before_vl_masking = original_result; - SetCsr<CsrName::kVstart>(0); } else { result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>([this, src1, src2](auto index) { @@ -2138,7 +2363,6 @@ class Interpreter { const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); result_before_vl_masking = (original_result & ~start_mask) | (result_before_vl_masking & start_mask); - SetCsr<CsrName::kVstart>(0); } } const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl); @@ -2159,12 +2383,12 @@ class Interpreter { SIMD128Register original_result(state_->cpu.v[dst]); size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); SIMD128Register result_before_vl_masking; // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { result_before_vl_masking = original_result; - SetCsr<CsrName::kVstart>(0); } else { result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>([this, src1, arg2](auto index) { @@ -2183,7 +2407,6 @@ class Interpreter { const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart); result_before_vl_masking = (original_result & ~start_mask) | (result_before_vl_masking & start_mask); - SetCsr<CsrName::kVstart>(0); } } const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl); @@ -2194,21 +2417,16 @@ class Interpreter { typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, - auto vma> - void OpVectorvs(uint8_t dst, uint8_t src1, uint8_t src2) { - return OpVectorvs<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>( - dst, src1, src2); - } - - template <auto Intrinsic, - typename ElementType, - VectorRegisterGroupMultiplier vlmul, - TailProcessing vta, auto vma, + CsrName... 
kExtraCsrs, typename... DstMaskType> void OpVectorv(uint8_t dst, uint8_t src1, DstMaskType... dst_mask) { - return OpVectorv<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>( - dst, src1, dst_mask...); + return OpVectorv<Intrinsic, + ElementType, + NumberOfRegistersInvolved(vlmul), + vta, + vma, + kExtraCsrs...>(dst, src1, dst_mask...); } template <auto Intrinsic, @@ -2216,6 +2434,7 @@ class Interpreter { size_t kRegistersInvolved, TailProcessing vta, auto vma, + CsrName... kExtraCsrs, typename... DstMaskType> void OpVectorv(uint8_t dst, uint8_t src, DstMaskType... dst_mask) { static_assert(sizeof...(dst_mask) <= 1); @@ -2224,6 +2443,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result{state_->cpu.v[dst + index]}; @@ -2235,11 +2460,26 @@ class Interpreter { result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]); } SIMD128Register arg{state_->cpu.v[src + index]}; - result = VectorMasking<ElementType, vta, vma>( - result, std::get<0>(Intrinsic(arg)), result_mask, vstart, vl, index, mask); + result = + VectorMasking<ElementType, vta, vma>(result, + std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg)), + result_mask, + vstart, + vl, + index, + mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); + } + + template <auto Intrinsic, + typename ElementType, + VectorRegisterGroupMultiplier vlmul, + TailProcessing vta, + auto vma> + void OpVectorvs(uint8_t dst, uint8_t src1, uint8_t src2) { + return OpVectorvs<Intrinsic, ElementType, 
NumberOfRegistersInvolved(vlmul), vta, vma>( + dst, src1, src2); } template <auto Intrinsic, @@ -2256,6 +2496,7 @@ class Interpreter { if (vstart != 0) { return Unimplemented(); } + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vl == 0) [[unlikely]] { @@ -2274,7 +2515,7 @@ class Interpreter { element_index += MaskType{1}) { if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) { if ((MaskType{mask_bits} & (MaskType{1} << element_index)) == MaskType{0}) { - continue; + continue; } } result = std::get<0>(Intrinsic(arg1, arg2.Get<ElementType>(element_index))); @@ -2285,7 +2526,6 @@ class Interpreter { result.Set(arg1, 0); result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1)); state_->cpu.v[dst] = result.Get<__uint128_t>(); - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2309,6 +2549,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. 
+ if (vstart >= vl) [[unlikely]] { + return; + } auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result{state_->cpu.v[dst + index]}; @@ -2318,7 +2564,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2342,10 +2587,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2357,7 +2602,59 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2, result)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } + } + + template <auto Intrinsic, + typename TargetElementType, + typename SourceElementType, + VectorRegisterGroupMultiplier vlmul, + TailProcessing vta, + auto vma> + void OpVectorWidenvr(uint8_t dst, uint8_t src) { + return OpVectorWidenvr<Intrinsic, + TargetElementType, + SourceElementType, + NumRegistersInvolvedForWideOperand(vlmul), + NumberOfRegistersInvolved(vlmul), + vta, + vma>(dst, src); + } + + template <auto Intrinsic, + typename TargetElementType, + typename SourceElementType, + size_t kDestRegistersInvolved, + size_t kRegistersInvolved, + TailProcessing vta, + auto vma> + void OpVectorWidenvr(uint8_t dst, uint8_t src) { + if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kRegistersInvolved>(src)) { + return Unimplemented(); + } + size_t vstart = GetCsr<CsrName::kVstart>(); + size_t vl = GetCsr<CsrName::kVl>(); SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no 
body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } + int8_t frm = GetCsr<CsrName::kFrm>(); + auto mask = GetMaskForVectorOperations<vma>(); + for (size_t index = 0; index < kRegistersInvolved; ++index) { + SIMD128Register result(state_->cpu.v[dst + 2 * index]); + SIMD128Register arg(state_->cpu.v[src + index]); + result = VectorMasking<TargetElementType, vta, vma>( + result, std::get<0>(Intrinsic(frm, arg)), vstart, vl, 2 * index, mask); + state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>(); + if constexpr (kDestRegistersInvolved > 1) { // if lmul is one full register or more + result.Set(state_->cpu.v[dst + 2 * index + 1]); + std::tie(arg) = intrinsics::VMovTopHalfToBottom<SourceElementType>(arg); + result = VectorMasking<TargetElementType, vta, vma>( + result, std::get<0>(Intrinsic(frm, arg)), vstart, vl, 2 * index + 1, mask); + state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>(); + } + } } // 2*SEW = SEW op SEW @@ -2378,7 +2675,7 @@ class Interpreter { template <auto Intrinsic, typename ElementType, - int kDestRegistersInvolved, + size_t kDestRegistersInvolved, size_t kRegistersInvolved, TailProcessing vta, auto vma> @@ -2388,10 +2685,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. 
if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2411,7 +2708,6 @@ class Interpreter { state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>(); } } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2436,10 +2732,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2450,7 +2746,60 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } + } + + template <auto Intrinsic, + typename TargetElementType, + VectorRegisterGroupMultiplier vlmul, + TailProcessing vta, + auto vma> + void OpVectorNarrowwr(uint8_t dst, uint8_t src) { + return OpVectorNarrowwr<Intrinsic, + TargetElementType, + NumberOfRegistersInvolved(vlmul), + NumRegistersInvolvedForWideOperand(vlmul), + vta, + vma>(dst, src); + } + + template <auto Intrinsic, + typename TargetElementType, + size_t kDestRegistersInvolved, + size_t kSrcRegistersInvolved, + TailProcessing vta, + auto vma> + void OpVectorNarrowwr(uint8_t dst, uint8_t src) { + if constexpr (kDestRegistersInvolved == kSrcRegistersInvolved) { + if (!IsAligned<kDestRegistersInvolved>(dst | src)) { + return Unimplemented(); + } + } else if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSrcRegistersInvolved>(src)) { + return Unimplemented(); + } + size_t vstart = GetCsr<CsrName::kVstart>(); + size_t vl = GetCsr<CsrName::kVl>(); SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector 
register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } + int8_t frm = GetCsr<CsrName::kFrm>(); + auto mask = GetMaskForVectorOperations<vma>(); + for (size_t index = 0; index < kDestRegistersInvolved; index++) { + SIMD128Register orig_result(state_->cpu.v[dst + index]); + SIMD128Register arg_low(state_->cpu.v[src + 2 * index]); + SIMD128Register intrinsic_result = std::get<0>(Intrinsic(frm, arg_low)); + if constexpr (kSrcRegistersInvolved > 1) { + SIMD128Register arg_high(state_->cpu.v[src + 2 * index + 1]); + SIMD128Register result_high = std::get<0>(Intrinsic(frm, arg_high)); + intrinsic_result = std::get<0>( + intrinsics::VMergeBottomHalfToTop<TargetElementType>(intrinsic_result, result_high)); + } + auto result = VectorMasking<TargetElementType, vta, vma>( + orig_result, intrinsic_result, vstart, vl, index, mask); + state_->cpu.v[dst + index] = result.template Get<__uint128_t>(); + } } // SEW = 2*SEW op SEW @@ -2470,8 +2819,8 @@ class Interpreter { template <auto Intrinsic, typename ElementType, - int kDestRegistersInvolved, - int kSrcRegistersInvolved, + size_t kDestRegistersInvolved, + size_t kSrcRegistersInvolved, TailProcessing vta, auto vma> void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) { @@ -2484,14 +2833,14 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. 
if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); - for (int index = 0; index < kDestRegistersInvolved; index++) { + for (size_t index = 0; index < kDestRegistersInvolved; index++) { SIMD128Register orig_result(state_->cpu.v[dst + index]); SIMD128Register arg1_low(state_->cpu.v[src1 + 2 * index]); SIMD128Register intrinsic_result = std::get<0>(Intrinsic(arg1_low, arg2)); @@ -2507,7 +2856,6 @@ class Interpreter { orig_result, intrinsic_result, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.template Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } // SEW = 2*SEW op SEW @@ -2528,7 +2876,7 @@ class Interpreter { template <auto Intrinsic, typename ElementType, size_t kRegistersInvolved, - int kFirstSrcRegistersInvolved, + size_t kFirstSrcRegistersInvolved, TailProcessing vta, auto vma> void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) { @@ -2542,10 +2890,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. 
if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2567,7 +2915,6 @@ class Interpreter { orig_result, intrinsic_result, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.template Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2576,7 +2923,7 @@ class Interpreter { VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> - void OpVectorExtend(uint8_t dst, uint8_t src) { + void OpVectorVXUnary0(uint8_t dst, uint8_t src) { static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8); constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul); constexpr size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1; @@ -2626,6 +2973,12 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); + // When vstart >= vl, there are no body elements, and no elements are updated in any destination + // vector register group, including that no tail elements are updated with agnostic values. + if (vstart >= vl) [[unlikely]] { + return; + } auto mask = GetMaskForVectorOperations<vma>(); for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result(state_->cpu.v[dst + index]); @@ -2634,7 +2987,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg1, arg2, result)), vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <auto Intrinsic, @@ -2661,10 +3013,10 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); // When vstart >= vl, there are no body elements, and no elements are updated in any destination // vector register group, including that no tail elements are updated with agnostic values. 
if (vstart >= vl) [[unlikely]] { - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2681,7 +3033,6 @@ class Interpreter { result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -2702,11 +3053,11 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); if (vstart >= vl) [[unlikely]] { // From 16.3: For all of the [slide instructions], if vstart >= vl, the // instruction performs no operation and leaves the destination vector // register unchanged. - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); @@ -2743,7 +3094,6 @@ class Interpreter { mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - SetCsr<CsrName::kVstart>(0); } template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma> @@ -2764,15 +3114,14 @@ class Interpreter { } size_t vstart = GetCsr<CsrName::kVstart>(); size_t vl = GetCsr<CsrName::kVl>(); + SetCsr<CsrName::kVstart>(0); if (vstart >= vl) [[unlikely]] { // From 16.3: For all of the [slide instructions], if vstart >= vl, the // instruction performs no operation and leaves the destination vector // register unchanged. - SetCsr<CsrName::kVstart>(0); return; } auto mask = GetMaskForVectorOperations<vma>(); - for (size_t index = 0; index < kRegistersInvolved; ++index) { SIMD128Register result(state_->cpu.v[dst + index]); @@ -2794,8 +3143,6 @@ class Interpreter { mask); state_->cpu.v[dst + index] = result.Get<__uint128_t>(); } - - SetCsr<CsrName::kVstart>(0); } // Helper function needed to generate bitmak result from non-bitmask inputs. 
@@ -3014,32 +3361,32 @@ class Interpreter { }; template <> -[[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kFCsr>() const { +[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFCsr>() const { return FeGetExceptions() | (state_->cpu.frm << 5); } template <> -[[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kFFlags>() const { +[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFFlags>() const { return FeGetExceptions(); } template <> -[[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kVlenb>() const { +[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVlenb>() const { return 16; } template <> -[[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kVxrm>() const { +[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxrm>() const { return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11; } template <> -[[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kVxsat>() const { +[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxsat>() const { return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2; } template <> -void Interpreter::SetCsr<CsrName::kFCsr>(Register arg) { +void inline Interpreter::SetCsr<CsrName::kFCsr>(Register arg) { CHECK(!exception_raised_); FeSetExceptions(arg & 0b1'1111); arg = (arg >> 5) & kCsrMask<CsrName::kFrm>; @@ -3048,13 +3395,13 @@ void Interpreter::SetCsr<CsrName::kFCsr>(Register arg) { } template <> -void Interpreter::SetCsr<CsrName::kFFlags>(Register arg) { +void inline Interpreter::SetCsr<CsrName::kFFlags>(Register arg) { CHECK(!exception_raised_); FeSetExceptions(arg & 0b1'1111); } template <> -void Interpreter::SetCsr<CsrName::kFrm>(Register arg) { +void inline Interpreter::SetCsr<CsrName::kFrm>(Register arg) { CHECK(!exception_raised_); arg &= kCsrMask<CsrName::kFrm>; state_->cpu.frm = arg; @@ -3062,34 +3409,36 @@ void 
Interpreter::SetCsr<CsrName::kFrm>(Register arg) { } template <> -void Interpreter::SetCsr<CsrName::kVxrm>(Register arg) { +void inline Interpreter::SetCsr<CsrName::kVxrm>(Register arg) { CHECK(!exception_raised_); state_->cpu.*CsrFieldAddr<CsrName::kVcsr> = (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11); } template <> -void Interpreter::SetCsr<CsrName::kVxsat>(Register arg) { +void inline Interpreter::SetCsr<CsrName::kVxsat>(Register arg) { CHECK(!exception_raised_); state_->cpu.*CsrFieldAddr<CsrName::kVcsr> = (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2); } template <> -Interpreter::FpRegister Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(uint8_t reg) { +[[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>( + uint8_t reg) { CheckFpRegIsValid(reg); FpRegister value = state_->cpu.f[reg]; return UnboxNan<Float32>(value); } template <> -Interpreter::FpRegister Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(uint8_t reg) { +[[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>( + uint8_t reg) { CheckFpRegIsValid(reg); return state_->cpu.f[reg]; } template <> -void Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) { +void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) { if (exception_raised_) { // Do not produce side effects. return; @@ -3099,7 +3448,7 @@ void Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegiste } template <> -void Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) { +void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) { if (exception_raised_) { // Do not produce side effects. 
return; @@ -3108,20 +3457,33 @@ void Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegiste state_->cpu.f[reg] = value; } -} // namespace - -void InitInterpreter() { - AddFaultyMemoryAccessRecoveryCode(); -} - -void InterpretInsn(ThreadState* state) { - GuestAddr pc = state->cpu.insn_addr; - - Interpreter interpreter(state); - SemanticsPlayer sem_player(&interpreter); - Decoder decoder(&sem_player); - uint8_t insn_len = decoder.Decode(ToHostAddr<const uint16_t>(pc)); - interpreter.FinalizeInsn(insn_len); -} +#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args); +template <> +extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args); +#endif } // namespace berberis diff --git 
a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc index aa67cfbc..ce991806 100644 --- a/interpreter/riscv64/interpreter_test.cc +++ b/interpreter/riscv64/interpreter_test.cc @@ -805,7 +805,7 @@ class Riscv64InterpreterTest : public ::testing::Test { // https://github.com/riscv/riscv-v-spec/pull/872 state_.cpu.vtype = BitUtilLog2(sizeof(ElementType)) << 3; state_.cpu.vl = 0; - constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType)); + constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType)); for (int vstart = 0; vstart <= kElementsCount * kNFfields; ++vstart) { state_.cpu.insn_addr = ToGuestAddr(&insn_bytes); state_.cpu.vstart = vstart; @@ -817,7 +817,9 @@ class Riscv64InterpreterTest : public ::testing::Test { for (int index = 0; index < 8; ++index) { SIMD128Register expected_state{kVectorComparisonSource[index]}; SIMD128Register source_value{kVectorComparisonSource[index + 8]}; - if (index < kNFfields) { + if ((vstart < kElementsCount * kNFfields) && index < kNFfields) { + // The usual property that no elements are written if vstart >= vl does not apply to these + // instructions. Instead, no elements are written if vstart >= evl. 
for (int element_index = 0; element_index < kElementsCount; ++element_index) { if (element_index + index * kElementsCount >= vstart) { expected_state.Set(source_value.Get<ElementType>(element_index), element_index); @@ -929,6 +931,14 @@ class Riscv64InterpreterTest : public ::testing::Test { } } + void TestVectorFloatInstruction(uint32_t insn_bytes, + const uint32_t (&expected_result_int32)[8][4], + const uint64_t (&expected_result_int64)[8][2], + const __v2du (&source)[16]) { + TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kDefault>( + insn_bytes, source, expected_result_int32, expected_result_int64); + } + void TestVectorInstruction(uint32_t insn_bytes, const uint8_t (&expected_result_int8)[8][16], const uint16_t (&expected_result_int16)[8][8], @@ -967,16 +977,46 @@ class Riscv64InterpreterTest : public ::testing::Test { expected_result_int64); } + void TestNarrowingVectorFloatInstruction(uint32_t insn_bytes, + const uint32_t (&expected_result_int32)[4][4], + const __v2du (&source)[16]) { + TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kNarrowing>( + insn_bytes, source, expected_result_int32); + } + + void TestNarrowingVectorFloatInstruction(uint32_t insn_bytes, + const uint16_t (&expected_result_int16)[4][8], + const uint32_t (&expected_result_int32)[4][4], + const __v2du (&source)[16]) { + TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kNarrowing>( + insn_bytes, source, expected_result_int16, expected_result_int32); + } + void TestNarrowingVectorInstruction(uint32_t insn_bytes, - const uint8_t (&expected_result_int8)[8][16], - const uint16_t (&expected_result_int16)[8][8], - const uint32_t (&expected_result_int32)[8][4], + const uint8_t (&expected_result_int8)[4][16], + const uint16_t (&expected_result_int16)[4][8], + const uint32_t (&expected_result_int32)[4][4], const __v2du (&source)[16]) { 
TestVectorInstruction<TestVectorInstructionKind::kInteger, TestVectorInstructionMode::kNarrowing>( insn_bytes, source, expected_result_int8, expected_result_int16, expected_result_int32); } + void TestWideningVectorFloatInstruction(uint32_t insn_bytes, + const uint64_t (&expected_result_int64)[8][2], + const __v2du (&source)[16]) { + TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kWidening>( + insn_bytes, source, expected_result_int64); + } + + void TestWideningVectorFloatInstruction(uint32_t insn_bytes, + const uint32_t (&expected_result_int32)[8][4], + const uint64_t (&expected_result_int64)[8][2], + const __v2du (&source)[16]) { + TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kWidening>( + insn_bytes, source, expected_result_int32, expected_result_int64); + } + void TestWideningVectorInstruction(uint32_t insn_bytes, const uint16_t (&expected_result_int16)[8][8], const uint32_t (&expected_result_int32)[8][4], @@ -993,10 +1033,12 @@ class Riscv64InterpreterTest : public ::testing::Test { template <TestVectorInstructionKind kTestVectorInstructionKind, TestVectorInstructionMode kTestVectorInstructionMode, typename... ElementType, + size_t... kResultsCount, size_t... kElementCount> - void TestVectorInstruction(uint32_t insn_bytes, - const __v2du (&source)[16], - const ElementType (&... expected_result)[8][kElementCount]) { + void TestVectorInstruction( + uint32_t insn_bytes, + const __v2du (&source)[16], + const ElementType (&... 
expected_result)[kResultsCount][kElementCount]) { auto Verify = [this, &source](uint32_t insn_bytes, uint8_t vsew, uint8_t vlmul_max, @@ -1895,6 +1937,265 @@ TEST_F(Riscv64InterpreterTest, TestVmXr) { TestVmvXr<8>(0x9f03b457); // Vmv8r.v v8, v16 } +TEST_F(Riscv64InterpreterTest, TestVfcvtxfv) { + TestVectorFloatInstruction(0x49801457, // Vfcvt.xu.f.v v8, v24, v0.t + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0xffff'ffff, 0xffff'ffff, 0x0000'6a21, 0x6e25'6c00}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}}, + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}}, + kVectorCalculationsSource); + TestVectorFloatInstruction(0x49809457, // Vfcvt.x.f.v v8, v24, v0.t + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x8000'0000, 0x8000'0000, 0xffff'cacf, 0xc8cd'6a00}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x7fff'ffff, 0x7fff'ffff, 0x0000'6a21, 0x6e25'6c00}, + {0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff}}, + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}, + {0x0000'0000'0000'0000, 
0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x7fff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff}, + {0x7fff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff}}, + kVectorCalculationsSource); + TestVectorFloatInstruction(0x49811457, // Vfcvt.f.xu.v v8, v24, v0.t + {{0x4f16'0492, 0x4f1e'0c9a, 0x4f06'1482, 0x4f0e'1c8a}, + {0x4f36'24b2, 0x4f3e'2cba, 0x4f26'34a2, 0x4f2e'3caa}, + {0x4f56'44d2, 0x4f5e'4cda, 0x4f46'54c2, 0x4f4e'5cca}, + {0x4f76'64f2, 0x4f7e'6cfa, 0x4f66'74e2, 0x4f6e'7cea}, + {0x4db4'2094, 0x4df4'60d4, 0x4cd2'8052, 0x4d69'c0aa}, + {0x4e5a'90ca, 0x4e7a'b0eb, 0x4e1a'd08b, 0x4e3a'f0ab}, + {0x4ead'88a6, 0x4ebd'98b6, 0x4e8d'a886, 0x4e9d'b896}, + {0x4eed'c8e6, 0x4efd'd8f6, 0x4ecd'e8c6, 0x4edd'f8d6}}, + {{0x43e3'c193'4132'c092, 0x43e1'c391'4310'c290}, + {0x43e7'c597'4536'c496, 0x43e5'c795'4714'c694}, + {0x43eb'c99b'493a'c89a, 0x43e9'cb99'4b18'ca98}, + {0x43ef'cd9f'4d3e'cc9e, 0x43ed'cf9d'4f1c'ce9c}, + {0x43be'8c1a'8916'8412, 0x43ad'3815'300d'2805}, + {0x43cf'561d'549b'5219, 0x43c7'5e15'5c13'5a11}, + {0x43d7'b316'b255'b115, 0x43d3'b712'b611'b511}, + {0x43df'bb1e'ba5d'b91d, 0x43db'bf1a'be19'bd19}}, + kVectorCalculationsSource); + TestVectorFloatInstruction(0x49819457, // Vfcvt.f.x.v v8, v24, v0.t + {{0xced3'f6dc, 0xcec3'e6cc, 0xcef3'd6fc, 0xcee3'c6ec}, + {0xce93'b69c, 0xce83'a68c, 0xceb3'96bc, 0xcea3'86ac}, + {0xce26'ecb7, 0xce06'cc97, 0xce66'acf7, 0xce46'8cd7}, + {0xcd19'b0da, 0xcbc9'82cc, 0xcdcc'58ec, 0xcd8c'18ac}, + {0x4db4'2094, 0x4df4'60d4, 0x4cd2'8052, 0x4d69'c0aa}, + {0x4e5a'90ca, 0x4e7a'b0eb, 0x4e1a'd08b, 0x4e3a'f0ab}, + {0x4ead'88a6, 0x4ebd'98b6, 0x4e8d'a886, 0x4e9d'b896}, + {0x4eed'c8e6, 0x4efd'd8f6, 0x4ecd'e8c6, 0x4edd'f8d6}}, + {{0xc3d8'7cd9'7d9a'7edc, 0xc3dc'78dd'79de'7adf}, + {0xc3d0'74d1'7592'76d3, 0xc3d4'70d5'71d6'72d7}, + {0xc3c0'd992'db14'dd97, 0xc3c8'd19a'd39c'd59f}, + {0xc379'3059'6099'b0da, 0xc3b1'8315'8719'8b1e}, + {0x43be'8c1a'8916'8412, 0x43ad'3815'300d'2805}, + {0x43cf'561d'549b'5219, 0x43c7'5e15'5c13'5a11}, + 
{0x43d7'b316'b255'b115, 0x43d3'b712'b611'b511}, + {0x43df'bb1e'ba5d'b91d, 0x43db'bf1a'be19'bd19}}, + kVectorCalculationsSource); + TestVectorFloatInstruction(0x49831457, // Vfcvt.rtz.xu.f.v v8, v24, v0.t + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0xffff'ffff, 0xffff'ffff, 0x0000'6a21, 0x6e25'6c00}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}}, + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}}, + kVectorCalculationsSource); + TestVectorFloatInstruction(0x49839457, // Vfcvt.rtz.x.f.v v8, v24, v0.t + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x8000'0000, 0x8000'0000, 0xffff'cad0, 0xc8cd'6a00}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x7fff'ffff, 0x7fff'ffff, 0x0000'6a21, 0x6e25'6c00}, + {0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff}}, + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x7fff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff}, + {0x7fff'ffff'ffff'ffff, 
0x7fff'ffff'ffff'ffff}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49c41457, // Vfwcvt.xu.f.v v8, v28, v0.t + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'6229'6000'0000, 0x662d'6480'0000'0000}, + {0x0000'0000'0000'6a21, 0x0000'0000'6e25'6c00}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49849457, // Vfwcvt.x.f.v v8, v24, v0.t + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0xffff'cecb'7000'0000, 0xccc9'6dc0'0000'0000}, + {0xffff'ffff'ffff'cacf, 0xffff'ffff'c8cd'6a00}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49861457, // Vfwcvt.f.f.v v8, v24, v0.t + {{0xbac0'9240'0000'0000, 0xbbc1'9341'2000'0000}, + {0xb8c2'9042'2000'0000, 0xb9c3'9143'0000'0000}, + {0xbec4'9644'0000'0000, 0xbfc5'9745'2000'0000}, + {0xbcc6'9446'2000'0000, 0xbdc7'9547'0000'0000}, + {0xc2c8'9a48'0000'0000, 0xc3c9'9b49'2000'0000}, + {0xc0ca'984a'2000'0000, 0xc1cb'994b'0000'0000}, + {0xc6cc'9e4c'0000'0000, 0xc7cd'9f4d'2000'0000}, + {0xc4ce'9c4e'2000'0000, 0xc5cf'9d4f'0000'0000}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49851457, // Vfwcvt.f.xu.v v8, v24, v0.t + {{0x4712'0000, 0x4716'0400, 0x471a'0900, 0x471e'0c00}, + {0x4702'1100, 0x4706'1400, 0x470a'1800, 0x470e'1c00}, + {0x4732'2000, 0x4736'2400, 0x473a'2900, 0x473e'2c00}, + {0x4722'3100, 0x4726'3400, 0x472a'3800, 0x472e'3c00}, + {0x4752'4000, 0x4756'4400, 0x475a'4900, 0x475e'4c00}, + {0x4742'5100, 0x4746'5400, 0x474a'5800, 
0x474e'5c00}, + {0x4772'6000, 0x4776'6400, 0x477a'6900, 0x477e'6c00}, + {0x4762'7100, 0x4766'7400, 0x476a'7800, 0x476e'7c00}}, + {{0x41e2'c092'4000'0000, 0x41e3'c193'4120'0000}, + {0x41e0'c290'4220'0000, 0x41e1'c391'4300'0000}, + {0x41e6'c496'4400'0000, 0x41e7'c597'4520'0000}, + {0x41e4'c694'4620'0000, 0x41e5'c795'4700'0000}, + {0x41ea'c89a'4800'0000, 0x41eb'c99b'4920'0000}, + {0x41e8'ca98'4a20'0000, 0x41e9'cb99'4b00'0000}, + {0x41ee'cc9e'4c00'0000, 0x41ef'cd9f'4d20'0000}, + {0x41ec'ce9c'4e20'0000, 0x41ed'cf9d'4f00'0000}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49859457, // Vfwcvt.f.x.v v8, v24, v0.t + {{0xc6dc'0000, 0xc6d3'f800, 0xc6cb'ee00, 0xc6c3'e800}, + {0xc6fb'de00, 0xc6f3'd800, 0xc6eb'd000, 0xc6e3'c800}, + {0xc69b'c000, 0xc693'b800, 0xc68b'ae00, 0xc683'a800}, + {0xc6bb'9e00, 0xc6b3'9800, 0xc6ab'9000, 0xc6a3'8800}, + {0xc637'0000, 0xc626'f000, 0xc616'dc00, 0xc606'd000}, + {0xc676'bc00, 0xc666'b000, 0xc656'a000, 0xc646'9000}, + {0xc55a'0000, 0xc519'c000, 0xc4b2'e000, 0xc3ca'0000}, + {0xc5ec'7800, 0xc5cc'6000, 0xc5ac'4000, 0xc58c'2000}}, + {{0xc1da'7edb'8000'0000, 0xc1d8'7cd9'7dc0'0000}, + {0xc1de'7adf'7bc0'0000, 0xc1dc'78dd'7a00'0000}, + {0xc1d2'76d3'7800'0000, 0xc1d0'74d1'75c0'0000}, + {0xc1d6'72d7'73c0'0000, 0xc1d4'70d5'7200'0000}, + {0xc1c4'dd96'e000'0000, 0xc1c0'd992'db80'0000}, + {0xc1cc'd59e'd780'0000, 0xc1c8'd19a'd400'0000}, + {0xc1a3'361b'4000'0000, 0xc179'3059'7000'0000}, + {0xc1b9'8b1d'8f00'0000, 0xc1b1'8315'8800'0000}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49c71457, // Vfwcvt.rtz.xu.f.v v8, v28, v0.t + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'6229'6000'0000, 0x662d'6480'0000'0000}, + {0x0000'0000'0000'6a21, 0x0000'0000'6e25'6c00}, + {0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}, + {0xffff'ffff'ffff'ffff, 
0xffff'ffff'ffff'ffff}}, + kVectorCalculationsSource); + TestWideningVectorFloatInstruction(0x49879457, // Vfwcvt.rtz.x.f.v v8, v24, v0.t + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0x0000'0000'0000'0000, 0x0000'0000'0000'0000}, + {0xffff'cecb'7000'0000, 0xccc9'6dc0'0000'0000}, + {0xffff'ffff'ffff'cad0, 0xffff'ffff'c8cd'6a00}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}, + {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction( + 0x49881457, // Vfncvt.xu.f.w v8, v24, v0.t + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x6a21, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction( + 0x49889457, // Vfncvt.x.f.w v8, v24, v0.t + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x8000, 0x8000, 0xcacf, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x7fff, 0x7fff, 0x6a21, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff}}, + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction(0x498a1457, // Vfncvt.f.f.w v8, v24, v0.t + {{0x8000'0000, 0x8000'0000, 0xb165'd14e, 0x8000'0000}, + {0xff80'0000, 0xff80'0000, 0xff80'0000, 0xff80'0000}, + 
{0x0000'0000, 0x0000'0000, 0x3561'd54a, 0x0000'0000}, + {0x7f80'0000, 0x7f80'0000, 0x7f80'0000, 0x7f80'0000}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction(0x49891457, // Vfncvt.f.xu.w v8, v24, v0.t + {{0x5f1e'0c9a, 0x5f0e'1c8a, 0x5f3e'2cba, 0x5f2e'3caa}, + {0x5f5e'4cda, 0x5f4e'5cca, 0x5f7e'6cfa, 0x5f6e'7cea}, + {0x5df4'60d4, 0x5d69'c0aa, 0x5e7a'b0eb, 0x5e3a'f0ab}, + {0x5ebd'98b6, 0x5e9d'b896, 0x5efd'd8f6, 0x5edd'f8d6}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction(0x49899457, // Vfncvt.f.x.w v8, v24, v0.t + {{0xdec3'e6cc, 0xdee3'c6ec, 0xde83'a68c, 0xdea3'86ac}, + {0xde06'cc97, 0xde46'8cd7, 0xdbc9'82cb, 0xdd8c'18ac}, + {0x5df4'60d4, 0x5d69'c0aa, 0x5e7a'b0eb, 0x5e3a'f0ab}, + {0x5ebd'98b6, 0x5e9d'b896, 0x5efd'd8f6, 0x5edd'f8d6}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction( + 0x498b1457, // Vfncvt.rtz.xu.f.w v8, v24, v0.t + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x6a21, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}}, + kVectorCalculationsSource); + TestNarrowingVectorFloatInstruction( + 0x498b9457, // Vfncvt.rtz.x.f.w v8, v24, v0.t + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x8000, 0x8000, 0xcad0, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x7fff, 0x7fff, 0x6a21, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff}}, + {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}, + {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000}, + {0x7fff'ffff, 0x7fff'ffff, 
0x7fff'ffff, 0x7fff'ffff}}, + kVectorCalculationsSource); +} + TEST_F(Riscv64InterpreterTest, TestVfmvfs) { TestVfmvfs<intrinsics::Float32>(0x428010d7, 0xffff'ffff'8302'8100); // Vfmv.f.s f1, v8 TestVfmvfs<intrinsics::Float64>(0x428010d7, 0x8706'8504'8302'8100); // Vfmv.f.s f1, v8 @@ -6886,6 +7187,24 @@ TEST_F(Riscv64InterpreterTest, TestVmin) { {0xaaaa'aaaa'aaaa'aaaa, 0xaaaa'aaaa'aaaa'aaaa}, {0xaaaa'aaaa'aaaa'aaaa, 0xaaaa'aaaa'aaaa'aaaa}}, kVectorCalculationsSourceLegacy); + TestVectorFloatInstruction(0x1100d457, // vfmin.vf v8, v16, f1, v0.t + {{0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000}, + {0xaaaa'aaaa, 0xaaaa'aaaa, 0xaaaa'aaaa, 0xaaaa'aaaa}, + {0xbbbb'bbbb, 0xbbbb'bbbb, 0xaaaa'aaaa, 0xaaaa'aaaa}, + {0xaaaa'aaaa, 0xaaaa'aaaa, 0x1111'1111, 0x1111'1111}, + {0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000}, + {0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000}, + {0xa9bb'bbbb, 0xa9bb'bbbb, 0xa9bb'bbbb, 0xa9bb'bbbb}, + {0xa9a9'a9a9, 0xa9a9'a9a9, 0xa9a9'a9a9, 0xa9a9'a9a9}}, + {{0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000}, + {0xaaaa'aaaa'aaaa'aaaa, 0xaaaa'aaaa'aaaa'aaaa}, + {0xbbbb'bbbb'bbbb'bbbb, 0xaaaa'aaaa'aaaa'aaaa}, + {0xaaaa'aaaa'aaaa'aaaa, 0x1111'1111'1111'1111}, + {0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000}, + {0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000}, + {0xa9bb'bbbb'a9bb'bbbb, 0xa9bb'bbbb'a9bb'bbbb}, + {0xa9a9'a9a9'a9a9'a9a9, 0xa9a9'a9a9'a9a9'a9a9}}, + kVectorComparisonSource); } TEST_F(Riscv64InterpreterTest, TestVmaxu) { @@ -7032,6 +7351,24 @@ TEST_F(Riscv64InterpreterTest, TestVmax) { {0xe766'e564'e362'e160, 0xef6e'ed6c'eb6a'e968}, {0xf776'f574'f372'f170, 0xff7e'fd7c'fb7a'f978}}, kVectorCalculationsSourceLegacy); + TestVectorFloatInstruction(0x1900d457, // vfmax.vf v8, v16, f1, v0.t + {{0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000}, + {0x40b4'0000, 0x40b4'0000, 0x40b4'0000, 0x40b4'0000}, + {0x40b4'0000, 0x40b4'0000, 0x40b4'0000, 0x40b4'0000}, + {0x40b4'0000, 0x40b4'0000, 0x40b4'0000, 0x40b4'0000}, + {0x7fc0'0000, 0x7fc0'0000, 
0x7fc0'0000, 0x7fc0'0000}, + {0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000, 0x7fc0'0000}, + {0x40b4'0000, 0x40b4'0000, 0x40b4'0000, 0x40b4'0000}, + {0x40b4'0000, 0x40b4'0000, 0x40b4'0000, 0x40b4'0000}}, + {{0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000}, + {0x4016'8000'0000'0000, 0x4016'8000'0000'0000}, + {0x4016'8000'0000'0000, 0x4016'8000'0000'0000}, + {0x4016'8000'0000'0000, 0x4016'8000'0000'0000}, + {0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000}, + {0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000}, + {0x4016'8000'0000'0000, 0x4016'8000'0000'0000}, + {0x4016'8000'0000'0000, 0x4016'8000'0000'0000}}, + kVectorComparisonSource); } TEST_F(Riscv64InterpreterTest, TestVredsum) { diff --git a/intrinsics/include/berberis/intrinsics/intrinsics_floating_point_impl.h b/intrinsics/include/berberis/intrinsics/intrinsics_floating_point_impl.h index c62ffbae..e4e705bf 100644 --- a/intrinsics/include/berberis/intrinsics/intrinsics_floating_point_impl.h +++ b/intrinsics/include/berberis/intrinsics/intrinsics_floating_point_impl.h @@ -95,9 +95,40 @@ std::tuple<TargetOperandType> FCvtFloatToInteger(int8_t rm, int8_t frm, SourceOp std::is_same_v<Float64, SourceOperandType>); static_assert(std::is_integral_v<TargetOperandType>); int8_t actual_rm = rm == FPFlags::DYN ? frm : rm; - TargetOperandType result = - static_cast<TargetOperandType>(FPRound(arg, ToIntrinsicRoundingMode(actual_rm))); - return static_cast<std::make_signed_t<TargetOperandType>>(result); + SourceOperandType result = FPRound(arg, ToIntrinsicRoundingMode(actual_rm)); + if constexpr (std::is_signed_v<TargetOperandType>) { + // Note: because of how two's complement numbers and floats work minimum negative number always + // either representable precisely or not prepresentable at all, but this is not true for minimal + // possible value. + // Use ~min() to guarantee no surprises with rounding. 
+ constexpr float kMinInBoundsNegativeValue = + static_cast<float>(std::numeric_limits<TargetOperandType>::min()); + constexpr float kMinNotInBoundsPositiveValue = static_cast<float>(-kMinInBoundsNegativeValue); + if (result < SourceOperandType{kMinInBoundsNegativeValue}) [[unlikely]] { + return std::numeric_limits<TargetOperandType>::min(); + } + // Note: we have to ensure that NaN is properly handled by this comparison! + if (result < SourceOperandType{kMinNotInBoundsPositiveValue}) [[likely]] { + return static_cast<TargetOperandType>(result); + } + } else { + // Note: if value is less than zero then result of conversion from float/double to unsigned + // integer is undefined and thus clang/gcc happily use conversion cvttss2si without doing + // anything to handle negative numbers. We need to handle that corner case here. + if (result < SourceOperandType{0.0f}) [[unlikely]] { + return 0; + } + // Similarly to signed interners case above, have to use -2.0f * min to properly handle NaNs. + constexpr float kMinNotInBoundsPositiveValue = static_cast<float>( + -2.0f * + static_cast<float>(std::numeric_limits<std::make_signed_t<TargetOperandType>>::min())); + // Note: we have to ensure that NaN is properly handled by this comparison! + if (result < SourceOperandType{kMinNotInBoundsPositiveValue}) [[likely]] { + return static_cast<TargetOperandType>(result); + } + } + // Handle too large numbers and NaN. 
+ return std::numeric_limits<TargetOperandType>::max(); } template <typename TargetOperandType, diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h index aa394204..e9e396eb 100644 --- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h +++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h @@ -481,6 +481,32 @@ inline std::tuple<SIMD128Register> Vmsofm(SIMD128Register simd_src) { return {std::get<0>(Vmsbfm(simd_src)) ^ std::get<0>(Vmsifm(simd_src))}; } +template <typename TargetElementType, + typename SourceElementType, + enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> +inline std::tuple<SIMD128Register> Vfcvtv(int8_t rm, int8_t frm, SIMD128Register src) { + SIMD128Register result; + constexpr int kElementsCount = + std::min(static_cast<int>(sizeof(SIMD128Register) / sizeof(TargetElementType)), + static_cast<int>(sizeof(SIMD128Register) / sizeof(SourceElementType))); + for (int index = 0; index < kElementsCount; ++index) { + if constexpr (std::is_integral_v<TargetElementType>) { + result.Set(std::get<0>(FCvtFloatToInteger<TargetElementType, SourceElementType>( + rm, frm, src.Get<SourceElementType>(index))), + index); + } else if constexpr (std::is_integral_v<SourceElementType>) { + result.Set(std::get<0>(FCvtIntegerToFloat<TargetElementType, SourceElementType>( + rm, frm, src.Get<SourceElementType>(index))), + index); + } else { + result.Set(std::get<0>(FCvtFloatToFloat<TargetElementType, SourceElementType>( + rm, frm, src.Get<SourceElementType>(index))), + index); + } + } + return result; +} + #define DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS(...) 
__VA_ARGS__ #define DEFINE_ARITHMETIC_INTRINSIC(Name, arithmetic, parameters, arguments) \ \ @@ -609,10 +635,12 @@ DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsub, auto [arg1, arg2, arg3] = std::tuple{a (-(arg2 * arg3) + arg1)) DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...}; (-(arg2 * arg3) + arg1)) -DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(min, (std::min(args...))) -DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(min, (std::min(args...))) -DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(max, (std::max(args...))) -DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(max, (std::max(args...))) +DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...))) +DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...))) +DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(min, std::min(args...)) +DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(min, std::min(args...)) +DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(max, std::max(args...)) +DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(max, std::max(args...)) DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redsum, (args + ...)) DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redand, (args & ...)) DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redor, (args | ...)) |