diff options
author | Haines Sy <hainesy@google.com> | 2024-04-22 14:07:07 -0700 |
---|---|---|
committer | Haines Sy <hainesy@google.com> | 2024-04-22 16:21:32 -0700 |
commit | 9b0ed4174e55200654db5fcb7a64ed7476e79988 (patch) | |
tree | 72f5faa82100ca88bc94e4ace94b91a30f930781 | |
parent | 1aa28e380c02e4080592c2e48c88555ed682e9d2 (diff) | |
download | binary_translation-9b0ed4174e55200654db5fcb7a64ed7476e79988.tar.gz |
Add Rem intrinsics
Test: m berberis_all
Change-Id: I7eaa16e570038ecbf4a3a0548f79f72dff72b526
10 files changed, 254 insertions, 73 deletions
diff --git a/decoder/include/berberis/decoder/riscv64/semantics_player.h b/decoder/include/berberis/decoder/riscv64/semantics_player.h index a32ee177..7f4a20c0 100644 --- a/decoder/include/berberis/decoder/riscv64/semantics_player.h +++ b/decoder/include/berberis/decoder/riscv64/semantics_player.h @@ -415,6 +415,10 @@ class SemanticsPlayer { return listener_->template Div<int64_t>(arg1, arg2); case Decoder::OpOpcode::kDivu: return listener_->template Div<uint64_t>(arg1, arg2); + case Decoder::OpOpcode::kRem: + return listener_->template Rem<int64_t>(arg1, arg2); + case Decoder::OpOpcode::kRemu: + return listener_->template Rem<uint64_t>(arg1, arg2); case Decoder::OpOpcode::kMax: return listener_->template Max<int64_t>(arg1, arg2); case Decoder::OpOpcode::kMaxu: @@ -453,6 +457,10 @@ class SemanticsPlayer { return listener_->template Div<int32_t>(arg1, arg2); case Decoder::Op32Opcode::kDivuw: return listener_->template Div<uint32_t>(arg1, arg2); + case Decoder::Op32Opcode::kRemw: + return listener_->template Rem<int32_t>(arg1, arg2); + case Decoder::Op32Opcode::kRemuw: + return listener_->template Rem<uint32_t>(arg1, arg2); case Decoder::Op32Opcode::kRolw: return listener_->template Rol<int32_t>(arg1, arg2); case Decoder::Op32Opcode::kRorw: diff --git a/heavy_optimizer/riscv64/frontend.cc b/heavy_optimizer/riscv64/frontend.cc index 14846835..8dae1afc 100644 --- a/heavy_optimizer/riscv64/frontend.cc +++ b/heavy_optimizer/riscv64/frontend.cc @@ -360,25 +360,6 @@ Register HeavyOptimizerFrontend::Op(Decoder::OpOpcode opcode, Register arg1, Reg Gen<x86_64::MulqRegRegReg>(rax, rdx, arg2, GetFlagsRegister()); Gen<PseudoCopy>(res, rdx, 8); } break; - case OpOpcode::kRem: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen<PseudoCopy>(rax, arg1, 8); - Gen<PseudoCopy>(rdx, rax, 8); - Gen<x86_64::SarqRegImm>(rdx, 63, GetFlagsRegister()); - Gen<x86_64::IdivqRegRegReg>(rax, rdx, arg2, GetFlagsRegister()); - Gen<PseudoCopy>(res, rdx, 8); - } break; - case OpOpcode::kRemu: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen<PseudoCopy>(rax, arg1, 8); - // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate. - Gen<PseudoDefReg>(rdx); - Gen<x86_64::XorqRegReg>(rdx, rdx, GetFlagsRegister()); - Gen<x86_64::DivqRegRegReg>(rax, rdx, arg2, GetFlagsRegister()); - Gen<PseudoCopy>(res, rdx, 8); - } break; case OpOpcode::kAndn: if (host_platform::kHasBMI) { Gen<x86_64::AndnqRegRegReg>(res, arg2, arg1, GetFlagsRegister()); @@ -437,25 +418,6 @@ Register HeavyOptimizerFrontend::Op32(Decoder::Op32Opcode opcode, Register arg1, Gen<PseudoCopy>(res, arg1, 4); Gen<x86_64::ImullRegReg>(res, arg2, GetFlagsRegister()); break; - case Op32Opcode::kRemw: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen<PseudoCopy>(rax, arg1, 4); - Gen<PseudoCopy>(rdx, rax, 4); - Gen<x86_64::SarlRegImm>(rdx, int8_t{31}, GetFlagsRegister()); - Gen<x86_64::IdivlRegRegReg>(rax, rdx, arg2, GetFlagsRegister()); - unextended_res = rdx; - } break; - case Op32Opcode::kRemuw: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen<PseudoCopy>(rax, arg1, 4); - // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate. - Gen<PseudoDefReg>(rdx); - Gen<x86_64::XorlRegReg>(rdx, rdx, GetFlagsRegister()); - Gen<x86_64::DivlRegRegReg>(rax, rdx, arg2, GetFlagsRegister()); - unextended_res = rdx; - } break; default: Undefined(); return {}; diff --git a/heavy_optimizer/riscv64/inline_intrinsic.h b/heavy_optimizer/riscv64/inline_intrinsic.h index 3d0f8162..f4664893 100644 --- a/heavy_optimizer/riscv64/inline_intrinsic.h +++ b/heavy_optimizer/riscv64/inline_intrinsic.h @@ -447,6 +447,14 @@ class TryBindingBasedInlineIntrinsicForHeavyOptimizer { static_assert(std::is_same_v<Usage, intrinsics::bindings::UseDef>); return std::tuple{std::get<arg_info.from>(input_args_)}; } + } else if constexpr (arg_info.arg_type == ArgInfo::OUT_TMP_ARG) { + if constexpr (kNumOut > 1) { + static_assert(kDependentTypeFalse<ArgTraits<ArgBinding>>); + } else { + CHECK(implicit_result_reg_.IsInvalidReg()); + implicit_result_reg_ = AllocVReg(); + return std::tuple{implicit_result_reg_}; + } } else if constexpr (arg_info.arg_type == ArgInfo::OUT_ARG) { static_assert(!std::is_same_v<ResType, std::monostate>); static_assert(std::is_same_v<Usage, intrinsics::bindings::Def> || @@ -545,7 +553,8 @@ class TryBindingBasedInlineIntrinsicForHeavyOptimizer { CHECK(!xmm_result_reg_.IsInvalidReg()); MovToResult<RegisterClass>(builder_, result_, xmm_result_reg_); } else if constexpr ((arg_info.arg_type == ArgInfo::OUT_ARG || - arg_info.arg_type == ArgInfo::IN_OUT_TMP_ARG) && + arg_info.arg_type == ArgInfo::IN_OUT_TMP_ARG || + arg_info.arg_type == ArgInfo::OUT_TMP_ARG) && RegisterClass::kIsImplicitReg) { CHECK(!implicit_result_reg_.IsInvalidReg()); MovToResult<RegisterClass>(builder_, result_, implicit_result_reg_); diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h index a6fd3f56..3a43bc68 100644 --- a/interpreter/riscv64/interpreter.h +++ b/interpreter/riscv64/interpreter.h @@ -189,10 +189,6 @@ class Interpreter { return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2)))); case Decoder::OpOpcode::kMulhu: return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2))); - case Decoder::OpOpcode::kRem: - return Int64(arg1) % Int64(arg2); - case Decoder::OpOpcode::kRemu: - return UInt64(arg1) % UInt64(arg2); case Decoder::OpOpcode::kAndn: return Int64(arg1) & (~Int64(arg2)); case Decoder::OpOpcode::kOrn: @@ -219,10 +215,6 @@ class Interpreter { return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2)); case Decoder::Op32Opcode::kMulw: return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2)); - case Decoder::Op32Opcode::kRemw: - return Widen(TruncateTo<Int32>(arg1) % TruncateTo<Int32>(arg2)); - case Decoder::Op32Opcode::kRemuw: - return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) % TruncateTo<UInt32>(arg2))); default: Undefined(); return {}; diff --git a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h index cc5ce510..4e948359 100644 --- a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h +++ b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h @@ -82,6 +82,81 @@ void MacroAssembler<Assembler>::MacroDiv(Register src) { Bind(done); } + +// Divisor comes in "src", dividend comes in gpr_a, remainder is returned in gpr_d. +// gpr_a and FLAGS are clobbered by that macroinstruction. +template <typename Assembler> +template <typename IntType> +void MacroAssembler<Assembler>::MacroRem(Register src) { + Label* zero = MakeLabel(); + Label* overflow = MakeLabel(); + Label* done = MakeLabel(); + Test<IntType>(src, src); + Jcc(Condition::kZero, *zero); + + if constexpr (std::is_signed_v<IntType>) { + Label* do_idiv = MakeLabel(); + // If min int32_t/int64_t is divided by -1 then in risc-v the result is + // the dividend, but x86 will raise an exception. Handle this case separately. + Cmp<IntType>(src, int8_t{-1}); + Jcc(Condition::kNotEqual, *do_idiv); + + if constexpr (std::is_same_v<IntType, int64_t>) { + Cmp<IntType>(gpr_a, + {.disp = constants_pool::kVectorConst<std::numeric_limits<IntType>::min()>}); + } else { + Cmp<IntType>(gpr_a, std::numeric_limits<IntType>::min()); + } + Jcc(Condition::kEqual, *overflow); + + Bind(do_idiv); + // If we are dealing with 8-bit signed case then we need to sign-extend %al into %ax. + if constexpr (std::is_same_v<IntType, int8_t>) { + Cbw(); + // We need to sign-extend gpr_a into gpr_d to ensure 32bit/64-bit/128-bit dividend is correct. + } else if constexpr (std::is_same_v<IntType, int16_t>) { + Cwd(); + } else if constexpr (std::is_same_v<IntType, int32_t>) { + Cdq(); + } else if constexpr (std::is_same_v<IntType, int64_t>) { + Cqo(); + } else { + static_assert(kDependentTypeFalse<IntType>, "Unsupported format"); + } + } else if constexpr (std::is_same_v<IntType, uint8_t>) { + // For 8bit unsigned case we need “xor %ah, %ah” instruction, but our assembler doesn't support + // %ah register. Use .byte to emit the required machine code. + TwoByte(uint16_t{0xe430}); + } else { + // We need to zero-extend eax into dx/edx/rdx to ensure 32-bit/64-bit/128-bit dividend is + // correct. + Xor<uint64_t>(gpr_d, gpr_d); + } + + Div<IntType>(src); + if constexpr (std::is_same_v<IntType, uint8_t> || std::is_same_v<IntType, int8_t>) { + // For 8bit case the result is in %ah, but our assembler doesn't support + // %ah register. move %ah to %al + TwoByte(uint16_t{0xe086}); + } + Jmp(*done); + + Bind(zero); + if constexpr (std::is_same_v<IntType, uint8_t> || std::is_same_v<IntType, int8_t>) { + Mov<int8_t>(gpr_a, src); + } else { + Mov<IntType>(gpr_d, src); + } + Jmp(*done); + + Bind(overflow); + if constexpr (std::is_same_v<IntType, uint8_t> || std::is_same_v<IntType, int8_t>) { + Xor<int8_t>(gpr_a, gpr_a); + } else { + Xor<IntType>(gpr_d, gpr_d); + } + Bind(done); +} } // namespace berberis #endif // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_ARITH_IMPL_H_ diff --git a/intrinsics/riscv64_to_x86_64/intrinsic_def.json b/intrinsics/riscv64_to_x86_64/intrinsic_def.json index 831beb51..9102efeb 100644 --- a/intrinsics/riscv64_to_x86_64/intrinsic_def.json +++ b/intrinsics/riscv64_to_x86_64/intrinsic_def.json @@ -292,6 +292,22 @@ "in": [ "Type0", "Type0" ], "out": [ "Type0" ] }, + "Rem": { + "comment": "Integer remainder", + "class": "template", + "variants": [ + "int8_t", + "uint8_t", + "int16_t", + "uint16_t", + "int32_t", + "uint32_t", + "int64_t", + "uint64_t" + ], + "in": [ "Type0", "Type0" ], + "out": [ "Type0" ] + }, "FAdd": { "comment": "Floating point addition", "class": "template", diff --git a/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json b/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json index 1fe5eba3..e9e052b8 100644 --- a/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json +++ b/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json @@ -182,6 +182,54 @@ "in": [ 1, 0 ], "out": [ 1 ] }, + { + "name": "Rem<int8_t>", + "insn": "RemInt8", + "in": [ 1, 0 ], + "out": [ 1 ] + }, + { + "name": "Rem<uint8_t>", + "insn": "RemUInt8", + "in": [ 1, 0 ], + "out": [ 1 ] + }, + { + "name": "Rem<int16_t>", + "insn": "RemInt16", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem<uint16_t>", + "insn": "RemUInt16", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem<int32_t>", + "insn": "RemInt32", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem<uint32_t>", + "insn": "RemUInt32", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem<int64_t>", + "insn": "RemInt64", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem<uint64_t>", + "insn": "RemUInt64", + "in": [ 1, 0 ], + "out": [ 2 ] + }, { "name": "FAddHostRounding<Float32>", "insn": "AddssXRegXReg", diff --git a/intrinsics/riscv64_to_x86_64/macro_def.json b/intrinsics/riscv64_to_x86_64/macro_def.json index 45f0b0ad..1420d4aa 100644 --- a/intrinsics/riscv64_to_x86_64/macro_def.json +++ b/intrinsics/riscv64_to_x86_64/macro_def.json @@ -198,6 +198,92 @@ "mnemo": "MACRO_UDIV64" }, { + "name": "RemInt8", + "args": [ + { "class": "GeneralReg8", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<int8_t>", + "mnemo": "MACRO_REM8" + }, + { + "name": "RemInt16", + "args": [ + { "class": "GeneralReg16", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "DX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<int16_t>", + "mnemo": "MACRO_REM16" + }, + { + "name": "RemInt32", + "args": [ + { "class": "GeneralReg32", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<int32_t>", + "mnemo": "MACRO_REM32" + }, + { + "name": "RemInt64", + "args": [ + { "class": "GeneralReg64", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<int64_t>", + "mnemo": "MACRO_REM64" + }, + { + "name": "RemUInt8", + "args": [ + { "class": "GeneralReg8", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<uint8_t>", + "mnemo": "MACRO_UREM8" + }, + { + "name": "RemUInt16", + "args": [ + { "class": "GeneralReg16", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "DX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<uint16_t>", + "mnemo": "MACRO_UREM16" + }, + { + "name": "RemUInt32", + "args": [ + { "class": "GeneralReg32", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<uint32_t>", + "mnemo": "MACRO_UREM32" + }, + { + "name": "RemUInt64", + "args": [ + { "class": "GeneralReg64", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem<uint64_t>", + "mnemo": "MACRO_UREM64" + }, + { "name": "MacroFCvtFloat32ToInt32", "args": [ { "class": "GeneralReg64", "usage": "def" }, diff --git a/lite_translator/riscv64_to_x86_64/inline_intrinsic.h b/lite_translator/riscv64_to_x86_64/inline_intrinsic.h index c870a048..fa930da1 100644 --- a/lite_translator/riscv64_to_x86_64/inline_intrinsic.h +++ b/lite_translator/riscv64_to_x86_64/inline_intrinsic.h @@ -373,6 +373,10 @@ class TryBindingBasedInlineIntrinsic { Mov<std::tuple_element_t<arg_info.from, typename AsmCallInfo::InputArguments>>( as_, as_.rcx, std::get<arg_info.from>(input_args_)); return std::tuple{}; + } else if constexpr (RegisterClass::kAsRegister == 'a') { + Mov<std::tuple_element_t<arg_info.from, typename AsmCallInfo::InputArguments>>( + as_, as_.rax, std::get<arg_info.from>(input_args_)); + return std::tuple{}; } else { static_assert(std::is_same_v<Usage, intrinsics::bindings::UseDef>); static_assert(!RegisterClass::kIsImplicitReg); @@ -415,6 +419,13 @@ class TryBindingBasedInlineIntrinsic { return std::tuple{result_}; } } + } else if constexpr (arg_info.arg_type == ArgInfo::OUT_TMP_ARG) { + if constexpr (RegisterClass::kAsRegister == 'd') { + result_reg_ = as_.rdx; + return std::tuple{}; + } else { + static_assert(kDependentValueFalse<arg_info.arg_type>); + } } else if constexpr (arg_info.arg_type == ArgInfo::TMP_ARG) { static_assert(std::is_same_v<Usage, intrinsics::bindings::Def> || std::is_same_v<Usage, intrinsics::bindings::DefEarlyClobber>); diff --git a/lite_translator/riscv64_to_x86_64/lite_translator.cc b/lite_translator/riscv64_to_x86_64/lite_translator.cc index 1c587472..91091b65 100644 --- a/lite_translator/riscv64_to_x86_64/lite_translator.cc +++ b/lite_translator/riscv64_to_x86_64/lite_translator.cc @@ -105,19 +105,6 @@ Register LiteTranslator::Op(Decoder::OpOpcode opcode, Register arg1, Register ar as_.Mulq(arg2); as_.Movq(res, as_.rdx); break; - case OpOpcode::kRem: - as_.Movq(as_.rax, arg1); - as_.Movq(as_.rdx, as_.rax); - as_.Sarq(as_.rdx, int8_t{63}); - as_.Idivq(arg2); - as_.Movq(res, opcode == OpOpcode::kDiv ? as_.rax : as_.rdx); - break; - case OpOpcode::kRemu: - as_.Movq(as_.rax, arg1); - as_.Xorq(as_.rdx, as_.rdx); - as_.Divq(arg2); - as_.Movq(res, opcode == OpOpcode::kDivu ? as_.rax : as_.rdx); - break; case Decoder::OpOpcode::kAndn: if (host_platform::kHasBMI) { as_.Andnq(res, arg2, arg1); @@ -179,19 +166,6 @@ Register LiteTranslator::Op32(Decoder::Op32Opcode opcode, Register arg1, Registe as_.Imull(res, arg2); as_.Movsxlq(res, res); break; - case Op32Opcode::kRemw: - as_.Movl(as_.rax, arg1); - as_.Movl(as_.rdx, as_.rax); - as_.Sarl(as_.rdx, int8_t{31}); - as_.Idivl(arg2); - as_.Movsxlq(res, as_.rdx); - break; - case Op32Opcode::kRemuw: - as_.Movl(as_.rax, arg1); - as_.Xorl(as_.rdx, as_.rdx); - as_.Divl(arg2); - as_.Movsxlq(res, as_.rdx); - break; default: Undefined(); return {}; |