From 9b0ed4174e55200654db5fcb7a64ed7476e79988 Mon Sep 17 00:00:00 2001 From: Haines Sy Date: Mon, 22 Apr 2024 14:07:07 -0700 Subject: Add Rem intrinsics Test: m berberis_all Change-Id: I7eaa16e570038ecbf4a3a0548f79f72dff72b526 --- .../berberis/decoder/riscv64/semantics_player.h | 8 ++ heavy_optimizer/riscv64/frontend.cc | 38 ---------- heavy_optimizer/riscv64/inline_intrinsic.h | 11 ++- interpreter/riscv64/interpreter.h | 8 -- .../intrinsics/macro_assembler_arith_impl.h | 75 +++++++++++++++++++ intrinsics/riscv64_to_x86_64/intrinsic_def.json | 16 ++++ .../machine_ir_intrinsic_binding.json | 48 ++++++++++++ intrinsics/riscv64_to_x86_64/macro_def.json | 86 ++++++++++++++++++++++ .../riscv64_to_x86_64/inline_intrinsic.h | 11 +++ .../riscv64_to_x86_64/lite_translator.cc | 26 ------- 10 files changed, 254 insertions(+), 73 deletions(-) diff --git a/decoder/include/berberis/decoder/riscv64/semantics_player.h b/decoder/include/berberis/decoder/riscv64/semantics_player.h index a32ee177..7f4a20c0 100644 --- a/decoder/include/berberis/decoder/riscv64/semantics_player.h +++ b/decoder/include/berberis/decoder/riscv64/semantics_player.h @@ -415,6 +415,10 @@ class SemanticsPlayer { return listener_->template Div(arg1, arg2); case Decoder::OpOpcode::kDivu: return listener_->template Div(arg1, arg2); + case Decoder::OpOpcode::kRem: + return listener_->template Rem(arg1, arg2); + case Decoder::OpOpcode::kRemu: + return listener_->template Rem(arg1, arg2); case Decoder::OpOpcode::kMax: return listener_->template Max(arg1, arg2); case Decoder::OpOpcode::kMaxu: @@ -453,6 +457,10 @@ class SemanticsPlayer { return listener_->template Div(arg1, arg2); case Decoder::Op32Opcode::kDivuw: return listener_->template Div(arg1, arg2); + case Decoder::Op32Opcode::kRemw: + return listener_->template Rem(arg1, arg2); + case Decoder::Op32Opcode::kRemuw: + return listener_->template Rem(arg1, arg2); case Decoder::Op32Opcode::kRolw: return listener_->template Rol(arg1, arg2); case Decoder::Op32Opcode::kRorw: diff --git a/heavy_optimizer/riscv64/frontend.cc b/heavy_optimizer/riscv64/frontend.cc index 14846835..8dae1afc 100644 --- a/heavy_optimizer/riscv64/frontend.cc +++ b/heavy_optimizer/riscv64/frontend.cc @@ -360,25 +360,6 @@ Register HeavyOptimizerFrontend::Op(Decoder::OpOpcode opcode, Register arg1, Reg Gen(rax, rdx, arg2, GetFlagsRegister()); Gen(res, rdx, 8); } break; - case OpOpcode::kRem: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen(rax, arg1, 8); - Gen(rdx, rax, 8); - Gen(rdx, 63, GetFlagsRegister()); - Gen(rax, rdx, arg2, GetFlagsRegister()); - Gen(res, rdx, 8); - } break; - case OpOpcode::kRemu: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen(rax, arg1, 8); - // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate. - Gen(rdx); - Gen(rdx, rdx, GetFlagsRegister()); - Gen(rax, rdx, arg2, GetFlagsRegister()); - Gen(res, rdx, 8); - } break; case OpOpcode::kAndn: if (host_platform::kHasBMI) { Gen(res, arg2, arg1, GetFlagsRegister()); @@ -437,25 +418,6 @@ Register HeavyOptimizerFrontend::Op32(Decoder::Op32Opcode opcode, Register arg1, Gen(res, arg1, 4); Gen(res, arg2, GetFlagsRegister()); break; - case Op32Opcode::kRemw: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen(rax, arg1, 4); - Gen(rdx, rax, 4); - Gen(rdx, int8_t{31}, GetFlagsRegister()); - Gen(rax, rdx, arg2, GetFlagsRegister()); - unextended_res = rdx; - } break; - case Op32Opcode::kRemuw: { - auto rax = AllocTempReg(); - auto rdx = AllocTempReg(); - Gen(rax, arg1, 4); - // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate. - Gen(rdx); - Gen(rdx, rdx, GetFlagsRegister()); - Gen(rax, rdx, arg2, GetFlagsRegister()); - unextended_res = rdx; - } break; default: Undefined(); return {}; diff --git a/heavy_optimizer/riscv64/inline_intrinsic.h b/heavy_optimizer/riscv64/inline_intrinsic.h index 3d0f8162..f4664893 100644 --- a/heavy_optimizer/riscv64/inline_intrinsic.h +++ b/heavy_optimizer/riscv64/inline_intrinsic.h @@ -447,6 +447,14 @@ class TryBindingBasedInlineIntrinsicForHeavyOptimizer { static_assert(std::is_same_v); return std::tuple{std::get(input_args_)}; } + } else if constexpr (arg_info.arg_type == ArgInfo::OUT_TMP_ARG) { + if constexpr (kNumOut > 1) { + static_assert(kDependentTypeFalse>); + } else { + CHECK(implicit_result_reg_.IsInvalidReg()); + implicit_result_reg_ = AllocVReg(); + return std::tuple{implicit_result_reg_}; + } } else if constexpr (arg_info.arg_type == ArgInfo::OUT_ARG) { static_assert(!std::is_same_v); static_assert(std::is_same_v || @@ -545,7 +553,8 @@ class TryBindingBasedInlineIntrinsicForHeavyOptimizer { CHECK(!xmm_result_reg_.IsInvalidReg()); MovToResult(builder_, result_, xmm_result_reg_); } else if constexpr ((arg_info.arg_type == ArgInfo::OUT_ARG || - arg_info.arg_type == ArgInfo::IN_OUT_TMP_ARG) && + arg_info.arg_type == ArgInfo::IN_OUT_TMP_ARG || + arg_info.arg_type == ArgInfo::OUT_TMP_ARG) && RegisterClass::kIsImplicitReg) { CHECK(!implicit_result_reg_.IsInvalidReg()); MovToResult(builder_, result_, implicit_result_reg_); diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h index a6fd3f56..3a43bc68 100644 --- a/interpreter/riscv64/interpreter.h +++ b/interpreter/riscv64/interpreter.h @@ -189,10 +189,6 @@ class Interpreter { return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2)))); case Decoder::OpOpcode::kMulhu: return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2))); - case Decoder::OpOpcode::kRem: - return Int64(arg1) % Int64(arg2); - case Decoder::OpOpcode::kRemu: - return UInt64(arg1) % UInt64(arg2); case Decoder::OpOpcode::kAndn: return Int64(arg1) & (~Int64(arg2)); case Decoder::OpOpcode::kOrn: @@ -219,10 +215,6 @@ class Interpreter { return Widen(TruncateTo(arg1) >> TruncateTo(arg2)); case Decoder::Op32Opcode::kMulw: return Widen(TruncateTo(arg1) * TruncateTo(arg2)); - case Decoder::Op32Opcode::kRemw: - return Widen(TruncateTo(arg1) % TruncateTo(arg2)); - case Decoder::Op32Opcode::kRemuw: - return Widen(BitCastToSigned(TruncateTo(arg1) % TruncateTo(arg2))); default: Undefined(); return {}; diff --git a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h index cc5ce510..4e948359 100644 --- a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h +++ b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h @@ -82,6 +82,81 @@ void MacroAssembler::MacroDiv(Register src) { Bind(done); } + +// Divisor comes in "src", dividend comes in gpr_a, remainder is returned in gpr_d. +// gpr_a and FLAGS are clobbered by that macroinstruction. +template +template +void MacroAssembler::MacroRem(Register src) { + Label* zero = MakeLabel(); + Label* overflow = MakeLabel(); + Label* done = MakeLabel(); + Test(src, src); + Jcc(Condition::kZero, *zero); + + if constexpr (std::is_signed_v) { + Label* do_idiv = MakeLabel(); + // If min int32_t/int64_t is divided by -1 then in risc-v the result is + // the dividend, but x86 will raise an exception. Handle this case separately. + Cmp(src, int8_t{-1}); + Jcc(Condition::kNotEqual, *do_idiv); + + if constexpr (std::is_same_v) { + Cmp(gpr_a, + {.disp = constants_pool::kVectorConst::min()>}); + } else { + Cmp(gpr_a, std::numeric_limits::min()); + } + Jcc(Condition::kEqual, *overflow); + + Bind(do_idiv); + // If we are dealing with 8-bit signed case then we need to sign-extend %al into %ax. + if constexpr (std::is_same_v) { + Cbw(); + // We need to sign-extend gpr_a into gpr_d to ensure 32bit/64-bit/128-bit dividend is correct. + } else if constexpr (std::is_same_v) { + Cwd(); + } else if constexpr (std::is_same_v) { + Cdq(); + } else if constexpr (std::is_same_v) { + Cqo(); + } else { + static_assert(kDependentTypeFalse, "Unsupported format"); + } + } else if constexpr (std::is_same_v) { + // For 8bit unsigned case we need “xor %ah, %ah” instruction, but our assembler doesn't support + // %ah register. Use .byte to emit the required machine code. + TwoByte(uint16_t{0xe430}); + } else { + // We need to zero-extend eax into dx/edx/rdx to ensure 32-bit/64-bit/128-bit dividend is + // correct. + Xor(gpr_d, gpr_d); + } + + Div(src); + if constexpr (std::is_same_v || std::is_same_v) { + // For 8bit case the result is in %ah, but our assembler doesn't support + // %ah register. move %ah to %al + TwoByte(uint16_t{0xe086}); + } + Jmp(*done); + + Bind(zero); + if constexpr (std::is_same_v || std::is_same_v) { + Mov(gpr_a, src); + } else { + Mov(gpr_d, src); + } + Jmp(*done); + + Bind(overflow); + if constexpr (std::is_same_v || std::is_same_v) { + Xor(gpr_a, gpr_a); + } else { + Xor(gpr_d, gpr_d); + } + Bind(done); +} } // namespace berberis #endif // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_ARITH_IMPL_H_ diff --git a/intrinsics/riscv64_to_x86_64/intrinsic_def.json b/intrinsics/riscv64_to_x86_64/intrinsic_def.json index 831beb51..9102efeb 100644 --- a/intrinsics/riscv64_to_x86_64/intrinsic_def.json +++ b/intrinsics/riscv64_to_x86_64/intrinsic_def.json @@ -292,6 +292,22 @@ "in": [ "Type0", "Type0" ], "out": [ "Type0" ] }, + "Rem": { + "comment": "Integer remainder", + "class": "template", + "variants": [ + "int8_t", + "uint8_t", + "int16_t", + "uint16_t", + "int32_t", + "uint32_t", + "int64_t", + "uint64_t" + ], + "in": [ "Type0", "Type0" ], + "out": [ "Type0" ] + }, "FAdd": { "comment": "Floating point addition", "class": "template", diff --git a/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json b/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json index 1fe5eba3..e9e052b8 100644 --- a/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json +++ b/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json @@ -181,6 +181,54 @@ "insn": "DivUInt64", "in": [ 1, 0 ], "out": [ 1 ] + }, + { + "name": "Rem", + "insn": "RemInt8", + "in": [ 1, 0 ], + "out": [ 1 ] + }, + { + "name": "Rem", + "insn": "RemUInt8", + "in": [ 1, 0 ], + "out": [ 1 ] + }, + { + "name": "Rem", + "insn": "RemInt16", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem", + "insn": "RemUInt16", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem", + "insn": "RemInt32", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem", + "insn": "RemUInt32", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem", + "insn": "RemInt64", + "in": [ 1, 0 ], + "out": [ 2 ] + }, + { + "name": "Rem", + "insn": "RemUInt64", + "in": [ 1, 0 ], + "out": [ 2 ] }, { "name": "FAddHostRounding", diff --git a/intrinsics/riscv64_to_x86_64/macro_def.json b/intrinsics/riscv64_to_x86_64/macro_def.json index 45f0b0ad..1420d4aa 100644 --- a/intrinsics/riscv64_to_x86_64/macro_def.json +++ b/intrinsics/riscv64_to_x86_64/macro_def.json @@ -197,6 +197,92 @@ "asm": "MacroDiv", "mnemo": "MACRO_UDIV64" }, + { + "name": "RemInt8", + "args": [ + { "class": "GeneralReg8", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_REM8" + }, + { + "name": "RemInt16", + "args": [ + { "class": "GeneralReg16", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "DX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_REM16" + }, + { + "name": "RemInt32", + "args": [ + { "class": "GeneralReg32", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_REM32" + }, + { + "name": "RemInt64", + "args": [ + { "class": "GeneralReg64", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_REM64" + }, + { + "name": "RemUInt8", + "args": [ + { "class": "GeneralReg8", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_UREM8" + }, + { + "name": "RemUInt16", + "args": [ + { "class": "GeneralReg16", "usage": "use" }, + { "class": "AX", "usage": "use_def" }, + { "class": "DX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_UREM16" + }, + { + "name": "RemUInt32", + "args": [ + { "class": "GeneralReg32", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_UREM32" + }, + { + "name": "RemUInt64", + "args": [ + { "class": "GeneralReg64", "usage": "use" }, + { "class": "EAX", "usage": "use_def" }, + { "class": "EDX", "usage": "def_early_clobber" }, + { "class": "FLAGS", "usage": "def" } + ], + "asm": "MacroRem", + "mnemo": "MACRO_UREM64" + }, { "name": "MacroFCvtFloat32ToInt32", "args": [ diff --git a/lite_translator/riscv64_to_x86_64/inline_intrinsic.h b/lite_translator/riscv64_to_x86_64/inline_intrinsic.h index c870a048..fa930da1 100644 --- a/lite_translator/riscv64_to_x86_64/inline_intrinsic.h +++ b/lite_translator/riscv64_to_x86_64/inline_intrinsic.h @@ -373,6 +373,10 @@ class TryBindingBasedInlineIntrinsic { Mov>( as_, as_.rcx, std::get(input_args_)); return std::tuple{}; + } else if constexpr (RegisterClass::kAsRegister == 'a') { + Mov>( + as_, as_.rax, std::get(input_args_)); + return std::tuple{}; } else { static_assert(std::is_same_v); static_assert(!RegisterClass::kIsImplicitReg); @@ -415,6 +419,13 @@ class TryBindingBasedInlineIntrinsic { return std::tuple{result_}; } } + } else if constexpr (arg_info.arg_type == ArgInfo::OUT_TMP_ARG) { + if constexpr (RegisterClass::kAsRegister == 'd') { + result_reg_ = as_.rdx; + return std::tuple{}; + } else { + static_assert(kDependentValueFalse); + } } else if constexpr (arg_info.arg_type == ArgInfo::TMP_ARG) { static_assert(std::is_same_v || std::is_same_v); diff --git a/lite_translator/riscv64_to_x86_64/lite_translator.cc b/lite_translator/riscv64_to_x86_64/lite_translator.cc index 1c587472..91091b65 100644 --- a/lite_translator/riscv64_to_x86_64/lite_translator.cc +++ b/lite_translator/riscv64_to_x86_64/lite_translator.cc @@ -105,19 +105,6 @@ Register LiteTranslator::Op(Decoder::OpOpcode opcode, Register arg1, Register ar as_.Mulq(arg2); as_.Movq(res, as_.rdx); break; - case OpOpcode::kRem: - as_.Movq(as_.rax, arg1); - as_.Movq(as_.rdx, as_.rax); - as_.Sarq(as_.rdx, int8_t{63}); - as_.Idivq(arg2); - as_.Movq(res, opcode == OpOpcode::kDiv ? as_.rax : as_.rdx); - break; - case OpOpcode::kRemu: - as_.Movq(as_.rax, arg1); - as_.Xorq(as_.rdx, as_.rdx); - as_.Divq(arg2); - as_.Movq(res, opcode == OpOpcode::kDivu ? as_.rax : as_.rdx); - break; case Decoder::OpOpcode::kAndn: if (host_platform::kHasBMI) { as_.Andnq(res, arg2, arg1); @@ -179,19 +166,6 @@ Register LiteTranslator::Op32(Decoder::Op32Opcode opcode, Register arg1, Registe as_.Imull(res, arg2); as_.Movsxlq(res, res); break; - case Op32Opcode::kRemw: - as_.Movl(as_.rax, arg1); - as_.Movl(as_.rdx, as_.rax); - as_.Sarl(as_.rdx, int8_t{31}); - as_.Idivl(arg2); - as_.Movsxlq(res, as_.rdx); - break; - case Op32Opcode::kRemuw: - as_.Movl(as_.rax, arg1); - as_.Xorl(as_.rdx, as_.rdx); - as_.Divl(arg2); - as_.Movsxlq(res, as_.rdx); - break; default: Undefined(); return {}; -- cgit v1.2.3