aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHaines Sy <hainesy@google.com>2024-04-22 14:07:07 -0700
committerHaines Sy <hainesy@google.com>2024-04-22 16:21:32 -0700
commit9b0ed4174e55200654db5fcb7a64ed7476e79988 (patch)
tree72f5faa82100ca88bc94e4ace94b91a30f930781
parent1aa28e380c02e4080592c2e48c88555ed682e9d2 (diff)
downloadbinary_translation-9b0ed4174e55200654db5fcb7a64ed7476e79988.tar.gz
Add Rem intrinsics
Test: m berberis_all Change-Id: I7eaa16e570038ecbf4a3a0548f79f72dff72b526
-rw-r--r--decoder/include/berberis/decoder/riscv64/semantics_player.h8
-rw-r--r--heavy_optimizer/riscv64/frontend.cc38
-rw-r--r--heavy_optimizer/riscv64/inline_intrinsic.h11
-rw-r--r--interpreter/riscv64/interpreter.h8
-rw-r--r--intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h75
-rw-r--r--intrinsics/riscv64_to_x86_64/intrinsic_def.json16
-rw-r--r--intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json48
-rw-r--r--intrinsics/riscv64_to_x86_64/macro_def.json86
-rw-r--r--lite_translator/riscv64_to_x86_64/inline_intrinsic.h11
-rw-r--r--lite_translator/riscv64_to_x86_64/lite_translator.cc26
10 files changed, 254 insertions, 73 deletions
diff --git a/decoder/include/berberis/decoder/riscv64/semantics_player.h b/decoder/include/berberis/decoder/riscv64/semantics_player.h
index a32ee177..7f4a20c0 100644
--- a/decoder/include/berberis/decoder/riscv64/semantics_player.h
+++ b/decoder/include/berberis/decoder/riscv64/semantics_player.h
@@ -415,6 +415,10 @@ class SemanticsPlayer {
return listener_->template Div<int64_t>(arg1, arg2);
case Decoder::OpOpcode::kDivu:
return listener_->template Div<uint64_t>(arg1, arg2);
+ case Decoder::OpOpcode::kRem:
+ return listener_->template Rem<int64_t>(arg1, arg2);
+ case Decoder::OpOpcode::kRemu:
+ return listener_->template Rem<uint64_t>(arg1, arg2);
case Decoder::OpOpcode::kMax:
return listener_->template Max<int64_t>(arg1, arg2);
case Decoder::OpOpcode::kMaxu:
@@ -453,6 +457,10 @@ class SemanticsPlayer {
return listener_->template Div<int32_t>(arg1, arg2);
case Decoder::Op32Opcode::kDivuw:
return listener_->template Div<uint32_t>(arg1, arg2);
+ case Decoder::Op32Opcode::kRemw:
+ return listener_->template Rem<int32_t>(arg1, arg2);
+ case Decoder::Op32Opcode::kRemuw:
+ return listener_->template Rem<uint32_t>(arg1, arg2);
case Decoder::Op32Opcode::kRolw:
return listener_->template Rol<int32_t>(arg1, arg2);
case Decoder::Op32Opcode::kRorw:
diff --git a/heavy_optimizer/riscv64/frontend.cc b/heavy_optimizer/riscv64/frontend.cc
index 14846835..8dae1afc 100644
--- a/heavy_optimizer/riscv64/frontend.cc
+++ b/heavy_optimizer/riscv64/frontend.cc
@@ -360,25 +360,6 @@ Register HeavyOptimizerFrontend::Op(Decoder::OpOpcode opcode, Register arg1, Reg
Gen<x86_64::MulqRegRegReg>(rax, rdx, arg2, GetFlagsRegister());
Gen<PseudoCopy>(res, rdx, 8);
} break;
- case OpOpcode::kRem: {
- auto rax = AllocTempReg();
- auto rdx = AllocTempReg();
- Gen<PseudoCopy>(rax, arg1, 8);
- Gen<PseudoCopy>(rdx, rax, 8);
- Gen<x86_64::SarqRegImm>(rdx, 63, GetFlagsRegister());
- Gen<x86_64::IdivqRegRegReg>(rax, rdx, arg2, GetFlagsRegister());
- Gen<PseudoCopy>(res, rdx, 8);
- } break;
- case OpOpcode::kRemu: {
- auto rax = AllocTempReg();
- auto rdx = AllocTempReg();
- Gen<PseudoCopy>(rax, arg1, 8);
- // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate.
- Gen<PseudoDefReg>(rdx);
- Gen<x86_64::XorqRegReg>(rdx, rdx, GetFlagsRegister());
- Gen<x86_64::DivqRegRegReg>(rax, rdx, arg2, GetFlagsRegister());
- Gen<PseudoCopy>(res, rdx, 8);
- } break;
case OpOpcode::kAndn:
if (host_platform::kHasBMI) {
Gen<x86_64::AndnqRegRegReg>(res, arg2, arg1, GetFlagsRegister());
@@ -437,25 +418,6 @@ Register HeavyOptimizerFrontend::Op32(Decoder::Op32Opcode opcode, Register arg1,
Gen<PseudoCopy>(res, arg1, 4);
Gen<x86_64::ImullRegReg>(res, arg2, GetFlagsRegister());
break;
- case Op32Opcode::kRemw: {
- auto rax = AllocTempReg();
- auto rdx = AllocTempReg();
- Gen<PseudoCopy>(rax, arg1, 4);
- Gen<PseudoCopy>(rdx, rax, 4);
- Gen<x86_64::SarlRegImm>(rdx, int8_t{31}, GetFlagsRegister());
- Gen<x86_64::IdivlRegRegReg>(rax, rdx, arg2, GetFlagsRegister());
- unextended_res = rdx;
- } break;
- case Op32Opcode::kRemuw: {
- auto rax = AllocTempReg();
- auto rdx = AllocTempReg();
- Gen<PseudoCopy>(rax, arg1, 4);
- // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate.
- Gen<PseudoDefReg>(rdx);
- Gen<x86_64::XorlRegReg>(rdx, rdx, GetFlagsRegister());
- Gen<x86_64::DivlRegRegReg>(rax, rdx, arg2, GetFlagsRegister());
- unextended_res = rdx;
- } break;
default:
Undefined();
return {};
diff --git a/heavy_optimizer/riscv64/inline_intrinsic.h b/heavy_optimizer/riscv64/inline_intrinsic.h
index 3d0f8162..f4664893 100644
--- a/heavy_optimizer/riscv64/inline_intrinsic.h
+++ b/heavy_optimizer/riscv64/inline_intrinsic.h
@@ -447,6 +447,14 @@ class TryBindingBasedInlineIntrinsicForHeavyOptimizer {
static_assert(std::is_same_v<Usage, intrinsics::bindings::UseDef>);
return std::tuple{std::get<arg_info.from>(input_args_)};
}
+ } else if constexpr (arg_info.arg_type == ArgInfo::OUT_TMP_ARG) {
+ if constexpr (kNumOut > 1) {
+ static_assert(kDependentTypeFalse<ArgTraits<ArgBinding>>);
+ } else {
+ CHECK(implicit_result_reg_.IsInvalidReg());
+ implicit_result_reg_ = AllocVReg();
+ return std::tuple{implicit_result_reg_};
+ }
} else if constexpr (arg_info.arg_type == ArgInfo::OUT_ARG) {
static_assert(!std::is_same_v<ResType, std::monostate>);
static_assert(std::is_same_v<Usage, intrinsics::bindings::Def> ||
@@ -545,7 +553,8 @@ class TryBindingBasedInlineIntrinsicForHeavyOptimizer {
CHECK(!xmm_result_reg_.IsInvalidReg());
MovToResult<RegisterClass>(builder_, result_, xmm_result_reg_);
} else if constexpr ((arg_info.arg_type == ArgInfo::OUT_ARG ||
- arg_info.arg_type == ArgInfo::IN_OUT_TMP_ARG) &&
+ arg_info.arg_type == ArgInfo::IN_OUT_TMP_ARG ||
+ arg_info.arg_type == ArgInfo::OUT_TMP_ARG) &&
RegisterClass::kIsImplicitReg) {
CHECK(!implicit_result_reg_.IsInvalidReg());
MovToResult<RegisterClass>(builder_, result_, implicit_result_reg_);
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
index a6fd3f56..3a43bc68 100644
--- a/interpreter/riscv64/interpreter.h
+++ b/interpreter/riscv64/interpreter.h
@@ -189,10 +189,6 @@ class Interpreter {
return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2))));
case Decoder::OpOpcode::kMulhu:
return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2)));
- case Decoder::OpOpcode::kRem:
- return Int64(arg1) % Int64(arg2);
- case Decoder::OpOpcode::kRemu:
- return UInt64(arg1) % UInt64(arg2);
case Decoder::OpOpcode::kAndn:
return Int64(arg1) & (~Int64(arg2));
case Decoder::OpOpcode::kOrn:
@@ -219,10 +215,6 @@ class Interpreter {
return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2));
case Decoder::Op32Opcode::kMulw:
return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2));
- case Decoder::Op32Opcode::kRemw:
- return Widen(TruncateTo<Int32>(arg1) % TruncateTo<Int32>(arg2));
- case Decoder::Op32Opcode::kRemuw:
- return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) % TruncateTo<UInt32>(arg2)));
default:
Undefined();
return {};
diff --git a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h
index cc5ce510..4e948359 100644
--- a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h
+++ b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_arith_impl.h
@@ -82,6 +82,81 @@ void MacroAssembler<Assembler>::MacroDiv(Register src) {
Bind(done);
}
+
+// Divisor comes in "src", dividend comes in gpr_a, remainder is returned in gpr_d.
+// gpr_a and FLAGS are clobbered by that macroinstruction.
+template <typename Assembler>
+template <typename IntType>
+void MacroAssembler<Assembler>::MacroRem(Register src) {
+ Label* zero = MakeLabel();
+ Label* overflow = MakeLabel();
+ Label* done = MakeLabel();
+ Test<IntType>(src, src);
+ Jcc(Condition::kZero, *zero);
+
+ if constexpr (std::is_signed_v<IntType>) {
+ Label* do_idiv = MakeLabel();
+  // If the minimum int32_t/int64_t value is divided by -1 then in RISC-V the
+  // remainder is zero, but x86 will raise an exception. Handle this case separately.
+ Cmp<IntType>(src, int8_t{-1});
+ Jcc(Condition::kNotEqual, *do_idiv);
+
+ if constexpr (std::is_same_v<IntType, int64_t>) {
+ Cmp<IntType>(gpr_a,
+ {.disp = constants_pool::kVectorConst<std::numeric_limits<IntType>::min()>});
+ } else {
+ Cmp<IntType>(gpr_a, std::numeric_limits<IntType>::min());
+ }
+ Jcc(Condition::kEqual, *overflow);
+
+ Bind(do_idiv);
+ // If we are dealing with 8-bit signed case then we need to sign-extend %al into %ax.
+ if constexpr (std::is_same_v<IntType, int8_t>) {
+ Cbw();
+ // We need to sign-extend gpr_a into gpr_d to ensure 32bit/64-bit/128-bit dividend is correct.
+ } else if constexpr (std::is_same_v<IntType, int16_t>) {
+ Cwd();
+ } else if constexpr (std::is_same_v<IntType, int32_t>) {
+ Cdq();
+ } else if constexpr (std::is_same_v<IntType, int64_t>) {
+ Cqo();
+ } else {
+ static_assert(kDependentTypeFalse<IntType>, "Unsupported format");
+ }
+ } else if constexpr (std::is_same_v<IntType, uint8_t>) {
+ // For 8bit unsigned case we need “xor %ah, %ah” instruction, but our assembler doesn't support
+ // %ah register. Use .byte to emit the required machine code.
+ TwoByte(uint16_t{0xe430});
+ } else {
+    // Zero gpr_d so that the gpr_d:gpr_a pair forms the correct 32-bit/64-bit/128-bit
+    // dividend.
+ Xor<uint64_t>(gpr_d, gpr_d);
+ }
+
+ Div<IntType>(src);
+ if constexpr (std::is_same_v<IntType, uint8_t> || std::is_same_v<IntType, int8_t>) {
+ // For 8bit case the result is in %ah, but our assembler doesn't support
+ // %ah register. move %ah to %al
+ TwoByte(uint16_t{0xe086});
+ }
+ Jmp(*done);
+
+ Bind(zero);
+ if constexpr (std::is_same_v<IntType, uint8_t> || std::is_same_v<IntType, int8_t>) {
+ Mov<int8_t>(gpr_a, src);
+ } else {
+ Mov<IntType>(gpr_d, src);
+ }
+ Jmp(*done);
+
+ Bind(overflow);
+ if constexpr (std::is_same_v<IntType, uint8_t> || std::is_same_v<IntType, int8_t>) {
+ Xor<int8_t>(gpr_a, gpr_a);
+ } else {
+ Xor<IntType>(gpr_d, gpr_d);
+ }
+ Bind(done);
+}
} // namespace berberis
#endif // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_ARITH_IMPL_H_
diff --git a/intrinsics/riscv64_to_x86_64/intrinsic_def.json b/intrinsics/riscv64_to_x86_64/intrinsic_def.json
index 831beb51..9102efeb 100644
--- a/intrinsics/riscv64_to_x86_64/intrinsic_def.json
+++ b/intrinsics/riscv64_to_x86_64/intrinsic_def.json
@@ -292,6 +292,22 @@
"in": [ "Type0", "Type0" ],
"out": [ "Type0" ]
},
+ "Rem": {
+ "comment": "Integer remainder",
+ "class": "template",
+ "variants": [
+ "int8_t",
+ "uint8_t",
+ "int16_t",
+ "uint16_t",
+ "int32_t",
+ "uint32_t",
+ "int64_t",
+ "uint64_t"
+ ],
+ "in": [ "Type0", "Type0" ],
+ "out": [ "Type0" ]
+ },
"FAdd": {
"comment": "Floating point addition",
"class": "template",
diff --git a/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json b/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json
index 1fe5eba3..e9e052b8 100644
--- a/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json
+++ b/intrinsics/riscv64_to_x86_64/machine_ir_intrinsic_binding.json
@@ -182,6 +182,54 @@
"in": [ 1, 0 ],
"out": [ 1 ]
},
+ {
+ "name": "Rem<int8_t>",
+ "insn": "RemInt8",
+ "in": [ 1, 0 ],
+ "out": [ 1 ]
+ },
+ {
+ "name": "Rem<uint8_t>",
+ "insn": "RemUInt8",
+ "in": [ 1, 0 ],
+ "out": [ 1 ]
+ },
+ {
+ "name": "Rem<int16_t>",
+ "insn": "RemInt16",
+ "in": [ 1, 0 ],
+ "out": [ 2 ]
+ },
+ {
+ "name": "Rem<uint16_t>",
+ "insn": "RemUInt16",
+ "in": [ 1, 0 ],
+ "out": [ 2 ]
+ },
+ {
+ "name": "Rem<int32_t>",
+ "insn": "RemInt32",
+ "in": [ 1, 0 ],
+ "out": [ 2 ]
+ },
+ {
+ "name": "Rem<uint32_t>",
+ "insn": "RemUInt32",
+ "in": [ 1, 0 ],
+ "out": [ 2 ]
+ },
+ {
+ "name": "Rem<int64_t>",
+ "insn": "RemInt64",
+ "in": [ 1, 0 ],
+ "out": [ 2 ]
+ },
+ {
+ "name": "Rem<uint64_t>",
+ "insn": "RemUInt64",
+ "in": [ 1, 0 ],
+ "out": [ 2 ]
+ },
{
"name": "FAddHostRounding<Float32>",
"insn": "AddssXRegXReg",
diff --git a/intrinsics/riscv64_to_x86_64/macro_def.json b/intrinsics/riscv64_to_x86_64/macro_def.json
index 45f0b0ad..1420d4aa 100644
--- a/intrinsics/riscv64_to_x86_64/macro_def.json
+++ b/intrinsics/riscv64_to_x86_64/macro_def.json
@@ -198,6 +198,92 @@
"mnemo": "MACRO_UDIV64"
},
{
+ "name": "RemInt8",
+ "args": [
+ { "class": "GeneralReg8", "usage": "use" },
+ { "class": "AX", "usage": "use_def" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<int8_t>",
+ "mnemo": "MACRO_REM8"
+ },
+ {
+ "name": "RemInt16",
+ "args": [
+ { "class": "GeneralReg16", "usage": "use" },
+ { "class": "AX", "usage": "use_def" },
+ { "class": "DX", "usage": "def_early_clobber" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<int16_t>",
+ "mnemo": "MACRO_REM16"
+ },
+ {
+ "name": "RemInt32",
+ "args": [
+ { "class": "GeneralReg32", "usage": "use" },
+ { "class": "EAX", "usage": "use_def" },
+ { "class": "EDX", "usage": "def_early_clobber" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<int32_t>",
+ "mnemo": "MACRO_REM32"
+ },
+ {
+ "name": "RemInt64",
+ "args": [
+ { "class": "GeneralReg64", "usage": "use" },
+ { "class": "EAX", "usage": "use_def" },
+ { "class": "EDX", "usage": "def_early_clobber" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<int64_t>",
+ "mnemo": "MACRO_REM64"
+ },
+ {
+ "name": "RemUInt8",
+ "args": [
+ { "class": "GeneralReg8", "usage": "use" },
+ { "class": "AX", "usage": "use_def" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<uint8_t>",
+ "mnemo": "MACRO_UREM8"
+ },
+ {
+ "name": "RemUInt16",
+ "args": [
+ { "class": "GeneralReg16", "usage": "use" },
+ { "class": "AX", "usage": "use_def" },
+ { "class": "DX", "usage": "def_early_clobber" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<uint16_t>",
+ "mnemo": "MACRO_UREM16"
+ },
+ {
+ "name": "RemUInt32",
+ "args": [
+ { "class": "GeneralReg32", "usage": "use" },
+ { "class": "EAX", "usage": "use_def" },
+ { "class": "EDX", "usage": "def_early_clobber" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<uint32_t>",
+ "mnemo": "MACRO_UREM32"
+ },
+ {
+ "name": "RemUInt64",
+ "args": [
+ { "class": "GeneralReg64", "usage": "use" },
+ { "class": "EAX", "usage": "use_def" },
+ { "class": "EDX", "usage": "def_early_clobber" },
+ { "class": "FLAGS", "usage": "def" }
+ ],
+ "asm": "MacroRem<uint64_t>",
+ "mnemo": "MACRO_UREM64"
+ },
+ {
"name": "MacroFCvtFloat32ToInt32",
"args": [
{ "class": "GeneralReg64", "usage": "def" },
diff --git a/lite_translator/riscv64_to_x86_64/inline_intrinsic.h b/lite_translator/riscv64_to_x86_64/inline_intrinsic.h
index c870a048..fa930da1 100644
--- a/lite_translator/riscv64_to_x86_64/inline_intrinsic.h
+++ b/lite_translator/riscv64_to_x86_64/inline_intrinsic.h
@@ -373,6 +373,10 @@ class TryBindingBasedInlineIntrinsic {
Mov<std::tuple_element_t<arg_info.from, typename AsmCallInfo::InputArguments>>(
as_, as_.rcx, std::get<arg_info.from>(input_args_));
return std::tuple{};
+ } else if constexpr (RegisterClass::kAsRegister == 'a') {
+ Mov<std::tuple_element_t<arg_info.from, typename AsmCallInfo::InputArguments>>(
+ as_, as_.rax, std::get<arg_info.from>(input_args_));
+ return std::tuple{};
} else {
static_assert(std::is_same_v<Usage, intrinsics::bindings::UseDef>);
static_assert(!RegisterClass::kIsImplicitReg);
@@ -415,6 +419,13 @@ class TryBindingBasedInlineIntrinsic {
return std::tuple{result_};
}
}
+ } else if constexpr (arg_info.arg_type == ArgInfo::OUT_TMP_ARG) {
+ if constexpr (RegisterClass::kAsRegister == 'd') {
+ result_reg_ = as_.rdx;
+ return std::tuple{};
+ } else {
+ static_assert(kDependentValueFalse<arg_info.arg_type>);
+ }
} else if constexpr (arg_info.arg_type == ArgInfo::TMP_ARG) {
static_assert(std::is_same_v<Usage, intrinsics::bindings::Def> ||
std::is_same_v<Usage, intrinsics::bindings::DefEarlyClobber>);
diff --git a/lite_translator/riscv64_to_x86_64/lite_translator.cc b/lite_translator/riscv64_to_x86_64/lite_translator.cc
index 1c587472..91091b65 100644
--- a/lite_translator/riscv64_to_x86_64/lite_translator.cc
+++ b/lite_translator/riscv64_to_x86_64/lite_translator.cc
@@ -105,19 +105,6 @@ Register LiteTranslator::Op(Decoder::OpOpcode opcode, Register arg1, Register ar
as_.Mulq(arg2);
as_.Movq(res, as_.rdx);
break;
- case OpOpcode::kRem:
- as_.Movq(as_.rax, arg1);
- as_.Movq(as_.rdx, as_.rax);
- as_.Sarq(as_.rdx, int8_t{63});
- as_.Idivq(arg2);
- as_.Movq(res, opcode == OpOpcode::kDiv ? as_.rax : as_.rdx);
- break;
- case OpOpcode::kRemu:
- as_.Movq(as_.rax, arg1);
- as_.Xorq(as_.rdx, as_.rdx);
- as_.Divq(arg2);
- as_.Movq(res, opcode == OpOpcode::kDivu ? as_.rax : as_.rdx);
- break;
case Decoder::OpOpcode::kAndn:
if (host_platform::kHasBMI) {
as_.Andnq(res, arg2, arg1);
@@ -179,19 +166,6 @@ Register LiteTranslator::Op32(Decoder::Op32Opcode opcode, Register arg1, Registe
as_.Imull(res, arg2);
as_.Movsxlq(res, res);
break;
- case Op32Opcode::kRemw:
- as_.Movl(as_.rax, arg1);
- as_.Movl(as_.rdx, as_.rax);
- as_.Sarl(as_.rdx, int8_t{31});
- as_.Idivl(arg2);
- as_.Movsxlq(res, as_.rdx);
- break;
- case Op32Opcode::kRemuw:
- as_.Movl(as_.rax, arg1);
- as_.Xorl(as_.rdx, as_.rdx);
- as_.Divl(arg2);
- as_.Movsxlq(res, as_.rdx);
- break;
default:
Undefined();
return {};