Implement narrowing floating point conversions.

Note: the vfncvt.rod.f.f.w instruction is not implemented. It's rarely used (we haven't observed it in the wild yet) and, more importantly, it uses round-to-odd rounding, which is not supported by the x86-64 architecture, and our intrinsics don't support it either.

Test: berberis_all
Change-Id: Ia9caa1fbb33db22a71d9f103b212da5892efee6f
author: Ahmed Mohamed Mohamed <ahmed200615200@gmail.com> 2024-03-03 13:31:33 +0000
committer: Victor Khimenko <khim@google.com> 2024-03-04 14:53:31 +0000
commit: ae7726ba95d2daa79febdabd45115c27e9b6f47e (patch)
tree: 0e18775beddca79b0ef3b1eb9afed0ba241da033
parent: e22f587ea25b76f2290c34bd770a0c00c8f03f3d (diff)
download: binary_translation-ae7726ba95d2daa79febdabd45115c27e9b6f47e.tar.gz
2 files changed, 195 insertions, 6 deletions
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
index 66ab030c..28245f54 100644
--- a/interpreter/riscv64/interpreter.h
+++ b/interpreter/riscv64/interpreter.h
@@ -1243,6 +1243,38 @@ class Interpreter {
                                      vlmul,
                                      vta,
                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtxufw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src);
+              },
+                                      UnsignedType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtxfw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src);
+              },
+                                      SignedType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtrtzxufw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src);
+              },
+                                      UnsignedType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtrtzxfw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src);
+              },
+                                      SignedType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
             default:
               break;  // Make compiler happy.
           }
@@ -1306,6 +1338,30 @@ class Interpreter {
                                      vlmul,
                                      vta,
                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtfxuw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src);
+              },
+                                      ElementType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtffw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src);
+              },
+                                      ElementType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
+            case Decoder::VFUnary0Opcode::kVfncvtfxw:
+              return OpVectorNarrowwr<[](int8_t frm, SIMD128Register src) {
+                return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src);
+              },
+                                      ElementType,
+                                      vlmul,
+                                      vta,
+                                      vma>(args.dst, args.src1);
             default:
               break;  // Make compiler happy.
           }
@@ -2712,6 +2768,60 @@ class Interpreter {
     }
   }
 
+  template <auto Intrinsic,
+            typename TargetElementType,
+            VectorRegisterGroupMultiplier vlmul,
+            TailProcessing vta,
+            auto vma>
+  void OpVectorNarrowwr(uint8_t dst, uint8_t src) {
+    return OpVectorNarrowwr<Intrinsic,
+                            TargetElementType,
+                            NumberOfRegistersInvolved(vlmul),
+                            NumRegistersInvolvedForWideOperand(vlmul),
+                            vta,
+                            vma>(dst, src);
+  }
+
+  template <auto Intrinsic,
+            typename TargetElementType,
+            size_t kDestRegistersInvolved,
+            size_t kSrcRegistersInvolved,
+            TailProcessing vta,
+            auto vma>
+  void OpVectorNarrowwr(uint8_t dst, uint8_t src) {
+    if constexpr (kDestRegistersInvolved == kSrcRegistersInvolved) {
+      if (!IsAligned<kDestRegistersInvolved>(dst | src)) {
+        return Unimplemented();
+      }
+    } else if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSrcRegistersInvolved>(src)) {
+      return Unimplemented();
+    }
+    size_t vstart = GetCsr<CsrName::kVstart>();
+    size_t vl = GetCsr<CsrName::kVl>();
+    SetCsr<CsrName::kVstart>(0);
+    // When vstart >= vl, there are no body elements, and no elements are updated in any destination
+    // vector register group, including that no tail elements are updated with agnostic values.
+    if (vstart >= vl) [[unlikely]] {
+      return;
+    }
+    int8_t frm = GetCsr<CsrName::kFrm>();
+    auto mask = GetMaskForVectorOperations<vma>();
+    for (size_t index = 0; index < kDestRegistersInvolved; index++) {
+      SIMD128Register orig_result(state_->cpu.v[dst + index]);
+      SIMD128Register arg_low(state_->cpu.v[src + 2 * index]);
+      SIMD128Register intrinsic_result = std::get<0>(Intrinsic(frm, arg_low));
+      if constexpr (kSrcRegistersInvolved > 1) {
+        SIMD128Register arg_high(state_->cpu.v[src + 2 * index + 1]);
+        SIMD128Register result_high = std::get<0>(Intrinsic(frm, arg_high));
+        intrinsic_result = std::get<0>(
+            intrinsics::VMergeBottomHalfToTop<TargetElementType>(intrinsic_result, result_high));
+      }
+      auto result = VectorMasking<TargetElementType, vta, vma>(
+          orig_result, intrinsic_result, vstart, vl, index, mask);
+      state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
+    }
+  }
+
   // SEW = 2*SEW op SEW
   template <auto Intrinsic,
             typename ElementType,
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index ec753c17..ce991806 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -977,10 +977,25 @@ class Riscv64InterpreterTest : public ::testing::Test {
         expected_result_int64);
   }
 
+  void TestNarrowingVectorFloatInstruction(uint32_t insn_bytes,
+                                           const uint32_t (&expected_result_int32)[4][4],
+                                           const __v2du (&source)[16]) {
+    TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kNarrowing>(
+        insn_bytes, source, expected_result_int32);
+  }
+
+  void TestNarrowingVectorFloatInstruction(uint32_t insn_bytes,
+                                           const uint16_t (&expected_result_int16)[4][8],
+                                           const uint32_t (&expected_result_int32)[4][4],
+                                           const __v2du (&source)[16]) {
+    TestVectorInstruction<TestVectorInstructionKind::kFloat, TestVectorInstructionMode::kNarrowing>(
+        insn_bytes, source, expected_result_int16, expected_result_int32);
+  }
+
   void TestNarrowingVectorInstruction(uint32_t insn_bytes,
-                                      const uint8_t (&expected_result_int8)[8][16],
-                                      const uint16_t (&expected_result_int16)[8][8],
-                                      const uint32_t (&expected_result_int32)[8][4],
+                                      const uint8_t (&expected_result_int8)[4][16],
+                                      const uint16_t (&expected_result_int16)[4][8],
+                                      const uint32_t (&expected_result_int32)[4][4],
                                       const __v2du (&source)[16]) {
     TestVectorInstruction<TestVectorInstructionKind::kInteger,
                           TestVectorInstructionMode::kNarrowing>(
@@ -1018,10 +1033,12 @@ class Riscv64InterpreterTest : public ::testing::Test {
   template <TestVectorInstructionKind kTestVectorInstructionKind,
             TestVectorInstructionMode kTestVectorInstructionMode,
             typename... ElementType,
+            size_t... kResultsCount,
             size_t... kElementCount>
-  void TestVectorInstruction(uint32_t insn_bytes,
-                             const __v2du (&source)[16],
-                             const ElementType (&... expected_result)[8][kElementCount]) {
+  void TestVectorInstruction(
+      uint32_t insn_bytes,
+      const __v2du (&source)[16],
+      const ElementType (&... expected_result)[kResultsCount][kElementCount]) {
     auto Verify = [this, &source](uint32_t insn_bytes,
                                   uint8_t vsew,
                                   uint8_t vlmul_max,
@@ -2115,6 +2132,68 @@ TEST_F(Riscv64InterpreterTest, TestVfcvtxfv) {
                                       {0x8000'0000'0000'0000, 0x8000'0000'0000'0000},
                                       {0x8000'0000'0000'0000, 0x8000'0000'0000'0000}},
                                      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(
+      0x49881457,  // Vfncvt.xu.f.w v8, v24, v0.t
+      {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0xffff, 0xffff, 0x6a21, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+      {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}},
+      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(
+      0x49889457,  // Vfncvt.x.f.w v8, v24, v0.t
+      {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x8000, 0x8000, 0xcacf, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x7fff, 0x7fff, 0x6a21, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff}},
+      {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff}},
+      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(0x498a1457,  // Vfncvt.f.f.w v8, v24, v0.t
+                                      {{0x8000'0000, 0x8000'0000, 0xb165'd14e, 0x8000'0000},
+                                       {0xff80'0000, 0xff80'0000, 0xff80'0000, 0xff80'0000},
+                                       {0x0000'0000, 0x0000'0000, 0x3561'd54a, 0x0000'0000},
+                                       {0x7f80'0000, 0x7f80'0000, 0x7f80'0000, 0x7f80'0000}},
+                                      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(0x49891457,  // Vfncvt.f.xu.w v8, v24, v0.t
+                                      {{0x5f1e'0c9a, 0x5f0e'1c8a, 0x5f3e'2cba, 0x5f2e'3caa},
+                                       {0x5f5e'4cda, 0x5f4e'5cca, 0x5f7e'6cfa, 0x5f6e'7cea},
+                                       {0x5df4'60d4, 0x5d69'c0aa, 0x5e7a'b0eb, 0x5e3a'f0ab},
+                                       {0x5ebd'98b6, 0x5e9d'b896, 0x5efd'd8f6, 0x5edd'f8d6}},
+                                      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(0x49899457,  // Vfncvt.f.x.w v8, v24, v0.t
+                                      {{0xdec3'e6cc, 0xdee3'c6ec, 0xde83'a68c, 0xdea3'86ac},
+                                       {0xde06'cc97, 0xde46'8cd7, 0xdbc9'82cb, 0xdd8c'18ac},
+                                       {0x5df4'60d4, 0x5d69'c0aa, 0x5e7a'b0eb, 0x5e3a'f0ab},
+                                       {0x5ebd'98b6, 0x5e9d'b896, 0x5efd'd8f6, 0x5edd'f8d6}},
+                                      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(
+      0x498b1457,  // Vfncvt.rtz.xu.f.w v8, v24, v0.t
+      {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0xffff, 0xffff, 0x6a21, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+      {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff}},
+      kVectorCalculationsSource);
+  TestNarrowingVectorFloatInstruction(
+      0x498b9457,  // Vfncvt.rtz.x.f.w v8, v24, v0.t
+      {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x8000, 0x8000, 0xcad0, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+       {0x7fff, 0x7fff, 0x6a21, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff}},
+      {{0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0x0000'0000, 0x0000'0000, 0x0000'0000, 0x0000'0000},
+       {0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff, 0x7fff'ffff}},
+      kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVfmvfs) {
author	Ahmed Mohamed Mohamed <ahmed200615200@gmail.com>	2024-03-03 13:31:33 +0000
committer	Victor Khimenko <khim@google.com>	2024-03-04 14:53:31 +0000
commit	ae7726ba95d2daa79febdabd45115c27e9b6f47e (patch)
tree	0e18775beddca79b0ef3b1eb9afed0ba241da033
parent	e22f587ea25b76f2290c34bd770a0c00c8f03f3d (diff)
download	binary_translation-ae7726ba95d2daa79febdabd45115c27e9b6f47e.tar.gz