Snap for 11812660 from 59addd2f7e65b35643dbe541cbd7a20d0b5e90df to sdk-release

Change-Id: I66aac65e8d03033faad94ce229a165b5195108ff
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2024-05-07 23:15:48 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2024-05-07 23:15:48 +0000
commit: c7caefd0df41378801e48aaa3cda1270c312762a (patch)
tree: 306c083dbb84bd7ee8529aaa3d2aad2c5b6c9dc2
parent: 910c1c3005863b02f71278fb61028acd8f399e51 (diff)
parent: 59addd2f7e65b35643dbe541cbd7a20d0b5e90df (diff)
download: binary_translation-c7caefd0df41378801e48aaa3cda1270c312762a.tar.gz
10 files changed, 1347 insertions, 267 deletions
diff --git a/assembler/include/berberis/assembler/common_x86.h b/assembler/include/berberis/assembler/common_x86.h
index 86453678..c67ce55a 100644
--- a/assembler/include/berberis/assembler/common_x86.h
+++ b/assembler/include/berberis/assembler/common_x86.h
@@ -785,13 +785,9 @@ inline void AssemblerX86<Assembler>::Xchgl(Register dest, Register src) {
     Register other = Assembler::IsAccumulator(src) ? dest : src;
     EmitInstruction<Opcodes<0x90>>(Register32Bit(other));
   } else {
-    // Clang 8 (after r330298) swaps these two arguments.  We are comparing output
+    // Clang 8 (after r330298) puts dest before src.  We are comparing output
     // to clang in exhaustive test thus we want to match clang behavior exactly.
-#if __clang_major__ >= 8
     EmitInstruction<Opcodes<0x87>>(Register32Bit(dest), Register32Bit(src));
-#else
-    EmitInstruction<Opcodes<0x87>>(Register32Bit(src), Register32Bit(dest));
-#endif
   }
 }
 
diff --git a/assembler/include/berberis/assembler/x86_32.h b/assembler/include/berberis/assembler/x86_32.h
index 40e87a2f..cde5c682 100644
--- a/assembler/include/berberis/assembler/x86_32.h
+++ b/assembler/include/berberis/assembler/x86_32.h
@@ -183,7 +183,7 @@ class Assembler : public AssemblerX86<Assembler> {
 
   // Make sure only type void* can be passed to function below, not Label* or any other type.
   template <typename T>
-  auto Jmp(Condition cc, T* target) -> void = delete;
+  auto Jmp(T* target) -> void = delete;
 
   void Jmp(const void* target) {
     Emit8(0xe9);
diff --git a/assembler/include/berberis/assembler/x86_64.h b/assembler/include/berberis/assembler/x86_64.h
index ba343f86..c66cc1c7 100644
--- a/assembler/include/berberis/assembler/x86_64.h
+++ b/assembler/include/berberis/assembler/x86_64.h
@@ -179,7 +179,7 @@ class Assembler : public AssemblerX86<Assembler> {
 
   // Make sure only type void* can be passed to function below, not Label* or any other type.
   template <typename T>
-  auto Jmp(Condition cc, T* target) -> void = delete;
+  auto Jmp(T* target) -> void = delete;
 
   void Jmp(const void* target) {
     // There are no jump instruction with properties we need thus we emulate it.
@@ -533,22 +533,15 @@ inline void Assembler::Xchgq(Register dest, Register src) {
   // We compare output to that from clang and thus want to produce the same code.
   // 0x48 0x90 is suboptimal encoding for that operation (pure 0x90 does the same
   // and this is what gcc + gas are producing), but this is what clang <= 8 does.
-#if __clang_major__ >= 8
   if (IsAccumulator(src) && IsAccumulator(dest)) {
     Emit8(0x90);
-  } else
-#endif
-  if (IsAccumulator(src) || IsAccumulator(dest)) {
+  } else if (IsAccumulator(src) || IsAccumulator(dest)) {
     Register other = IsAccumulator(src) ? dest : src;
     EmitInstruction<Opcodes<0x90>>(Register64Bit(other));
   } else {
-  // Clang 8 (after r330298) swaps these two arguments.  We are comparing output
+  // Clang 8 (after r330298) puts dest before src.  We are comparing output
   // to clang in exhaustive test thus we want to match clang behavior exactly.
-#if __clang_major__ >= 8
     EmitInstruction<Opcodes<0x87>>(Register64Bit(dest), Register64Bit(src));
-#else
-    EmitInstruction<Opcodes<0x87>>(Register64Bit(src), Register64Bit(dest));
-#endif
   }
 }
 
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
index 0c7bd140..661e4d92 100644
--- a/interpreter/riscv64/interpreter.h
+++ b/interpreter/riscv64/interpreter.h
@@ -469,7 +469,7 @@ class Interpreter {
 
   template <typename ElementType, VectorRegisterGroupMultiplier vlmul>
   static constexpr size_t GetVlmax() {
-    constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType));
+    constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
     switch (vlmul) {
       case VectorRegisterGroupMultiplier::k1register:
         return kElementsCount;
@@ -923,8 +923,7 @@ class Interpreter {
     if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
       return Undefined();
     }
-    constexpr size_t kElementsCount =
-        static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType));
+    constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
     alignas(alignof(SIMD128Register))
         IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
     memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
@@ -1040,10 +1039,10 @@ class Interpreter {
     if (!IsAligned<kNumRegistersInGroup>(dst)) {
       return Undefined();
     }
-    if (dst + kNumRegistersInGroup * kSegmentSize >= 32) {
+    if (dst + kNumRegistersInGroup * kSegmentSize > 32) {
       return Undefined();
     }
-    constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType));
+    constexpr size_t kElementsCount = 16 / sizeof(ElementType);
     size_t vstart = GetCsr<CsrName::kVstart>();
     size_t vl = GetCsr<CsrName::kVl>();
     if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) {
@@ -1211,7 +1210,7 @@ class Interpreter {
             auto vma,
             typename GetElementIndexLambdaType>
   void OpVectorGather(uint8_t dst, uint8_t src1, GetElementIndexLambdaType GetElementIndex) {
-    constexpr int kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
+    constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
     if (!IsAligned<kRegistersInvolved>(dst | src1)) {
       return Undefined();
     }
@@ -1219,7 +1218,7 @@ class Interpreter {
     if (dst < (src1 + kRegistersInvolved) && src1 < (dst + kRegistersInvolved)) {
       return Undefined();
     }
-    constexpr int kElementsCount = static_cast<int>(16 / sizeof(ElementType));
+    constexpr size_t kElementsCount = 16 / sizeof(ElementType);
     constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
 
     size_t vstart = GetCsr<CsrName::kVstart>();
@@ -1323,6 +1322,10 @@ class Interpreter {
       case Decoder::VOpFVfOpcode::kVfsgnjxvf:
         return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
             args.dst, args.src1, arg2);
+      case Decoder::VOpFVfOpcode::kVfslide1upvf:
+        return OpVectorslide1up<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
+      case Decoder::VOpFVfOpcode::kVfslide1downvf:
+        return OpVectorslide1down<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
       case Decoder::VOpFVfOpcode::kVfmvsf:
         if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
           return Undefined();
@@ -1640,14 +1643,14 @@ class Interpreter {
                               vlmul,
                               vta,
                               vma,
-                              kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2});
+                              kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
           } else {
             return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
                               ElementType,
                               vlmul,
                               vta,
                               vma,
-                              kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2});
+                              kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
           }
         case Decoder::VOpFVvOpcode::kVfredosumvs:
           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
@@ -1658,14 +1661,14 @@ class Interpreter {
                               vlmul,
                               vta,
                               vma,
-                              kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2});
+                              kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
           } else {
             return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
                               ElementType,
                               vlmul,
                               vta,
                               vma,
-                              kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2});
+                              kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
           }
         case Decoder::VOpFVvOpcode::kVfminvv:
           return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
@@ -1674,10 +1677,10 @@ class Interpreter {
           // For Vfredmin the identity element is +inf.
           return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
               args.dst,
-              args.src1,
               Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
                                                                         : 0x7ff0'0000'0000'0000}>{
-                  args.src2});
+                  args.src1},
+              args.src2);
         case Decoder::VOpFVvOpcode::kVfmaxvv:
           return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
               args.dst, args.src1, args.src2);
@@ -1685,10 +1688,10 @@ class Interpreter {
           // For Vfredmax the identity element is -inf.
           return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
               args.dst,
-              args.src1,
               Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
                                                                         : 0xfff0'0000'0000'0000}>{
-                  args.src2});
+                  args.src1},
+              args.src2);
         case Decoder::VOpFVvOpcode::kVfsgnjvv:
           return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
               args.dst, args.src1, args.src2);
@@ -2091,6 +2094,20 @@ class Interpreter {
       case Decoder::VOpIVvOpcode::kVnsrlwv:
         return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
             args.dst, args.src1, args.src2);
+      case Decoder::VOpIVvOpcode::kVnclipuwv:
+        return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingUnsignedType>,
+                                SaturatingUnsignedType,
+                                vlmul,
+                                vta,
+                                vma,
+                                kVxrm>(args.dst, args.src1, args.src2);
+      case Decoder::VOpIVvOpcode::kVnclipwv:
+        return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingSignedType>,
+                                SaturatingSignedType,
+                                vlmul,
+                                vta,
+                                vma,
+                                kVxrm>(args.dst, args.src1, args.src2);
       default:
         Undefined();
     }
@@ -2224,6 +2241,20 @@ class Interpreter {
       case Decoder::VOpIVxOpcode::kVslidedownvx:
         return OpVectorslidedown<ElementType, vlmul, vta, vma>(
             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+      case Decoder::VOpIVxOpcode::kVnclipuwx:
+        return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
+                                SaturatingUnsignedType,
+                                vlmul,
+                                vta,
+                                vma,
+                                kVxrm>(args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
+      case Decoder::VOpIVxOpcode::kVnclipwx:
+        return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
+                                SaturatingSignedType,
+                                vlmul,
+                                vta,
+                                vma,
+                                kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
       default:
         Undefined();
     }
@@ -2267,35 +2298,35 @@ class Interpreter {
     switch (args.opcode) {
       case Decoder::VOpMVvOpcode::kVredsumvs:
         return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, Vec<ElementType{}>{args.src2});
+            args.dst, Vec<ElementType{}>{args.src1}, args.src2);
       case Decoder::VOpMVvOpcode::kVredandvs:
         return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, Vec<~ElementType{}>{args.src2});
+            args.dst, Vec<~ElementType{}>{args.src1}, args.src2);
       case Decoder::VOpMVvOpcode::kVredorvs:
         return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, Vec<ElementType{}>{args.src2});
+            args.dst, Vec<ElementType{}>{args.src1}, args.src2);
       case Decoder::VOpMVvOpcode::kVredxorvs:
         return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, Vec<ElementType{}>{args.src2});
+            args.dst, Vec<ElementType{}>{args.src1}, args.src2);
       case Decoder::VOpMVvOpcode::kVredminuvs:
         return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
             args.dst,
-            args.src1,
-            Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>(
-                args.src2));
+            Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>{
+                args.src1},
+            args.src2);
       case Decoder::VOpMVvOpcode::kVredminvs:
         return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>(
             args.dst,
-            args.src1,
-            Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src2});
+            Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src1},
+            args.src2);
       case Decoder::VOpMVvOpcode::kVredmaxuvs:
         return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
-            args.dst, args.src1, Vec<UnsignedType{}>{args.src2});
+            args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
       case Decoder::VOpMVvOpcode::kVredmaxvs:
         return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>(
             args.dst,
-            args.src1,
-            Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src2});
+            Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src1},
+            args.src2);
       case Decoder::VOpMVvOpcode::kVaadduvv:
         return OpVectorvv<intrinsics::Vaaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
             args.dst, args.src1, args.src2);
@@ -2635,8 +2666,7 @@ class Interpreter {
     if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
       return Undefined();
     }
-    constexpr size_t kElementsCount =
-        static_cast<int>(sizeof(SIMD128Register) / sizeof(IndexElementType));
+    constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
     alignas(alignof(SIMD128Register))
         IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
     memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
@@ -2704,7 +2734,7 @@ class Interpreter {
     if (data + kNumRegistersInGroup * kSegmentSize > 32) {
       return Undefined();
     }
-    constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType));
+    constexpr size_t kElementsCount = 16 / sizeof(ElementType);
     size_t vstart = GetCsr<CsrName::kVstart>();
     size_t vl = GetCsr<CsrName::kVl>();
     if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) {
@@ -2959,7 +2989,7 @@ class Interpreter {
     if (!IsAligned<kRegistersInvolved>(dst | src)) {
       return Undefined();
     }
-    constexpr size_t kElementsCount = static_cast<int>(16 / sizeof(ElementType));
+    constexpr size_t kElementsCount = 16 / sizeof(ElementType);
     size_t vstart = GetCsr<CsrName::kVstart>();
     SetCsr<CsrName::kVstart>(0);
     // The usual property that no elements are written if vstart >= vl does not apply to these
@@ -3124,7 +3154,7 @@ class Interpreter {
             auto vma,
             CsrName... kExtraCsrs,
             auto kDefaultElement>
-  void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) {
+  void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
     return OpVectorvs<Intrinsic,
                       ElementType,
                       NumberOfRegistersInvolved(vlmul),
@@ -3140,8 +3170,8 @@ class Interpreter {
             auto vma,
             CsrName... kExtraCsrs,
             auto kDefaultElement>
-  void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) {
-    if (!IsAligned<kRegistersInvolved>(dst | src2.start_no)) {
+  void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
+    if (!IsAligned<kRegistersInvolved>(dst | src1.start_no)) {
       return Undefined();
     }
     size_t vstart = GetCsr<CsrName::kVstart>();
@@ -3155,15 +3185,15 @@ class Interpreter {
       return;
     }
     auto mask = GetMaskForVectorOperations<vma>();
-    ElementType arg1 = SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0);
+    ElementType init = SIMD128Register{state_->cpu.v[src2]}.Get<ElementType>(0);
     for (size_t index = 0; index < kRegistersInvolved; ++index) {
-      arg1 = std::get<0>(
+      init = std::get<0>(
           Intrinsic(GetCsr<kExtraCsrs>()...,
-                    arg1,
-                    GetVectorArgument<ElementType, vta, vma>(src2, vstart, vl, index, mask)));
+                    init,
+                    GetVectorArgument<ElementType, vta, vma>(src1, vstart, vl, index, mask)));
     }
     SIMD128Register result{state_->cpu.v[dst]};
-    result.Set(arg1, 0);
+    result.Set(init, 0);
     result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1));
     state_->cpu.v[dst] = result.Get<__uint128_t>();
   }
@@ -3593,8 +3623,8 @@ class Interpreter {
     if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
       return Undefined();
     }
-    int vstart = GetCsr<CsrName::kVstart>();
-    int vl = GetCsr<CsrName::kVl>();
+    size_t vstart = GetCsr<CsrName::kVstart>();
+    size_t vl = GetCsr<CsrName::kVl>();
     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
     // vector register group, including that no tail elements are updated with agnostic values.
     if (vstart >= vl) [[unlikely]] {
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index 2feb230d..1cbdfcb2 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -1224,7 +1224,7 @@ class Riscv64InterpreterTest : public ::testing::Test {
         // instructions that work with double width floats.
         // These instructions never use float registers though and thus we don't need to store
         // anything into f1 register, if they are used.
-        // For Float32/Float64 case we load 1.0 of the appropriate type into f1.
+        // For the Float32/Float64 case we load 5.625 of the appropriate type into f1.
         ASSERT_LE(vsew, 3);
         if (vsew == 2) {
           SetFReg<1>(state_.cpu, 0xffff'ffff'40b4'0000);  // float 5.625
@@ -1341,19 +1341,7 @@ class Riscv64InterpreterTest : public ::testing::Test {
                  (kTestVectorInstructionMode == TestVectorInstructionMode::kWidening),
              8,
              expected_result,
-             [] {
-               if constexpr (sizeof(ElementType) == sizeof(Int8)) {
-                 return kMaskInt8;
-               } else if constexpr (sizeof(ElementType) == sizeof(Int16)) {
-                 return kMaskInt16;
-               } else if constexpr (sizeof(ElementType) == sizeof(Int32)) {
-                 return kMaskInt32;
-               } else if constexpr (sizeof(ElementType) == sizeof(Int64)) {
-                 return kMaskInt64;
-               } else {
-                 static_assert(kDependentTypeFalse<ElementType>);
-               }
-             }()),
+             MaskForElem<ElementType>()),
       Verify((insn_bytes &
               ~(0x01f00000 * (kTestVectorInstructionMode == TestVectorInstructionMode::kVMerge))) |
                  (1 << 25),
@@ -1528,11 +1516,9 @@ class Riscv64InterpreterTest : public ::testing::Test {
     }
   }
 
-  template <bool kIsMasked, typename ElementType>
+  template <typename ElementType>
   auto MaskForElem() {
-    if constexpr (!kIsMasked) {
-      return kNoMask;
-    } else if constexpr (std::is_same_v<ElementType, uint8_t>) {
+    if constexpr (std::is_same_v<ElementType, uint8_t>) {
       return kMaskInt8;
     } else if constexpr (std::is_same_v<ElementType, uint16_t>) {
       return kMaskInt16;
@@ -1545,6 +1531,15 @@ class Riscv64InterpreterTest : public ::testing::Test {
     }
   }
 
+  template <bool kIsMasked, typename ElementType>
+  auto MaskForElemIfMasked() {
+    if constexpr (!kIsMasked) {
+      return kNoMask;
+    } else {
+      return MaskForElem<ElementType>();
+    }
+  }
+
   template <bool kIsMasked>
   void TestVectorIota(uint32_t insn_bytes,
                       const uint8_t (&expected_result_int8)[8][16],
@@ -1662,7 +1657,7 @@ class Riscv64InterpreterTest : public ::testing::Test {
     (Verify(insn_bytes,
             BitUtilLog2(sizeof(ElementType)),
             expected_result,
-            MaskForElem<kIsMasked, ElementType>()),
+            MaskForElemIfMasked<kIsMasked, ElementType>()),
      ...);
   }
 
@@ -1931,6 +1926,49 @@ class Riscv64InterpreterTest : public ::testing::Test {
     }
   }
 
+  void TestVectorFloatPermutationInstruction(uint32_t insn_bytes,
+                                             const uint32_t (&expected_result_int32)[8][4],
+                                             const uint64_t (&expected_result_int64)[8][2],
+                                             const __v2du (&source)[16],
+                                             uint8_t vlmul,
+                                             uint64_t skip = 0,
+                                             bool ignore_vma_for_last = false,
+                                             bool last_elem_is_f1 = false) {
+    TestVectorPermutationInstruction<TestVectorInstructionKind::kFloat>(insn_bytes,
+                                                                        source,
+                                                                        vlmul,
+                                                                        skip,
+                                                                        ignore_vma_for_last,
+                                                                        last_elem_is_f1,
+                                                                        /* regx1 */ 0x0,
+                                                                        expected_result_int32,
+                                                                        expected_result_int64);
+  }
+
+  void TestVectorPermutationInstruction(uint32_t insn_bytes,
+                                        const uint8_t (&expected_result_int8)[8][16],
+                                        const uint16_t (&expected_result_int16)[8][8],
+                                        const uint32_t (&expected_result_int32)[8][4],
+                                        const uint64_t (&expected_result_int64)[8][2],
+                                        const __v2du (&source)[16],
+                                        uint8_t vlmul,
+                                        uint64_t regx1 = 0x0,
+                                        uint64_t skip = 0,
+                                        bool ignore_vma_for_last = false,
+                                        bool last_elem_is_x1 = false) {
+    TestVectorPermutationInstruction<TestVectorInstructionKind::kInteger>(insn_bytes,
+                                                                          source,
+                                                                          vlmul,
+                                                                          skip,
+                                                                          ignore_vma_for_last,
+                                                                          last_elem_is_x1,
+                                                                          regx1,
+                                                                          expected_result_int8,
+                                                                          expected_result_int16,
+                                                                          expected_result_int32,
+                                                                          expected_result_int64);
+  }
+
   // Unlike regular arithmetic instructions, the result of a permutation
   // instruction depends also on vlmul.  Also, the vslideup specs mention that
   // the destination vector remains unchanged the first |offset| elements (in
@@ -1940,21 +1978,23 @@ class Riscv64InterpreterTest : public ::testing::Test {
   //
   // If |ignore_vma_for_last| is true, an inactive element at vl-1 will be
   // treated as if vma=0 (Undisturbed).
-  // If |last_elem_is_x1| is true, the last element of the vector in
+  // If |last_elem_is_reg1| is true, the last element of the vector in
   // expected_result (that is, at vl-1) will be expected to be the same as
   // |regx1| when VL < VMAX and said element is active.
-  void TestVectorPermutationInstruction(uint32_t insn_bytes,
-                                        const __v16qu (&expected_result_int8)[8],
-                                        const __v8hu (&expected_result_int16)[8],
-                                        const __v4su (&expected_result_int32)[8],
-                                        const __v2du (&expected_result_int64)[8],
-                                        const __v2du (&source)[16],
-                                        uint8_t vlmul,
-                                        uint64_t regx1 = 0x0,
-                                        uint64_t skip = 0,
-                                        bool ignore_vma_for_last = false,
-                                        bool last_elem_is_x1 = false) {
-    auto Verify = [this, &source, vlmul, regx1, skip, ignore_vma_for_last, last_elem_is_x1](
+  template <TestVectorInstructionKind kTestVectorInstructionKind,
+            typename... ElementType,
+            size_t... kResultsCount,
+            size_t... kElementCount>
+  void TestVectorPermutationInstruction(
+      uint32_t insn_bytes,
+      const __v2du (&source)[16],
+      uint8_t vlmul,
+      uint64_t skip,
+      bool ignore_vma_for_last,
+      bool last_elem_is_reg1,
+      uint64_t regx1,
+      const ElementType (&... expected_result)[kResultsCount][kElementCount]) {
+    auto Verify = [this, &source, vlmul, regx1, skip, ignore_vma_for_last, last_elem_is_reg1](
                       uint32_t insn_bytes,
                       uint8_t vsew,
                       const auto& expected_result_raw,
@@ -1965,8 +2005,24 @@ class Riscv64InterpreterTest : public ::testing::Test {
       for (size_t index = 0; index < std::size(source); ++index) {
         state_.cpu.v[16 + index] = SIMD128Register{source[index]}.Get<__uint128_t>();
       }
-      // Set x1 for vx instructions.
-      SetXReg<1>(state_.cpu, regx1);
+
+      if constexpr (kTestVectorInstructionKind == TestVectorInstructionKind::kFloat) {
+        UNUSED(regx1);
+        // We only support Float32/Float64 for float instructions, but there are conversion
+        // instructions that work with double width floats.
+        // These instructions never use float registers though and thus we don't need to store
+        // anything into f1 register, if they are used.
+        // For the Float32/Float64 case we load 5.625 of the appropriate type into f1.
+        ASSERT_LE(vsew, 3);
+        if (vsew == 2) {
+          SetFReg<1>(state_.cpu, 0xffff'ffff'40b4'0000);  // float 5.625
+        } else if (vsew == 3) {
+          SetFReg<1>(state_.cpu, 0x4016'8000'0000'0000);  // double 5.625
+        }
+      } else {
+        // Set x1 for vx instructions.
+        SetXReg<1>(state_.cpu, regx1);
+      }
 
       const size_t kElementSize = 1 << vsew;
       size_t num_regs = 1 << vlmul;
@@ -2037,7 +2093,7 @@ class Riscv64InterpreterTest : public ::testing::Test {
             expected_result[index] = SIMD128Register{expected_result_raw[index]};
           }
 
-          if (vlmul == 2 && last_elem_is_x1) {
+          if (vlmul == 2 && last_elem_is_reg1) {
             switch (kElementSize) {
               case 1:
                 expected_result[last_reg].template Set<uint8_t>(
@@ -2048,12 +2104,22 @@ class Riscv64InterpreterTest : public ::testing::Test {
                     static_cast<uint16_t>(GetXReg<1>(state_.cpu)), last_elem);
                 break;
               case 4:
-                expected_result[last_reg].template Set<uint32_t>(
-                    static_cast<uint32_t>(GetXReg<1>(state_.cpu)), last_elem);
+                if constexpr (kTestVectorInstructionKind == TestVectorInstructionKind::kFloat) {
+                  expected_result[last_reg].template Set<uint32_t>(
+                      static_cast<uint32_t>(GetFReg<1>(state_.cpu)), last_elem);
+                } else {
+                  expected_result[last_reg].template Set<uint32_t>(
+                      static_cast<uint32_t>(GetXReg<1>(state_.cpu)), last_elem);
+                }
                 break;
               case 8:
-                expected_result[last_reg].template Set<uint64_t>(
-                    static_cast<uint64_t>(GetXReg<1>(state_.cpu)), last_elem);
+                if constexpr (kTestVectorInstructionKind == TestVectorInstructionKind::kFloat) {
+                  expected_result[last_reg].template Set<uint64_t>(
+                      static_cast<uint64_t>(GetFReg<1>(state_.cpu)), last_elem);
+                } else {
+                  expected_result[last_reg].template Set<uint64_t>(
+                      static_cast<uint64_t>(GetXReg<1>(state_.cpu)), last_elem);
+                }
                 break;
               default:
                 FAIL() << "Element size is " << kElementSize;
@@ -2114,16 +2180,12 @@ class Riscv64InterpreterTest : public ::testing::Test {
       }
     };
 
-    // Some instructions don't support use of mask register, but in these instructions bit
-    // #25 is set.  Test it and skip masking tests if so.
-    Verify(insn_bytes, 0, expected_result_int8, kMaskInt8);
-    Verify(insn_bytes, 1, expected_result_int16, kMaskInt16);
-    Verify(insn_bytes, 2, expected_result_int32, kMaskInt32);
-    Verify(insn_bytes, 3, expected_result_int64, kMaskInt64);
-    Verify(insn_bytes | (1 << 25), 0, expected_result_int8, kNoMask);
-    Verify(insn_bytes | (1 << 25), 1, expected_result_int16, kNoMask);
-    Verify(insn_bytes | (1 << 25), 2, expected_result_int32, kNoMask);
-    Verify(insn_bytes | (1 << 25), 3, expected_result_int64, kNoMask);
+    // Test with and without masking enabled.
+    (Verify(
+         insn_bytes, BitUtilLog2(sizeof(ElementType)), expected_result, MaskForElem<ElementType>()),
+     ...);
+    (Verify(insn_bytes | (1 << 25), BitUtilLog2(sizeof(ElementType)), expected_result, kNoMask),
+     ...);
   }
 
  protected:
@@ -2662,6 +2724,69 @@ TEST_F(Riscv64InterpreterTest, TestRNU) {
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}},
       kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(0xb900c457,  // Vnclipu.wx v8, v16, x1, v0.t
+                                 {{32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40},
+                                  {40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48},
+                                  {48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56},
+                                  {56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64}},
+                                 {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+                                 {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7},
+                                  {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf},
+                                  {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7},
+                                  {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}},
+                                 kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd00c457,  // Vnclip.wx v8, v16, x1, v0.t
+      {{224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232},
+       {232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 240},
+       {240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248},
+       {248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 0}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}},
+      {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7},
+       {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf},
+       {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7},
+       {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xb90c0457,  // Vnclipu.wv v8, v16, v24, v0.t
+      {{255, 255, 255, 255, 69, 35, 9, 2, 255, 255, 255, 255, 153, 39, 10, 2},
+       {255, 255, 255, 255, 85, 43, 11, 3, 255, 255, 255, 255, 185, 47, 12, 3},
+       {255, 255, 255, 255, 101, 51, 13, 3, 255, 255, 255, 255, 217, 55, 14, 3},
+       {255, 255, 255, 255, 117, 59, 15, 4, 255, 255, 255, 255, 249, 63, 16, 4}},
+      {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x000a},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000c},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000e},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x0010}},
+      {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xa726'a525, 0x0057'9757, 0x0000'5b9b, 0x0000'00bf},
+       {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xe766'e565, 0x0077'b777, 0x0000'7bbb, 0x0000'00ff}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd0c0457,  // Vnclip.wv v8, v16, v24, v0.t
+      {{128, 128, 128, 128, 197, 227, 249, 254, 128, 128, 128, 128, 153, 231, 250, 254},
+       {128, 128, 128, 128, 213, 235, 251, 255, 128, 128, 128, 128, 185, 239, 252, 255},
+       {128, 128, 128, 128, 229, 243, 253, 255, 128, 128, 128, 128, 217, 247, 254, 255},
+       {128, 128, 128, 158, 245, 251, 255, 0, 128, 128, 128, 222, 249, 255, 0, 0}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfffa},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffc},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffe},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0x0000}},
+      {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xa726'a525, 0xffd7'9757, 0xffff'db9b, 0xffff'ffbf},
+       {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xe766'e565, 0xfff7'b777, 0xffff'fbbb, 0xffff'ffff}},
+      kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestRNE) {
@@ -2974,6 +3099,69 @@ TEST_F(Riscv64InterpreterTest, TestRNE) {
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}},
       kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(0xb900c457,  // Vnclipu.wx v8, v16, x1, v0.t
+                                 {{32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40},
+                                  {40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48},
+                                  {48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56},
+                                  {56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64}},
+                                 {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+                                 {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7},
+                                  {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf},
+                                  {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7},
+                                  {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}},
+                                 kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd00c457,  // Vnclip.wx v8, v16, x1, v0.t
+      {{224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232},
+       {232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239, 240},
+       {240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248},
+       {248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 0}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}},
+      {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7},
+       {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf},
+       {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7},
+       {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xb90c0457,  // Vnclipu.wv v8, v16, v24, v0.t
+      {{255, 255, 255, 255, 69, 35, 9, 2, 255, 255, 255, 255, 153, 39, 10, 2},
+       {255, 255, 255, 255, 85, 43, 11, 3, 255, 255, 255, 255, 185, 47, 12, 3},
+       {255, 255, 255, 255, 101, 51, 13, 3, 255, 255, 255, 255, 217, 55, 14, 3},
+       {255, 255, 255, 255, 117, 59, 15, 4, 255, 255, 255, 255, 249, 63, 16, 4}},
+      {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x000a},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000c},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000e},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x0010}},
+      {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xa726'a525, 0x0057'9757, 0x0000'5b9b, 0x0000'00bf},
+       {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xe766'e565, 0x0077'b777, 0x0000'7bbb, 0x0000'00ff}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd0c0457,  // Vnclip.wv v8, v16, v24, v0.t
+      {{128, 128, 128, 128, 197, 227, 249, 254, 128, 128, 128, 128, 153, 231, 250, 254},
+       {128, 128, 128, 128, 213, 235, 251, 255, 128, 128, 128, 128, 185, 239, 252, 255},
+       {128, 128, 128, 128, 229, 243, 253, 255, 128, 128, 128, 128, 217, 247, 254, 255},
+       {128, 128, 128, 158, 245, 251, 255, 0, 128, 128, 128, 222, 249, 255, 0, 0}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfffa},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffc},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffe},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0x0000}},
+      {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xa726'a525, 0xffd7'9757, 0xffff'db9b, 0xffff'ffbf},
+       {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xe766'e565, 0xfff7'b777, 0xffff'fbbb, 0xffff'ffff}},
+      kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestRDN) {
@@ -3286,6 +3474,69 @@ TEST_F(Riscv64InterpreterTest, TestRDN) {
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}},
       kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(0xb900c457,  // Vnclipu.wx v8, v16, x1, v0.t
+                                 {{32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39},
+                                  {40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47},
+                                  {48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55},
+                                  {56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63}},
+                                 {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+                                 {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7},
+                                  {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf},
+                                  {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7},
+                                  {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}},
+                                 kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd00c457,  // Vnclip.wx v8, v16, x1, v0.t
+      {{224, 224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231},
+       {232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239},
+       {240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247},
+       {248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}},
+      {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7},
+       {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf},
+       {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7},
+       {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xb90c0457,  // Vnclipu.wv v8, v16, v24, v0.t
+      {{255, 255, 255, 255, 68, 34, 8, 2, 255, 255, 255, 255, 153, 38, 9, 2},
+       {255, 255, 255, 255, 84, 42, 10, 2, 255, 255, 255, 255, 185, 46, 11, 2},
+       {255, 255, 255, 255, 100, 50, 12, 3, 255, 255, 255, 255, 217, 54, 13, 3},
+       {255, 255, 255, 255, 116, 58, 14, 3, 255, 255, 255, 255, 249, 62, 15, 3}},
+      {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x0009},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000b},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000d},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x000f}},
+      {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xa726'a524, 0x0057'9756, 0x0000'5b9b, 0x0000'00bf},
+       {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xe766'e564, 0x0077'b776, 0x0000'7bbb, 0x0000'00ff}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd0c0457,  // Vnclip.wv v8, v16, v24, v0.t
+      {{128, 128, 128, 128, 196, 226, 248, 254, 128, 128, 128, 128, 153, 230, 249, 254},
+       {128, 128, 128, 128, 212, 234, 250, 254, 128, 128, 128, 128, 185, 238, 251, 254},
+       {128, 128, 128, 128, 228, 242, 252, 255, 128, 128, 128, 128, 217, 246, 253, 255},
+       {128, 128, 128, 157, 244, 250, 254, 255, 128, 128, 128, 221, 249, 254, 255, 255}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfff9},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffb},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffd},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0xffff}},
+      {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xa726'a524, 0xffd7'9756, 0xffff'db9b, 0xffff'ffbf},
+       {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xe766'e564, 0xfff7'b776, 0xffff'fbbb, 0xffff'ffff}},
+      kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestROD) {
@@ -3598,6 +3849,69 @@ TEST_F(Riscv64InterpreterTest, TestROD) {
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
        {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000}},
       kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(0xb900c457,  // Vnclipu.wx v8, v16, x1, v0.t
+                                 {{33, 33, 33, 33, 35, 35, 35, 35, 37, 37, 37, 37, 39, 39, 39, 39},
+                                  {41, 41, 41, 41, 43, 43, 43, 43, 45, 45, 45, 45, 47, 47, 47, 47},
+                                  {49, 49, 49, 49, 51, 51, 51, 51, 53, 53, 53, 53, 55, 55, 55, 55},
+                                  {57, 57, 57, 57, 59, 59, 59, 59, 61, 61, 61, 61, 63, 63, 63, 63}},
+                                 {{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+                                  {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+                                 {{0x0021'c1a1, 0x0023'c3a3, 0x0025'c5a5, 0x0027'c7a7},
+                                  {0x0029'c9a9, 0x002b'cbab, 0x002d'cdad, 0x002f'cfaf},
+                                  {0x0031'd1b1, 0x0033'd3b3, 0x0035'd5b5, 0x0037'd7b7},
+                                  {0x0039'd9b9, 0x003b'dbbb, 0x003d'ddbd, 0x003f'dfbf}},
+                                 kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd00c457,  // Vnclip.wx v8, v16, x1, v0.t
+      {{225, 225, 225, 225, 227, 227, 227, 227, 229, 229, 229, 229, 231, 231, 231, 231},
+       {233, 233, 233, 233, 235, 235, 235, 235, 237, 237, 237, 237, 239, 239, 239, 239},
+       {241, 241, 241, 241, 243, 243, 243, 243, 245, 245, 245, 245, 247, 247, 247, 247},
+       {249, 249, 249, 249, 251, 251, 251, 251, 253, 253, 253, 253, 255, 255, 255, 255}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0xdfbf}},
+      {{0xffe1'c1a1, 0xffe3'c3a3, 0xffe5'c5a5, 0xffe7'c7a7},
+       {0xffe9'c9a9, 0xffeb'cbab, 0xffed'cdad, 0xffef'cfaf},
+       {0xfff1'd1b1, 0xfff3'd3b3, 0xfff5'd5b5, 0xfff7'd7b7},
+       {0xfff9'd9b9, 0xfffb'dbbb, 0xfffd'ddbd, 0xffff'dfbf}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xb90c0457,  // Vnclipu.wv v8, v16, v24, v0.t
+      {{255, 255, 255, 255, 69, 35, 9, 3, 255, 255, 255, 255, 153, 39, 9, 3},
+       {255, 255, 255, 255, 85, 43, 11, 3, 255, 255, 255, 255, 185, 47, 11, 3},
+       {255, 255, 255, 255, 101, 51, 13, 3, 255, 255, 255, 255, 217, 55, 13, 3},
+       {255, 255, 255, 255, 117, 59, 15, 3, 255, 255, 255, 255, 249, 63, 15, 3}},
+      {{0xffff, 0xffff, 0xffff, 0xffff, 0x4989, 0x0971, 0x009b, 0x0009},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x5999, 0x0b73, 0x00bb, 0x000b},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x69a9, 0x0d75, 0x00db, 0x000d},
+       {0xffff, 0xffff, 0xffff, 0xffff, 0x79b9, 0x0f77, 0x00fb, 0x000f}},
+      {{0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xa726'a525, 0x0057'9757, 0x0000'5b9b, 0x0000'00bf},
+       {0xffff'ffff, 0xffff'ffff, 0xffff'ffff, 0xffff'ffff},
+       {0xe766'e565, 0x0077'b777, 0x0000'7bbb, 0x0000'00ff}},
+      kVectorCalculationsSource);
+
+  TestNarrowingVectorInstruction(
+      0xbd0c0457,  // Vnclip.wv v8, v16, v24, v0.t
+      {{128, 128, 128, 128, 197, 227, 249, 255, 128, 128, 128, 128, 153, 231, 249, 255},
+       {128, 128, 128, 128, 213, 235, 251, 255, 128, 128, 128, 128, 185, 239, 251, 255},
+       {128, 128, 128, 128, 229, 243, 253, 255, 128, 128, 128, 128, 217, 247, 253, 255},
+       {128, 128, 128, 157, 245, 251, 255, 255, 128, 128, 128, 221, 249, 255, 255, 255}},
+      {{0x8000, 0x8000, 0x8000, 0x8000, 0xc989, 0xf971, 0xff9b, 0xfff9},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xd999, 0xfb73, 0xffbb, 0xfffb},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xe9a9, 0xfd75, 0xffdb, 0xfffd},
+       {0x8000, 0x8000, 0x8000, 0x8000, 0xf9b9, 0xff77, 0xfffb, 0xffff}},
+      {{0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xa726'a525, 0xffd7'9757, 0xffff'db9b, 0xffff'ffbf},
+       {0x8000'0000, 0x8000'0000, 0x8000'0000, 0x8000'0000},
+       {0xe766'e565, 0xfff7'b777, 0xffff'fbbb, 0xffff'ffff}},
+      kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVlXreXX) {
@@ -10061,284 +10375,568 @@ TEST_F(Riscv64InterpreterTest, TestVfsgnj) {
 
 TEST_F(Riscv64InterpreterTest, TestVredsum) {
   TestVectorReductionInstruction(
-      0x10c2457,  // vredsum.vs v8,v16,v24,v0.t
+      0x1882457,  // vredsum.vs v8, v24, v16, v0.t
       // expected_result_vd0_int8
       {242, 228, 200, 144, /* unused */ 0, 146, 44, 121},
       // expected_result_vd0_int16
       {0x0172, 0x82e4, 0x88c8, 0xa090, /* unused */ 0, 0x1300, 0xa904, 0xe119},
       // expected_result_vd0_int32
-      {0xcb44'b932, 0x9407'71e4, 0xa70e'64c8, 0xd312'5090, /* unused */ 0, /* unused */ 0,
-       0x1907'1300, 0xb713'ad09},
+      {0xcb44'b932,
+       0x9407'71e4,
+       0xa70e'64c8,
+       0xd312'5090,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x1907'1300,
+       0xb713'ad09},
       // expected_result_vd0_int64
-      {0xb32f'a926'9f1b'9511, 0x1f99'0d88'fb74'e962, 0xb92c'970e'74e8'52c4, 0xef4e'ad14'6aca'2888,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x2513'1f0e'1907'1300},
+      {0xb32f'a926'9f1b'9511,
+       0x1f99'0d88'fb74'e962,
+       0xb92c'970e'74e8'52c4,
+       0xef4e'ad14'6aca'2888,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x2513'1f0e'1907'1300},
       // expected_result_vd0_with_mask_int8
       {39, 248, 142, 27, /* unused */ 0, 0, 154, 210},
       // expected_result_vd0_with_mask_int16
       {0x5f45, 0xc22f, 0x99d0, 0x98bf, /* unused */ 0, 0x1300, 0x1300, 0x4b15},
       // expected_result_vd0_with_mask_int32
-      {0x2d38'1f29, 0x99a1'838a, 0x1989'ef5c, 0x9cf4'4aa1, /* unused */ 0, /* unused */ 0,
-       0x1907'1300, 0x1907'1300},
+      {0x2d38'1f29,
+       0x99a1'838a,
+       0x1989'ef5c,
+       0x9cf4'4aa1,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x1907'1300,
+       0x1907'1300},
       // expected_result_vd0_with_mask_int64
-      {0x2513'1f0e'1907'1300, 0x917c'8370'7560'6751, 0x4e56'3842'222a'0c13, 0xc833'9e0e'73df'49b5,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x2513'1f0e'1907'1300},
+      {0x2513'1f0e'1907'1300,
+       0x917c'8370'7560'6751,
+       0x4e56'3842'222a'0c13,
+       0xc833'9e0e'73df'49b5,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x2513'1f0e'1907'1300},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVfredosum) {
-  TestVectorReductionInstruction(
-      0xd0c1457,  // vfredosum.vs v8, v16, v24, v0.t
-      // expected_result_vd0_int32
-      {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9e0c'9a8e},
-      // expected_result_vd0_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
-      // expected_result_vd0_with_mask_int32
-      {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9604'9200},
-      // expected_result_vd0_with_mask_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
-      kVectorCalculationsSource);
+  TestVectorReductionInstruction(0xd881457,  // vfredosum.vs v8, v24, v16, v0.t
+                                 // expected_result_vd0_int32
+                                 {0x9e0c'9a8e,
+                                  0xbe2c'bace,
+                                  0xfe6c'fb4e,
+                                  0x7e6b'fc4d,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9604'9200,
+                                  0x9e0c'9a8e},
+                                 // expected_result_vd0_int64
+                                 {0x9e0c'9a09'9604'9200,
+                                  0xbe2c'ba29'b624'b220,
+                                  0xfe6c'fa69'f664'f260,
+                                  0x7eec'5def'0cee'0dee,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9e0c'9a09'9604'9200},
+                                 // expected_result_vd0_with_mask_int32
+                                 {0x9604'929d,
+                                  0xbe2c'ba29,
+                                  0xfe6c'fb4e,
+                                  0x7e6b'fa84,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9604'9200,
+                                  0x9604'9200},
+                                 // expected_result_vd0_with_mask_int64
+                                 {0x9e0c'9a09'9604'9200,
+                                  0xbe2c'ba29'b624'b220,
+                                  0xee7c'ea78'e674'e271,
+                                  0x6efc'4e0d'ee0d'ee0f,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9e0c'9a09'9604'9200},
+                                 kVectorCalculationsSource);
 }
 
 // Currently Vfredusum is implemented as Vfredosum (as explicitly permitted by RVV 1.0).
 // If we would implement some speedups which would change results then we may need to alter tests.
 TEST_F(Riscv64InterpreterTest, TestVfredusum) {
-  TestVectorReductionInstruction(
-      0x50c1457,  // vfredusum.vs v8, v16, v24, v0.t
-      // expected_result_vd0_int32
-      {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9e0c'9a8e},
-      // expected_result_vd0_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
-      // expected_result_vd0_with_mask_int32
-      {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9604'9200},
-      // expected_result_vd0_with_mask_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
-      kVectorCalculationsSource);
+  TestVectorReductionInstruction(0x5881457,  // vfredusum.vs v8, v24, v16, v0.t
+                                 // expected_result_vd0_int32
+                                 {0x9e0c'9a8e,
+                                  0xbe2c'bace,
+                                  0xfe6c'fb4e,
+                                  0x7e6b'fc4d,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9604'9200,
+                                  0x9e0c'9a8e},
+                                 // expected_result_vd0_int64
+                                 {0x9e0c'9a09'9604'9200,
+                                  0xbe2c'ba29'b624'b220,
+                                  0xfe6c'fa69'f664'f260,
+                                  0x7eec'5def'0cee'0dee,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9e0c'9a09'9604'9200},
+                                 // expected_result_vd0_with_mask_int32
+                                 {0x9604'929d,
+                                  0xbe2c'ba29,
+                                  0xfe6c'fb4e,
+                                  0x7e6b'fa84,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9604'9200,
+                                  0x9604'9200},
+                                 // expected_result_vd0_with_mask_int64
+                                 {0x9e0c'9a09'9604'9200,
+                                  0xbe2c'ba29'b624'b220,
+                                  0xee7c'ea78'e674'e271,
+                                  0x6efc'4e0d'ee0d'ee0f,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9e0c'9a09'9604'9200},
+                                 kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredand) {
   TestVectorReductionInstruction(
-      0x50c2457,  // vredand.vs v8,v16,v24,v0.t
+      0x5882457,  // vredand.vs v8, v24, v16, v0.t
       // expected_result_vd0_int8
       {0, 0, 0, 0, /* unused */ 0, 0, 0, 0},
       // expected_result_vd0_int16
       {0x8000, 0x8000, 0x8000, 0x0000, /* unused */ 0, 0x8000, 0x8000, 0x8000},
       // expected_result_vd0_int32
-      {0x8200'8000, 0x8200'8000, 0x8200'8000, 0x0200'0000, /* unused */ 0, /* unused */ 0,
-       0x8200'8000, 0x8200'8000},
+      {0x8200'8000,
+       0x8200'8000,
+       0x8200'8000,
+       0x0200'0000,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8200'8000,
+       0x8200'8000},
       // expected_result_vd0_int64
-      {0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x0604'0000'0200'0000,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8604'8000'8200'8000},
+      {0x8604'8000'8200'8000,
+       0x8604'8000'8200'8000,
+       0x8604'8000'8200'8000,
+       0x0604'0000'0200'0000,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8604'8000'8200'8000},
       // expected_result_vd0_with_mask_int8
       {0, 0, 0, 0, /* unused */ 0, 0, 0, 0},
       // expected_result_vd0_with_mask_int16
       {0x8000, 0x8000, 0x8000, 0x0000, /* unused */ 0, 0x8000, 0x8000, 0x8000},
       // expected_result_vd0_with_mask_int32
-      {0x8200'8000, 0x8200'8000, 0x8200'8000, 0x0200'0000, /* unused */ 0, /* unused */ 0,
-       0x8200'8000, 0x8200'8000},
+      {0x8200'8000,
+       0x8200'8000,
+       0x8200'8000,
+       0x0200'0000,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8200'8000,
+       0x8200'8000},
       // expected_result_vd0_with_mask_int64
-      {0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x8604'8000'8200'8000, 0x0604'0000'0200'0000,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8604'8000'8200'8000},
+      {0x8604'8000'8200'8000,
+       0x8604'8000'8200'8000,
+       0x8604'8000'8200'8000,
+       0x0604'0000'0200'0000,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8604'8000'8200'8000},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredor) {
   TestVectorReductionInstruction(
-      0x90c2457,  // vredor.vs v8,v16,v24,v0.t
+      0x9882457,  // vredor.vs v8, v24, v16, v0.t
       // expected_result_vd0_int8
       {159, 191, 255, 255, /* unused */ 0, 146, 150, 159},
       // expected_result_vd0_int16
       {0x9f1d, 0xbf3d, 0xff7d, 0xfffd, /* unused */ 0, 0x9300, 0x9704, 0x9f0d},
       // expected_result_vd0_int32
-      {0x9f1e'9b19, 0xbf3e'bb39, 0xff7e'fb79, 0xfffe'fbf9, /* unused */ 0, /* unused */ 0,
-       0x9706'9300, 0x9f0e'9b09},
+      {0x9f1e'9b19,
+       0xbf3e'bb39,
+       0xff7e'fb79,
+       0xfffe'fbf9,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9706'9300,
+       0x9f0e'9b09},
       // expected_result_vd0_int64
-      {0x9f1e'9f1d'9716'9311, 0xbf3e'bf3d'b736'b331, 0xff7e'ff7d'f776'f371, 0xfffe'fffd'f7f6'f3f1,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9f0e'9f0d'9706'9300},
+      {0x9f1e'9f1d'9716'9311,
+       0xbf3e'bf3d'b736'b331,
+       0xff7e'ff7d'f776'f371,
+       0xfffe'fffd'f7f6'f3f1,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9f0e'9f0d'9706'9300},
       // expected_result_vd0_with_mask_int8
       {159, 191, 255, 255, /* unused */ 0, 0, 150, 158},
       // expected_result_vd0_with_mask_int16
       {0x9f1d, 0xbf3d, 0xff7d, 0xfffd, /* unused */ 0, 0x9300, 0x9300, 0x9f0d},
       // expected_result_vd0_with_mask_int32
-      {0x9f1e'9b19, 0xbf3e'bb39, 0xff7e'fb79, 0xfffe'fbf9, /* unused */ 0, /* unused */ 0,
-       0x9706'9300, 0x9706'9300},
+      {0x9f1e'9b19,
+       0xbf3e'bb39,
+       0xff7e'fb79,
+       0xfffe'fbf9,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9706'9300,
+       0x9706'9300},
       // expected_result_vd0_with_mask_int64
-      {0x9f0e'9f0d'9706'9300, 0xbf3e'bf3d'b736'b331, 0xff7e'ff7d'f776'f371, 0xfffe'fffd'f7f6'f3f1,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9f0e'9f0d'9706'9300},
+      {0x9f0e'9f0d'9706'9300,
+       0xbf3e'bf3d'b736'b331,
+       0xff7e'ff7d'f776'f371,
+       0xfffe'fffd'f7f6'f3f1,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9f0e'9f0d'9706'9300},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredxor) {
   TestVectorReductionInstruction(
-      0xd0c2457,  // vredxor.vs v8,v16,v24,v0.t
+      0xd882457,  // vredxor.vs v8, v24, v16, v0.t
       // expected_result_vd0_int8
       {0, 0, 0, 0, /* unused */ 0, 146, 0, 1},
       // expected_result_vd0_int16
       {0x8100, 0x8100, 0x8100, 0x8100, /* unused */ 0, 0x1300, 0x8504, 0x8101},
       // expected_result_vd0_int32
-      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x8302'8100, /* unused */ 0, /* unused */ 0,
-       0x1506'1300, 0x8b0a'8909},
+      {0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x1506'1300,
+       0x8b0a'8909},
       // expected_result_vd0_int64
-      {0x9716'9515'9312'9111, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x190a'1f0d'1506'1300},
+      {0x9716'9515'9312'9111,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x190a'1f0d'1506'1300},
       // expected_result_vd0_with_mask_int8
       {143, 154, 150, 43, /* unused */ 0, 0, 146, 150},
       // expected_result_vd0_with_mask_int16
       {0x1f0d, 0xbd3d, 0x9514, 0x8d0d, /* unused */ 0, 0x1300, 0x1300, 0x1705},
       // expected_result_vd0_with_mask_int32
-      {0x1d0e'1b09, 0x0d1e'0b18, 0xfb7a'f978, 0xab2a'a929, /* unused */ 0, /* unused */ 0,
-       0x1506'1300, 0x1506'1300},
+      {0x1d0e'1b09,
+       0x0d1e'0b18,
+       0xfb7a'f978,
+       0xab2a'a929,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x1506'1300,
+       0x1506'1300},
       // expected_result_vd0_with_mask_int64
-      {0x190a'1f0d'1506'1300, 0x091a'0f1c'0516'0311, 0x293a'2f3c'2536'2331, 0x77f6'75f5'73f2'71f1,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x190a'1f0d'1506'1300},
+      {0x190a'1f0d'1506'1300,
+       0x091a'0f1c'0516'0311,
+       0x293a'2f3c'2536'2331,
+       0x77f6'75f5'73f2'71f1,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x190a'1f0d'1506'1300},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredminu) {
   TestVectorReductionInstruction(
-      0x110c2457,  // vredminu.vs v8,v16,v24,v0.t
+      0x11882457,  // vredminu.vs v8,v24,v16,v0.t
       // expected_result_vd0_int8
       {0, 0, 0, 0, /* unused */ 0, 0, 0, 0},
       // expected_result_vd0_int16
       {0x8100, 0x8100, 0x8100, 0x0291, /* unused */ 0, 0x8100, 0x8100, 0x8100},
       // expected_result_vd0_int32
-      {0x83028100, 0x83028100, 0x83028100, 0x06940291, /* unused */ 0, /* unused */ 0, 0x83028100,
+      {0x83028100,
+       0x83028100,
+       0x83028100,
+       0x06940291,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x83028100,
        0x83028100},
       // expected_result_vd0_int64
-      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x0e9c'0a98'0694'0291,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+      {0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x0e9c'0a98'0694'0291,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8706'8504'8302'8100},
       // expected_result_vd0_with_mask_int8
       {0, 0, 0, 0, /* unused */ 0, 0, 0, 0},
       // expected_result_vd0_with_mask_int16
       {0x8100, 0x8100, 0x8100, 0x0291, /* unused */ 0, 0x8100, 0x8100, 0x8100},
       // expected_result_vd0_with_mask_int32
-      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x0e9c'0a98, /* unused */ 0, /* unused */ 0,
-       0x8302'8100, 0x8302'8100},
+      {0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       0x0e9c'0a98,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8302'8100,
+       0x8302'8100},
       // expected_result_vd0_with_mask_int64
-      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x1e8c'1a89'1684'1280,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+      {0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x1e8c'1a89'1684'1280,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8706'8504'8302'8100},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredmin) {
   TestVectorReductionInstruction(
-      0x150c2457,  // vredmin.vs v8,v16,v24,v0.t
+      0x15882457,  // vredmin.vs v8,v24,v16,v0.t
       // expected_result_vd0_int8
       {130, 130, 130, 128, /* unused */ 0, 146, 146, 146},
       // expected_result_vd0_int16
       {0x8100, 0x8100, 0x8100, 0x8100, /* unused */ 0, 0x8100, 0x8100, 0x8100},
       // expected_result_vd0_int32
-      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x8302'8100, /* unused */ 0, /* unused */ 0,
-       0x8302'8100, 0x8302'8100},
+      {0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8302'8100,
+       0x8302'8100},
       // expected_result_vd0_int64
-      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+      {0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8706'8504'8302'8100},
       // expected_result_vd0_with_mask_int8
       {138, 138, 138, 128, /* unused */ 0, 0, 150, 150},
       // expected_result_vd0_with_mask_int16
       {0x8100, 0x8100, 0x8100, 0x8100, /* unused */ 0, 0x8100, 0x8100, 0x8100},
       // expected_result_vd0_with_mask_int32
-      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x8302'8100, /* unused */ 0, /* unused */ 0,
-       0x8302'8100, 0x8302'8100},
+      {0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       0x8302'8100,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8302'8100,
+       0x8302'8100},
       // expected_result_vd0_with_mask_int64
-      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+      {0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       0x8706'8504'8302'8100,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x8706'8504'8302'8100},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVfredmin) {
-  TestVectorReductionInstruction(
-      0x150c1457,  // vfredmin.vs v8, v16, v24, v0.t
-      // expected_result_vd0_int32
-      {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9e0c'9a09},
-      // expected_result_vd0_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0xfe6c'fa69'f664'f260,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
-      // expected_result_vd0_with_mask_int32
-      {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9604'9200},
-      // expected_result_vd0_with_mask_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0xee7c'ea78'e674'e271,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
-      kVectorCalculationsSource);
+  TestVectorReductionInstruction(0x15881457,  // vfredmin.vs v8, v24, v16, v0.t
+                                              // expected_result_vd0_int32
+                                 {0x9e0c'9a09,
+                                  0xbe2c'ba29,
+                                  0xfe6c'fa69,
+                                  0xfe6c'fa69,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9604'9200,
+                                  0x9e0c'9a09},
+                                 // expected_result_vd0_int64
+                                 {0x9e0c'9a09'9604'9200,
+                                  0xbe2c'ba29'b624'b220,
+                                  0xfe6c'fa69'f664'f260,
+                                  0xfe6c'fa69'f664'f260,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9e0c'9a09'9604'9200},
+                                 // expected_result_vd0_with_mask_int32
+                                 {0x9604'9200,
+                                  0xbe2c'ba29,
+                                  0xfe6c'fa69,
+                                  0xfe6c'fa69,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9604'9200,
+                                  0x9604'9200},
+                                 // expected_result_vd0_with_mask_int64
+                                 {0x9e0c'9a09'9604'9200,
+                                  0xbe2c'ba29'b624'b220,
+                                  0xee7c'ea78'e674'e271,
+                                  0xee7c'ea78'e674'e271,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x9e0c'9a09'9604'9200},
+                                 kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredmaxu) {
   TestVectorReductionInstruction(
-      0x190c2457,  // vredmaxu.vs v8,v16,v24,v0.t
+      0x19882457,  // vredmaxu.vs v8,v24,v16,v0.t
       // expected_result_vd0_int8
       {158, 190, 254, 254, /* unused */ 0, 146, 150, 158},
       // expected_result_vd0_int16
       {0x9e0c, 0xbe2c, 0xfe6c, 0xfe6c, /* unused */ 0, 0x9200, 0x9604, 0x9e0c},
       // expected_result_vd0_int32
-      {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9e0c'9a09},
+      {0x9e0c'9a09,
+       0xbe2c'ba29,
+       0xfe6c'fa69,
+       0xfe6c'fa69,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9604'9200,
+       0x9e0c'9a09},
       // expected_result_vd0_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0xfe6c'fa69'f664'f260,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      {0x9e0c'9a09'9604'9200,
+       0xbe2c'ba29'b624'b220,
+       0xfe6c'fa69'f664'f260,
+       0xfe6c'fa69'f664'f260,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9e0c'9a09'9604'9200},
       // expected_result_vd0_with_mask_int8
       {158, 186, 254, 254, /* unused */ 0, 0, 150, 158},
       // expected_result_vd0_with_mask_int16
       {0x9e0c, 0xba29, 0xfe6c, 0xfe6c, /* unused */ 0, 0x9200, 0x9200, 0x9e0c},
       // expected_result_vd0_with_mask_int32
-      {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9604'9200},
+      {0x9604'9200,
+       0xbe2c'ba29,
+       0xfe6c'fa69,
+       0xfe6c'fa69,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9604'9200,
+       0x9604'9200},
       // expected_result_vd0_with_mask_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0xee7c'ea78'e674'e271,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      {0x9e0c'9a09'9604'9200,
+       0xbe2c'ba29'b624'b220,
+       0xee7c'ea78'e674'e271,
+       0xee7c'ea78'e674'e271,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9e0c'9a09'9604'9200},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVredmax) {
   TestVectorReductionInstruction(
-      0x1d0c2457,  // vredmax.vs v8,v16,v24,v0.t
+      0x1d882457,  // vredmax.vs v8,v24,v16,v0.t
       // expected_result_vd0_int8
       {28, 60, 124, 126, /* unused */ 0, 0, 4, 12},
       // expected_result_vd0_int16
       {0x9e0c, 0xbe2c, 0xfe6c, 0x7eec, /* unused */ 0, 0x9200, 0x9604, 0x9e0c},
       // expected_result_vd0_int32
-      {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9e0c'9a09},
+      {0x9e0c'9a09,
+       0xbe2c'ba29,
+       0xfe6c'fa69,
+       0x7eec'7ae9,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9604'9200,
+       0x9e0c'9a09},
       // expected_result_vd0_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'7ae9'76e4'72e0,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      {0x9e0c'9a09'9604'9200,
+       0xbe2c'ba29'b624'b220,
+       0xfe6c'fa69'f664'f260,
+       0x7eec'7ae9'76e4'72e0,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9e0c'9a09'9604'9200},
       // expected_result_vd0_with_mask_int8
       {24, 52, 124, 126, /* unused */ 0, 0, 4, 4},
       // expected_result_vd0_with_mask_int16
       {0x9e0c, 0xba29, 0xfe6c, 0x7ae9, /* unused */ 0, 0x9200, 0x9200, 0x9e0c},
       // expected_result_vd0_with_mask_int32
-      {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
-       0x9604'9200, 0x9604'9200},
+      {0x9604'9200,
+       0xbe2c'ba29,
+       0xfe6c'fa69,
+       0x7eec'7ae9,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9604'9200,
+       0x9604'9200},
       // expected_result_vd0_with_mask_int64
-      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'6af8'66f4'62f1,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      {0x9e0c'9a09'9604'9200,
+       0xbe2c'ba29'b624'b220,
+       0xee7c'ea78'e674'e271,
+       0x6efc'6af8'66f4'62f1,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x9e0c'9a09'9604'9200},
       kVectorCalculationsSource);
 }
 
 TEST_F(Riscv64InterpreterTest, TestVfredmax) {
-  TestVectorReductionInstruction(
-      0x1d0c1457,  // vfredmax.vs v8, v16, v24, v0.t
-      // expected_result_vd0_int32
-      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
-       0x8302'8100, 0x8302'8100},
-      // expected_result_vd0_int64
-      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x7eec'7ae9'76e4'72e0,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
-      // expected_result_vd0_with_mask_int32
-      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
-       0x8302'8100, 0x8302'8100},
-      // expected_result_vd0_with_mask_int64
-      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x6efc'6af8'66f4'62f1,
-       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
-      kVectorCalculationsSource);
+  TestVectorReductionInstruction(0x1d881457,  // vfredmax.vs v8, v24, v16, v0.t
+                                              // expected_result_vd0_int32
+                                 {0x8302'8100,
+                                  0x8302'8100,
+                                  0x8302'8100,
+                                  0x7eec'7ae9,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x8302'8100,
+                                  0x8302'8100},
+                                 // expected_result_vd0_int64
+                                 {0x8706'8504'8302'8100,
+                                  0x8706'8504'8302'8100,
+                                  0x8706'8504'8302'8100,
+                                  0x7eec'7ae9'76e4'72e0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x8706'8504'8302'8100},
+                                 // expected_result_vd0_with_mask_int32
+                                 {0x8302'8100,
+                                  0x8302'8100,
+                                  0x8302'8100,
+                                  0x7eec'7ae9,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x8302'8100,
+                                  0x8302'8100},
+                                 // expected_result_vd0_with_mask_int64
+                                 {0x8706'8504'8302'8100,
+                                  0x8706'8504'8302'8100,
+                                  0x8706'8504'8302'8100,
+                                  0x6efc'6af8'66f4'62f1,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  /* unused */ 0,
+                                  0x8706'8504'8302'8100},
+                                 kVectorCalculationsSource);
 }
 
 // Note that the expected test outputs for v[f]merge.vXm are identical to those for v[f]mv.v.X.
@@ -12168,6 +12766,155 @@ TEST_F(Riscv64InterpreterTest, TestVslide1down) {
                                    /*last_elem_is_x1=*/true);
 }
 
+TEST_F(Riscv64InterpreterTest, TestVfslide1up) {  // 32-bit table first, then 64-bit table.
+  TestVectorFloatInstruction(0x3980d457,  // vfslide1up.vf v8, v24, f1, v0.t (f1 presumably 5.625)
+                             {{0x40b4'0000, 0x9604'9200, 0x9e0c'9a09, 0x8614'8211},  // [0]: 5.625
+                              {0x8e1c'8a18, 0xb624'b220, 0xbe2c'ba29, 0xa634'a231},
+                              {0xae3c'aa38, 0xd644'd240, 0xde4c'da49, 0xc654'c251},
+                              {0xce5c'ca58, 0xf664'f260, 0xfe6c'fa69, 0xe674'e271},
+                              {0xee7c'ea78, 0x1684'1280, 0x1e8c'1a89, 0x0694'0291},
+                              {0x0e9c'0a98, 0x36a4'32a0, 0x3eac'3aa9, 0x26b4'22b1},
+                              {0x2ebc'2ab8, 0x56c4'52c0, 0x5ecc'5ac9, 0x46d4'42d1},
+                              {0x4edc'4ad8, 0x76e4'72e0, 0x7eec'7ae9, 0x66f4'62f1}},
+                             {{0x4016'8000'0000'0000, 0x9e0c'9a09'9604'9200},  // [0]: 5.625
+                              {0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220},
+                              {0xae3c'aa38'a634'a231, 0xde4c'da49'd644'd240},
+                              {0xce5c'ca58'c654'c251, 0xfe6c'fa69'f664'f260},
+                              {0xee7c'ea78'e674'e271, 0x1e8c'1a89'1684'1280},
+                              {0x0e9c'0a98'0694'0291, 0x3eac'3aa9'36a4'32a0},
+                              {0x2ebc'2ab8'26b4'22b1, 0x5ecc'5ac9'56c4'52c0},
+                              {0x4edc'4ad8'46d4'42d1, 0x7eec'7ae9'76e4'72e0}},
+                             kVectorCalculationsSource);
+}
+
+TEST_F(Riscv64InterpreterTest, TestVfslide1down) {
+  // The slot where f1 is inserted (the last one in each table below) depends on VLMUL, so we use
+  // TestVectorFloatPermutationInstruction instead of TestVectorFloatInstruction.
+
+  // VLMUL = 0: f1 ends up as the last element of row 0.
+  TestVectorFloatPermutationInstruction(
+      0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+      {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0x40b4'0000}, {}, {}, {}, {}, {}, {}, {}},
+      {{0x8e1c'8a18'8614'8211, 0x4016'8000'0000'0000}, {}, {}, {}, {}, {}, {}, {}},
+      kVectorCalculationsSource,
+      /*vlmul=*/0,
+      /*skip=*/0,
+      /*ignore_vma_for_last=*/true,
+      /*last_elem_is_f1=*/true);
+
+  // VLMUL = 1: f1 ends up as the last element of row 1.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0xb624'b220},
+                                         {0xbe2c'ba29, 0xa634'a231, 0xae3c'aa38, 0x40b4'0000},
+                                         {},
+                                         {},
+                                         {},
+                                         {},
+                                         {},
+                                         {}},
+                                        {{0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220},
+                                         {0xae3c'aa38'a634'a231, 0x4016'8000'0000'0000},
+                                         {},
+                                         {},
+                                         {},
+                                         {},
+                                         {},
+                                         {}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/1,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+
+  // VLMUL = 2: f1 ends up as the last element of row 3.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0xb624'b220},
+                                         {0xbe2c'ba29, 0xa634'a231, 0xae3c'aa38, 0xd644'd240},
+                                         {0xde4c'da49, 0xc654'c251, 0xce5c'ca58, 0xf664'f260},
+                                         {0xfe6c'fa69, 0xe674'e271, 0xee7c'ea78, 0x40b4'0000},
+                                         {},
+                                         {},
+                                         {},
+                                         {}},
+                                        {{0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220},
+                                         {0xae3c'aa38'a634'a231, 0xde4c'da49'd644'd240},
+                                         {0xce5c'ca58'c654'c251, 0xfe6c'fa69'f664'f260},
+                                         {0xee7c'ea78'e674'e271, 0x4016'8000'0000'0000},
+                                         {},
+                                         {},
+                                         {},
+                                         {}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/2,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+
+  // VLMUL = 3: all eight rows are used; f1 ends up as the last element of row 7.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{0x9e0c'9a09, 0x8614'8211, 0x8e1c'8a18, 0xb624'b220},
+                                         {0xbe2c'ba29, 0xa634'a231, 0xae3c'aa38, 0xd644'd240},
+                                         {0xde4c'da49, 0xc654'c251, 0xce5c'ca58, 0xf664'f260},
+                                         {0xfe6c'fa69, 0xe674'e271, 0xee7c'ea78, 0x1684'1280},
+                                         {0x1e8c'1a89, 0x0694'0291, 0x0e9c'0a98, 0x36a4'32a0},
+                                         {0x3eac'3aa9, 0x26b4'22b1, 0x2ebc'2ab8, 0x56c4'52c0},
+                                         {0x5ecc'5ac9, 0x46d4'42d1, 0x4edc'4ad8, 0x76e4'72e0},
+                                         {0x7eec'7ae9, 0x66f4'62f1, 0x6efc'6af8, 0x40b4'0000}},
+                                        {{0x8e1c'8a18'8614'8211, 0xbe2c'ba29'b624'b220},
+                                         {0xae3c'aa38'a634'a231, 0xde4c'da49'd644'd240},
+                                         {0xce5c'ca58'c654'c251, 0xfe6c'fa69'f664'f260},
+                                         {0xee7c'ea78'e674'e271, 0x1e8c'1a89'1684'1280},
+                                         {0x0e9c'0a98'0694'0291, 0x3eac'3aa9'36a4'32a0},
+                                         {0x2ebc'2ab8'26b4'22b1, 0x5ecc'5ac9'56c4'52c0},
+                                         {0x4edc'4ad8'46d4'42d1, 0x7eec'7ae9'76e4'72e0},
+                                         {0x6efc'6af8'66f4'62f1, 0x4016'8000'0000'0000}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/3,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+
+  // VLMUL = 4: both expected tables are empty.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{}, {}, {}, {}, {}, {}, {}, {}},
+                                        {{}, {}, {}, {}, {}, {}, {}, {}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/4,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+
+  // VLMUL = 5: both expected tables are empty.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{}, {}, {}, {}, {}, {}, {}, {}},
+                                        {{}, {}, {}, {}, {}, {}, {}, {}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/5,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+
+  // VLMUL = 6: the 32-bit result is just f1; the 64-bit table is empty.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{0x40b4'0000}, {}, {}, {}, {}, {}, {}, {}},
+                                        {{}, {}, {}, {}, {}, {}, {}, {}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/6,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+
+  // VLMUL = 7: two 32-bit elements (f1 last); the 64-bit result is just f1.
+  TestVectorFloatPermutationInstruction(0x3d80d457,  // vfslide1down.vf v8, v24, f1, v0.t
+                                        {{0x9e0c'9a09, 0x40b4'0000}, {}, {}, {}, {}, {}, {}, {}},
+                                        {{0x4016'8000'0000'0000}, {}, {}, {}, {}, {}, {}, {}},
+                                        kVectorCalculationsSource,
+                                        /*vlmul=*/7,
+                                        /*skip=*/0,
+                                        /*ignore_vma_for_last=*/true,
+                                        /*last_elem_is_f1=*/true);
+}
+
 TEST_F(Riscv64InterpreterTest, TestVwadd) {
   TestWideningVectorInstruction(0xc50c2457,  // vwadd.vv v8,v16,v24,v0.t
                                 {{0x0000, 0xff13, 0x0006, 0xff19, 0x000d, 0xff1f, 0x0012, 0xff25},
diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
index 27353bf3..2019aa6b 100644
--- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
+++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
@@ -873,6 +873,11 @@ std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType
   DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
                                 , (SIMD128Register src1, ElementType src2), (), (src1, src2))
 
+#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
+  DEFINE_W_ARITHMETIC_INTRINSIC(                                  \
+      Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; });          \
+      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))
+
 #define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
   DEFINE_W_ARITHMETIC_INTRINSIC(                                  \
       Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; });          \
@@ -1103,6 +1108,10 @@ DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{arg
                                           (arg1 >> arg2))
 DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...};
                                           (arg1 >> arg2))
+DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(
+    clip,
+    WideType<ElementType>{(std::get<0>(
+        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})
 DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(
     clip,
     WideType<ElementType>{(std::get<0>(
diff --git a/kernel_api/riscv64/open_emulation.cc b/kernel_api/riscv64/open_emulation.cc
index dacf77ad..e2257df1 100644
--- a/kernel_api/riscv64/open_emulation.cc
+++ b/kernel_api/riscv64/open_emulation.cc
@@ -25,10 +25,7 @@
 
 #include "berberis/kernel_api/tracing.h"
 
-#define GUEST_O_DIRECTORY 00040000
-#define GUEST_O_NOFOLLOW 00100000
-#define GUEST_O_DIRECT 00200000
-#define GUEST_O_LARGEFILE 00400000
+#define GUEST_O_LARGEFILE 00100000
 
 namespace berberis {
 
@@ -55,7 +52,7 @@ namespace berberis {
 
 static_assert((O_ACCMODE & ~O_SEARCH) == 00000003);
 
-// These flags should have the same value on all architectures.
+// These flags should have the same value on guest and host architectures.
 static_assert(O_CREAT == 00000100);
 static_assert(O_EXCL == 00000200);
 static_assert(O_NOCTTY == 00000400);
@@ -65,7 +62,10 @@ static_assert(O_NONBLOCK == 00004000);
 static_assert(O_DSYNC == 00010000);
 static_assert(FASYNC == 00020000);
 static_assert(O_NOATIME == 01000000);
+static_assert(O_DIRECTORY == 0200000);
+static_assert(O_NOFOLLOW == 00400000);
 static_assert(O_CLOEXEC == 02000000);
+static_assert(O_DIRECT == 040000);
 static_assert(__O_SYNC == 04000000);
 static_assert(O_SYNC == (O_DSYNC | __O_SYNC));
 static_assert(O_PATH == 010000000);
@@ -73,14 +73,13 @@ static_assert(O_PATH == 010000000);
 namespace {
 
 const int kCompatibleOpenFlags = O_ACCMODE | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND |
-                                 O_NONBLOCK | O_DSYNC | FASYNC | O_NOATIME | O_CLOEXEC | __O_SYNC |
-                                 O_PATH;
+                                 O_NONBLOCK | O_DSYNC | FASYNC | O_NOATIME | O_DIRECTORY |
+                                 O_NOFOLLOW | O_CLOEXEC | O_DIRECT | __O_SYNC | O_PATH;
 
 }  // namespace
 
 int ToHostOpenFlags(int guest_flags) {
-  const int kIncompatibleGuestOpenFlags =
-      GUEST_O_DIRECTORY | GUEST_O_NOFOLLOW | GUEST_O_DIRECT | GUEST_O_LARGEFILE;
+  const int kIncompatibleGuestOpenFlags = GUEST_O_LARGEFILE;
 
   int unknown_guest_flags = guest_flags & ~(kCompatibleOpenFlags | kIncompatibleGuestOpenFlags);
   if (unknown_guest_flags) {
@@ -91,15 +90,6 @@ int ToHostOpenFlags(int guest_flags) {
 
   int host_flags = guest_flags & ~kIncompatibleGuestOpenFlags;
 
-  if (guest_flags & GUEST_O_DIRECTORY) {
-    host_flags |= O_DIRECTORY;
-  }
-  if (guest_flags & GUEST_O_NOFOLLOW) {
-    host_flags |= O_NOFOLLOW;
-  }
-  if (guest_flags & GUEST_O_DIRECT) {
-    host_flags |= O_DIRECT;
-  }
   if (guest_flags & GUEST_O_LARGEFILE) {
     host_flags |= O_LARGEFILE;
   }
@@ -108,7 +98,7 @@ int ToHostOpenFlags(int guest_flags) {
 }
 
 int ToGuestOpenFlags(int host_flags) {
-  const int kIncompatibleHostOpenFlags = O_DIRECTORY | O_NOFOLLOW | O_DIRECT | O_LARGEFILE;
+  const int kIncompatibleHostOpenFlags = O_LARGEFILE;
 
   int unknown_host_flags = host_flags & ~(kCompatibleOpenFlags | kIncompatibleHostOpenFlags);
   if (unknown_host_flags) {
@@ -119,15 +109,6 @@ int ToGuestOpenFlags(int host_flags) {
 
   int guest_flags = host_flags & ~kIncompatibleHostOpenFlags;
 
-  if (host_flags & O_DIRECTORY) {
-    guest_flags |= GUEST_O_DIRECTORY;
-  }
-  if (host_flags & O_NOFOLLOW) {
-    guest_flags |= GUEST_O_NOFOLLOW;
-  }
-  if (host_flags & O_DIRECT) {
-    guest_flags |= GUEST_O_DIRECT;
-  }
   if (host_flags & O_LARGEFILE) {
     guest_flags |= GUEST_O_LARGEFILE;
   }
diff --git a/tests/inline_asm_tests/Android.bp b/tests/inline_asm_tests/Android.bp
index bba729f6..90e082e5 100644
--- a/tests/inline_asm_tests/Android.bp
+++ b/tests/inline_asm_tests/Android.bp
@@ -48,3 +48,18 @@ cc_test {
     },
     static_executable: true,
 }
+
+// Inline-assembly test binary whose only source, main_riscv64.cc, is listed under the riscv64
+// arch section.
+cc_test {
+    name: "inline_asm_tests_riscv64",
+    native_bridge_supported: true,
+    // Disabled by default; the riscv64 section below is the only place that turns it back on.
+    enabled: false,
+    arch: {
+        riscv64: {
+            enabled: true,
+            srcs: [
+                "main_riscv64.cc",
+            ],
+        },
+    },
+    static_executable: true,
+}
diff --git a/tests/inline_asm_tests/main_riscv64.cc b/tests/inline_asm_tests/main_riscv64.cc
new file mode 100644
index 00000000..694909a4
--- /dev/null
+++ b/tests/inline_asm_tests/main_riscv64.cc
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+
+#include <cstdint>
+#include <tuple>
+
+namespace {
+
+// Returns the number of trailing zero bits of x, which equals log2(x) when x is a power of two.
+//
+// The 64-bit builtin is used so that arguments wider than unsigned int (for example the size_t
+// produced by sizeof) are not truncated before the bits are counted. The result is undefined
+// for x == 0.
+template <typename T>
+constexpr T BitUtilLog2(T x) {
+  return static_cast<T>(__builtin_ctzll(static_cast<unsigned long long>(x)));
+}
+
+// A 128-bit value stored as two 64-bit halves.
+// TODO(b/301577077): Maybe use __uint128_t instead.
+// Or provide a more versatile wrapper, that one can easily init, copy and compare.
+using __v2du = uint64_t[2];
+
+// Input data for the tested instructions: sixteen 16-byte entries, 256 bytes in total.
+// RunTwoVectorArgsOneRes loads one register group from the start of this table and a second one
+// from 128 bytes further in, i.e. from the ninth entry onwards.
+constexpr __v2du kVectorCalculationsSource[16] = {
+    {0x8706'8504'8302'8100, 0x8f0e'8d0c'8b0a'8908},
+    {0x9716'9514'9312'9110, 0x9f1e'9d1c'9b1a'9918},
+    {0xa726'a524'a322'a120, 0xaf2e'ad2c'ab2a'a928},
+    {0xb736'b534'b332'b130, 0xbf3e'bd3c'bb3a'b938},
+    {0xc746'c544'c342'c140, 0xcf4e'cd4c'cb4a'c948},
+    {0xd756'd554'd352'd150, 0xdf5e'dd5c'db5a'd958},
+    {0xe766'e564'e362'e160, 0xef6e'ed6c'eb6a'e968},
+    {0xf776'f574'f372'f170, 0xff7e'fd7c'fb7a'f978},
+
+    {0x9e0c'9a09'9604'9200, 0x8e1c'8a18'8614'8211},
+    {0xbe2c'ba29'b624'b220, 0xae3c'aa38'a634'a231},
+    {0xde4c'da49'd644'd240, 0xce5c'ca58'c654'c251},
+    {0xfe6c'fa69'f664'f260, 0xee7c'ea78'e674'e271},
+    {0x1e8c'1a89'1684'1280, 0x0e9c'0a98'0694'0291},
+    {0x3eac'3aa9'36a4'32a0, 0x2ebc'2ab8'26b4'22b1},
+    {0x5ecc'5ac9'56c4'52c0, 0x4edc'4ad8'46d4'42d1},
+    {0x7eec'7ae9'76e4'72e0, 0x6efc'6af8'66f4'62f1},
+};
+
+// Filler pattern the destination registers are preloaded with before the tested instruction
+// runs. It is easy to spot, so anything the instruction left untouched still shows it.
+constexpr __v2du kUndisturbedResult = {
+    0x5555'5555'5555'5555,
+    0x5555'5555'5555'5555,
+};
+// All-ones pattern these tests expect in tail elements when the tail-agnostic policy is set.
+constexpr __v2du kAgnosticResult = {UINT64_MAX, UINT64_MAX};
+
+// Mask bits, laid out so they can be loaded into v0 and consumed through the v0.t operand.
+constexpr __v2du kMask = {
+    0xd5ad'd6b5'ad6b'b5ad,
+    0x6af7'57bb'deed'7bb5,
+};
+
+// Signature of the trampolines that execute one tested instruction and return to the caller.
+using ExecInsnFunc = void (*)();
+
+// Executes exec_insn with a fully prepared vector state and writes the destination group back.
+//
+// Steps performed by the asm statement, in order:
+//   1. v8..v15 are loaded from res, v16..v23 from src and v24..v31 from src + 128 bytes.
+//   2. v0 is loaded from kMask.
+//   3. vtype is installed with vsetvl and exec_insn is called through jalr.
+//   4. vstart and vl are read back, then v8..v15 are stored to res.
+//
+// Afterwards vstart is expected to be 0 and vl is expected to equal vlmax.
+//
+// NOTE(review): the 128-byte offset between the two source groups looks like it assumes
+// VLEN == 128 - confirm before running on hardware with wider vector registers.
+void RunTwoVectorArgsOneRes(ExecInsnFunc exec_insn,
+                            const __v2du* src,
+                            __v2du* res,
+                            uint64_t vtype,
+                            uint64_t vlmax) {
+  uint64_t vstart, vl;
+  // The mask register is always v0, and with v0 taken an 8-register group can only start at v8,
+  // v16 or v24. Hence v8..v15 holds the destination while v16..v23 and v24..v31 hold the sources.
+  //
+  // The statement is volatile because it has effects that no operand describes (it rewrites
+  // vtype, vl and vstart and calls exec_insn), so the compiler must not drop, merge or hoist it.
+  asm volatile(  // Load arguments and undisturbed result.
+      "vsetvli t0, zero, e64, m8, ta, ma\n\t"
+      "vle64.v v8, (%[res])\n\t"
+      "vle64.v v16, (%[src])\n\t"
+      "addi t0, %[src], 128\n\t"
+      "vle64.v v24, (t0)\n\t"
+      // Load mask.
+      "vsetvli t0, zero, e64, m1, ta, ma\n\t"
+      "vle64.v v0, (%[mask])\n\t"
+      // Execute tested instruction.
+      "vsetvl t0, zero, %[vtype]\n\t"
+      "jalr %[exec_insn]\n\t"
+      // Save vstart and vl just after insn execution for checks.
+      "csrr %[vstart], vstart\n\t"
+      "csrr %[vl], vl\n\t"
+      // Store the result.
+      "vsetvli t0, zero, e64, m8, ta, ma\n\t"
+      "vse64.v v8, (%[res])\n\t"
+      : [vstart] "=&r"(vstart), [vl] "=&r"(vl)
+      : [exec_insn] "r"(exec_insn),
+        [src] "r"(src),
+        [res] "r"(res),
+        [vtype] "r"(vtype),
+        [mask] "r"(&kMask)
+      : "t0",
+        "ra",
+        "v0",
+        "v8",
+        "v9",
+        "v10",
+        "v11",
+        "v12",
+        "v13",
+        "v14",
+        "v15",
+        "v16",
+        "v17",
+        "v18",
+        "v19",
+        "v20",
+        "v21",
+        "v22",
+        "v23",
+        "v24",
+        "v25",
+        "v26",
+        "v27",
+        "v28",
+        "v29",
+        "v30",
+        "v31",
+        "memory");
+  // Every vector instruction must set vstart to 0, but shouldn't touch vl.
+  EXPECT_EQ(vstart, 0);
+  EXPECT_EQ(vl, vlmax);
+}
+
+// Checks a vector reduction instruction for every vlmul, tail policy and mask policy.
+//
+//   exec_insn        - trampoline executing the unmasked form of the instruction.
+//   exec_masked_insn - trampoline executing the masked form of the instruction.
+//   source           - input data handed to RunTwoVectorArgsOneRes.
+//   expected_result  - one tuple per element type. Element 0 holds the expected vd[0] values for
+//                      exec_insn and element 1 those for exec_masked_insn, both indexed by vlmul.
+//                      sizeof(ExpectedResultType) selects the vsew that is tested.
+//
+// Configurations for which vsetvl reports vlmax == 0 are skipped, so the matching table entries
+// are never read. As vlmul=4 is reserved, index 4 of every table falls into that category.
+template <typename... ExpectedResultType>
+void TestVectorReductionInstruction(
+    ExecInsnFunc exec_insn,
+    ExecInsnFunc exec_masked_insn,
+    const __v2du (&source)[16],
+    std::tuple<const ExpectedResultType (&)[8],
+               const ExpectedResultType (&)[8]>... expected_result) {
+  // Runs one instruction form for a fixed vsew/vlmul under all four vta/vma combinations and
+  // compares the destination registers against expected_result (the expected vd[0] value).
+  auto Verify = [&source](ExecInsnFunc exec_insn,
+                          uint8_t vsew,
+                          uint8_t vlmul,
+                          const auto& expected_result) {
+    for (uint8_t vta = 0; vta < 2; ++vta) {
+      for (uint8_t vma = 0; vma < 2; ++vma) {
+        uint64_t vtype = (vma << 7) | (vta << 6) | (vsew << 3) | vlmul;
+        uint64_t vlmax = 0;
+        // Ask the hardware how many elements this vtype provides.
+        asm("vsetvl %0, zero, %1" : "=r"(vlmax) : "r"(vtype));
+        if (vlmax == 0) {
+          continue;
+        }
+
+        __v2du result[8];
+        // Set undisturbed result vector registers.
+        for (size_t index = 0; index < 8; ++index) {
+          memcpy(&result[index], &kUndisturbedResult, sizeof(result[index]));
+        }
+
+        // Use the caller-provided source rather than a fixed global table.
+        RunTwoVectorArgsOneRes(exec_insn, &source[0], &result[0], vtype, vlmax);
+
+        // Reduction instructions are unique in that they produce a scalar
+        // output to a single vector register as opposed to a register group.
+        // This allows us to take some short-cuts when validating:
+        //
+        // - The mask setting is only useful during computation, as the body
+        // of the destination is always only element 0, which will always be
+        // written to, regardless of mask setting.
+        // - The tail is guaranteed to be 1..VLEN/SEW, so the vlmul setting
+        // does not affect the elements that the tail policy applies to in the
+        // destination register.
+
+        // Verify that the destination register holds the reduction in the
+        // first element and the tail policy applies to the remaining.
+        __uint128_t expected_result_register;
+        if (vta) {
+          memcpy(&expected_result_register, &kAgnosticResult, sizeof(expected_result_register));
+        } else {
+          memcpy(&expected_result_register, &kUndisturbedResult, sizeof(expected_result_register));
+        }
+        // Clear the lowest element and put the expected reduction value in its place.
+        size_t vsew_bits = 8 << vsew;
+        expected_result_register = (expected_result_register >> vsew_bits) << vsew_bits;
+        expected_result_register |= expected_result;
+        EXPECT_TRUE(memcmp(&result[0], &expected_result_register, sizeof(result[0])) == 0);
+
+        // Verify all non-destination registers are undisturbed.
+        for (size_t index = 1; index < 8; ++index) {
+          EXPECT_TRUE(memcmp(&result[index], &kUndisturbedResult, sizeof(result[index])) == 0);
+        }
+      }
+    }
+  };
+
+  // For each vlmul, check the unmasked and then the masked form for every element type.
+  for (int vlmul = 0; vlmul < 8; vlmul++) {
+    ((Verify(exec_insn,
+             BitUtilLog2(sizeof(ExpectedResultType)),
+             vlmul,
+             std::get<0>(expected_result)[vlmul]),
+      Verify(exec_masked_insn,
+             BitUtilLog2(sizeof(ExpectedResultType)),
+             vlmul,
+             std::get<1>(expected_result)[vlmul])),
+     ...);
+  }
+}
+
+// Convenience overload for integer reductions. Takes the expected vd[0] tables for the unmasked
+// and masked instruction forms at each of the four element widths and forwards them, paired per
+// width, to the variadic overload.
+void TestVectorReductionInstruction(ExecInsnFunc exec_insn,
+                                    ExecInsnFunc exec_masked_insn,
+                                    const uint8_t (&expected_result_vd0_int8)[8],
+                                    const uint16_t (&expected_result_vd0_int16)[8],
+                                    const uint32_t (&expected_result_vd0_int32)[8],
+                                    const uint64_t (&expected_result_vd0_int64)[8],
+                                    const uint8_t (&expected_result_vd0_with_mask_int8)[8],
+                                    const uint16_t (&expected_result_vd0_with_mask_int16)[8],
+                                    const uint32_t (&expected_result_vd0_with_mask_int32)[8],
+                                    const uint64_t (&expected_result_vd0_with_mask_int64)[8],
+                                    const __v2du (&source)[16]) {
+  // std::tie on two lvalue arrays of T yields std::tuple<const T(&)[8], const T(&)[8]>, which is
+  // exactly the per-type argument the variadic overload deduces from.
+  TestVectorReductionInstruction(
+      exec_insn,
+      exec_masked_insn,
+      source,
+      std::tie(expected_result_vd0_int8, expected_result_vd0_with_mask_int8),
+      std::tie(expected_result_vd0_int16, expected_result_vd0_with_mask_int16),
+      std::tie(expected_result_vd0_int32, expected_result_vd0_with_mask_int32),
+      std::tie(expected_result_vd0_int64, expected_result_vd0_with_mask_int64));
+}
+
+// Trampoline for the unmasked instruction: runs vredsum.vs on v8, v16 and v24, then returns.
+// The function is naked, so the body consists solely of the asm and must issue ret itself.
+[[gnu::naked]] void ExecVredsum() {
+  asm("vredsum.vs v8,v16,v24\n\t"
+      "ret\n\t");
+}
+
+// Trampoline for the masked instruction: same operands as the unmasked form plus the v0.t mask
+// operand. The function is naked, so the body consists solely of the asm and must issue ret
+// itself.
+[[gnu::naked]] void ExecMaskedVredsum() {
+  asm("vredsum.vs v8,v16,v24,v0.t\n\t"
+      "ret\n\t");
+}
+
+// Checks vredsum.vs, unmasked and masked, against precomputed vd[0] values for 8-, 16-, 32- and
+// 64-bit elements. Each table has one entry per vlmul value 0..7.
+// NOTE(review): entries marked "unused" presumably belong to configurations for which vsetvl
+// reports vlmax == 0, which the verification helper skips - confirm.
+TEST(InlineAsmTestRiscv64, TestVredsum) {
+  TestVectorReductionInstruction(
+      ExecVredsum,
+      ExecMaskedVredsum,
+      // expected_result_vd0_int8
+      {242, 228, 200, 144, /* unused */ 0, 146, 44, 121},
+      // expected_result_vd0_int16
+      {0x0172, 0x82e4, 0x88c8, 0xa090, /* unused */ 0, 0x1300, 0xa904, 0xe119},
+      // expected_result_vd0_int32
+      {0xcb44'b932,
+       0x9407'71e4,
+       0xa70e'64c8,
+       0xd312'5090,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x1907'1300,
+       0xb713'ad09},
+      // expected_result_vd0_int64
+      {0xb32f'a926'9f1b'9511,
+       0x1f99'0d88'fb74'e962,
+       0xb92c'970e'74e8'52c4,
+       0xef4e'ad14'6aca'2888,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x2513'1f0e'1907'1300},
+      // expected_result_vd0_with_mask_int8
+      {39, 248, 142, 27, /* unused */ 0, 0, 154, 210},
+      // expected_result_vd0_with_mask_int16
+      {0x5f45, 0xc22f, 0x99d0, 0x98bf, /* unused */ 0, 0x1300, 0x1300, 0x4b15},
+      // expected_result_vd0_with_mask_int32
+      {0x2d38'1f29,
+       0x99a1'838a,
+       0x1989'ef5c,
+       0x9cf4'4aa1,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x1907'1300,
+       0x1907'1300},
+      // expected_result_vd0_with_mask_int64
+      {0x2513'1f0e'1907'1300,
+       0x917c'8370'7560'6751,
+       0x4e56'3842'222a'0c13,
+       0xc833'9e0e'73df'49b5,
+       /* unused */ 0,
+       /* unused */ 0,
+       /* unused */ 0,
+       0x2513'1f0e'1907'1300},
+      kVectorCalculationsSource);
+}
+
+}  // namespace
diff --git a/tests/run_host_tests.mk b/tests/run_host_tests.mk
index 29906c6d..ab6bca76 100644
--- a/tests/run_host_tests.mk
+++ b/tests/run_host_tests.mk
@@ -115,10 +115,29 @@ endef
 
 ifeq ($(BUILD_BERBERIS_RISCV64_TO_X86_64),true)
 
-$(eval $(call add_test,berberis_ndk_program_tests,\
+# berberis_ndk_program_tests
+
+$(eval $(call add_test,berberis_ndk_program_tests_interpret_only,\
+	run_test_x86_64_riscv64,\
+	$(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\
+	BERBERIS_MODE=interpret-only))
+
+$(eval $(call add_test,berberis_ndk_program_tests_lite_translate_or_interpret,\
 	run_test_x86_64_riscv64,\
 	$(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\
-	))
+	BERBERIS_MODE=lite-translate-or-interpret))
+
+$(eval $(call add_test,berberis_ndk_program_tests_heavy_optimize_or_interpret,\
+	run_test_x86_64_riscv64,\
+	$(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\
+	BERBERIS_MODE=heavy-optimize-or-interpret))
+
+$(eval $(call add_test,berberis_ndk_program_tests_two_gear,\
+	run_test_x86_64_riscv64,\
+	$(TARGET_OUT_TESTCASES)/berberis_ndk_program_tests_static.native_bridge/x86_64/berberis_ndk_program_tests_static,\
+	BERBERIS_MODE=two-gear))
+
+# berberis_host_tests
 
 $(eval $(call add_test,berberis_host_tests,\
 	run_test,\
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2024-05-07 23:15:48 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2024-05-07 23:15:48 +0000
commit	c7caefd0df41378801e48aaa3cda1270c312762a (patch)
tree	306c083dbb84bd7ee8529aaa3d2aad2c5b6c9dc2
parent	910c1c3005863b02f71278fb61028acd8f399e51 (diff)
parent	59addd2f7e65b35643dbe541cbd7a20d0b5e90df (diff)
download	binary_translation-c7caefd0df41378801e48aaa3cda1270c312762a.tar.gz