about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Phoebe Wang <phoebe.wang@intel.com> 2024-05-06 10:59:44 +0800
committer Tom Stellard <tstellar@redhat.com> 2024-05-08 20:14:03 -0700
commit dfc89f89ed14ebf22effe9dd9605608a975c4ed8 (patch)
tree 34ee1da0a0cd9c0d577507018c0f69985a0a2a6b
parent 047cd915b86a4f35543ad4e691953aaa5a91c4fe (diff)
download llvm-dfc89f89ed14ebf22effe9dd9605608a975c4ed8.tar.gz
[X86][FP16] Do not create VBROADCAST_LOAD for f16 without AVX2 (#91125)
AVX doesn't provide 16-bit BROADCAST instruction. Fixes #91005
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 2
-rw-r--r-- llvm/test/CodeGen/X86/pr91005.ll 40
2 files changed, 41 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c572b27fe401..3e4ecab8443a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7295,7 +7295,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 ||
(ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
- CVT == MVT::f16 ||
+ (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
diff --git a/llvm/test/CodeGen/X86/pr91005.ll b/llvm/test/CodeGen/X86/pr91005.ll
new file mode 100644
index 000000000000..16b78bf1e7e1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr91005.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s
+
+define void @PR91005(ptr %0) minsize { ; reproducer for issue #91005 (per the test file name); the function carries the minsize attribute
+; CHECK-LABEL: PR91005:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: .LBB0_2: # %common.ret
+; CHECK-NEXT: retq
+ %2 = bitcast <2 x half> poison to <2 x i16> ; reinterpret the (poison) half lanes as raw 16-bit integers
+ %3 = icmp eq <2 x i16> %2, <i16 31744, i16 31744> ; 31744 == 0x7C00, the IEEE half-precision bit pattern of +infinity
+ br i1 poison, label %4, label %common.ret ; the branch condition is poison, so no particular successor is implied by the IR
+
+common.ret: ; preds = %4, %1
+ ret void
+
+4: ; preds = %1
+ %5 = select <2 x i1> %3, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> zeroinitializer ; 0xH3C00 is half 1.0: lanes are 1.0 where the compare matched, else 0.0
+ %6 = fmul <2 x half> %5, zeroinitializer ; multiply both lanes by 0.0
+ %7 = fsub <2 x half> %6, zeroinitializer ; subtract 0.0 from both lanes
+ %8 = extractelement <2 x half> %7, i64 0 ; only lane 0 is used from here on
+ store half %8, ptr %0, align 2 ; write the single half result through the pointer argument
+ br label %common.ret
+}
+
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) ; NOTE(review): no call to this intrinsic appears in the function body shown in this test