-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64] Improve select dagcombine #169925
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-aarch64 Author: Graham Hunter (huntergr-arm) Changes: An AnyOf reduction (aka vector.reduce.or) with a fixed-width vector is canonicalized to a bitcast of the mask vector to an integer of the same overall size, which is then compared against zero. If the scalar result of the bitcast is smaller than the element size of vectors being selected, we often end up with suboptimal codegen. This fixes the main cases, removing scalarized code. Full diff: https://github.com/llvm/llvm-project/pull/169925.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6072fd9d8f242..e6872dfe995d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26965,6 +26965,11 @@ static SDValue performSelectCombine(SDNode *N,
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
+ // Avoid creating vectors with excessive VFs for small types.
+ if (DCI.isBeforeLegalize() &&
+ SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
+ NumMaskElts = ResVT.getVectorNumElements();
+
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ca4719d9b6bf..8ad9ea3b7a8d5 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,15 @@
define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: ldr x11, [sp]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x8, x10, [sp, #8]
-; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: csel x9, x4, x11, ne
-; CHECK-NEXT: stp x9, x8, [x10, #16]
-; CHECK-NEXT: csel x8, x3, x7, ne
-; CHECK-NEXT: csel x9, x2, x6, ne
-; CHECK-NEXT: stp x9, x8, [x10]
+; CHECK-NEXT: ldp x8, x9, [sp, #8]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: ldr x10, [sp]
+; CHECK-NEXT: csel x8, x5, x8, eq
+; CHECK-NEXT: csel x10, x4, x10, eq
+; CHECK-NEXT: stp x10, x8, [x9, #16]
+; CHECK-NEXT: csel x8, x3, x7, eq
+; CHECK-NEXT: csel x10, x2, x6, eq
+; CHECK-NEXT: stp x10, x8, [x9]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
@@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: ldr x10, [sp, #16]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: ldp x8, x9, [sp]
-; CHECK-NEXT: csel x11, x2, x6, ne
-; CHECK-NEXT: str x11, [x10]
-; CHECK-NEXT: csel x8, x4, x8, ne
-; CHECK-NEXT: stur x8, [x10, #12]
-; CHECK-NEXT: csel x8, x5, x9, ne
-; CHECK-NEXT: csel x9, x3, x7, ne
-; CHECK-NEXT: str w8, [x10, #20]
-; CHECK-NEXT: str w9, [x10, #8]
+; CHECK-NEXT: ldp x8, x10, [sp]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: ldr x9, [sp, #16]
+; CHECK-NEXT: csel x11, x2, x6, eq
+; CHECK-NEXT: csel x8, x4, x8, eq
+; CHECK-NEXT: str x11, [x9]
+; CHECK-NEXT: stur x8, [x9, #12]
+; CHECK-NEXT: csel x8, x5, x10, eq
+; CHECK-NEXT: csel x10, x3, x7, eq
+; CHECK-NEXT: str w8, [x9, #20]
+; CHECK-NEXT: str w10, [x9, #8]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
new file mode 100644
index 0000000000000..43abb6ac9b944
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+target triple = "aarch64-linux-gnu"
+
+;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
+;; integer of a bitwidth equal to the number of lanes being reduced, then
+;; compared against zero. To select between vectors for NEON, we then need to
+;; broadcast the result, but we must be careful when the bitwidth of the scalar
+;; result is smaller than the element size of the vectors being selected. We
+;; don't want to end up with scalarization.
+
+define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: any_of_select_vf4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+ %cmp.bc = bitcast <4 x i1> %cmp to i4
+ %cmp.bc.not = icmp eq i4 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: any_of_select_vf2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: dup v0.2d, x8
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <2 x i64> %mask, zeroinitializer
+ %cmp.bc = bitcast <2 x i1> %cmp to i2
+ %cmp.bc.not = icmp eq i2 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %res
+}
Review comment on the following lines of the patch:

    // Avoid creating vectors with excessive VFs for small types.
    if (DCI.isBeforeLegalize() &&
        SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would think we always want to use NumMaskElts before type legalisation, that would also improve e.g.
define <32 x i8> @any_of_select_vf8(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
%cmp = icmp slt <32 x i8> %mask, zeroinitializer
%cmp.bc = bitcast <32 x i1> %cmp to i32
%cmp.bc.not = icmp eq i32 %cmp.bc, 0
%res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
ret <32 x i8> %res
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Always using the elementcount of ResVT before legalization does seem to be better, yes. Thanks.
An AnyOf reduction (aka vector.reduce.or) with a fixed-width vector is canonicalized to a bitcast of the mask vector to an integer of the same overall size, which is then compared against zero.
If the scalar result of the bitcast is smaller than the element size of vectors being selected, we often end up with suboptimal codegen. This fixes the main cases, removing scalarized code.