-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64] Improve select dagcombine #169925
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-aarch64 Author: Graham Hunter (huntergr-arm) Changes: An AnyOf reduction (aka vector.reduce.or) with a fixed-width vector is canonicalized to a bitcast of the mask vector to an integer of the same overall size, which is then compared against zero. If the scalar result of the bitcast is smaller than the element size of vectors being selected, we often end up with suboptimal codegen. This fixes the main cases, removing scalarized code. Full diff: https://github.com/llvm/llvm-project/pull/169925.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6072fd9d8f242..e6872dfe995d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26965,6 +26965,11 @@ static SDValue performSelectCombine(SDNode *N,
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
+ // Avoid creating vectors with excessive VFs for small types.
+ if (DCI.isBeforeLegalize() &&
+ SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
+ NumMaskElts = ResVT.getVectorNumElements();
+
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ca4719d9b6bf..8ad9ea3b7a8d5 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,15 @@
define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: ldr x11, [sp]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x8, x10, [sp, #8]
-; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: csel x9, x4, x11, ne
-; CHECK-NEXT: stp x9, x8, [x10, #16]
-; CHECK-NEXT: csel x8, x3, x7, ne
-; CHECK-NEXT: csel x9, x2, x6, ne
-; CHECK-NEXT: stp x9, x8, [x10]
+; CHECK-NEXT: ldp x8, x9, [sp, #8]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: ldr x10, [sp]
+; CHECK-NEXT: csel x8, x5, x8, eq
+; CHECK-NEXT: csel x10, x4, x10, eq
+; CHECK-NEXT: stp x10, x8, [x9, #16]
+; CHECK-NEXT: csel x8, x3, x7, eq
+; CHECK-NEXT: csel x10, x2, x6, eq
+; CHECK-NEXT: stp x10, x8, [x9]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
@@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: ldr x10, [sp, #16]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: ldp x8, x9, [sp]
-; CHECK-NEXT: csel x11, x2, x6, ne
-; CHECK-NEXT: str x11, [x10]
-; CHECK-NEXT: csel x8, x4, x8, ne
-; CHECK-NEXT: stur x8, [x10, #12]
-; CHECK-NEXT: csel x8, x5, x9, ne
-; CHECK-NEXT: csel x9, x3, x7, ne
-; CHECK-NEXT: str w8, [x10, #20]
-; CHECK-NEXT: str w9, [x10, #8]
+; CHECK-NEXT: ldp x8, x10, [sp]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: ldr x9, [sp, #16]
+; CHECK-NEXT: csel x11, x2, x6, eq
+; CHECK-NEXT: csel x8, x4, x8, eq
+; CHECK-NEXT: str x11, [x9]
+; CHECK-NEXT: stur x8, [x9, #12]
+; CHECK-NEXT: csel x8, x5, x10, eq
+; CHECK-NEXT: csel x10, x3, x7, eq
+; CHECK-NEXT: str w8, [x9, #20]
+; CHECK-NEXT: str w10, [x9, #8]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
new file mode 100644
index 0000000000000..43abb6ac9b944
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+target triple = "aarch64-linux-gnu"
+
+;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
+;; integer of a bitwidth equal to the number of lanes being reduced, then
+;; compared against zero. To select between vectors for NEON, we then need to
+;; broadcast the result, but we must be careful when the bitwidth of the scalar
+;; result is smaller than the element size of the vectors being selected. We
+;; don't want to end up with scalarization.
+
+define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: any_of_select_vf4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+ %cmp.bc = bitcast <4 x i1> %cmp to i4
+ %cmp.bc.not = icmp eq i4 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: any_of_select_vf2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: dup v0.2d, x8
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <2 x i64> %mask, zeroinitializer
+ %cmp.bc = bitcast <2 x i1> %cmp to i2
+ %cmp.bc.not = icmp eq i2 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %res
+}
Review comment on the following lines of the patch:

    // Avoid creating vectors with excessive VFs for small types.
    if (DCI.isBeforeLegalize() &&
        SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would think we always want to use NumMaskElts before type legalisation, that would also improve e.g.
define <32 x i8> @any_of_select_vf8(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
%cmp = icmp slt <32 x i8> %mask, zeroinitializer
%cmp.bc = bitcast <32 x i1> %cmp to i32
%cmp.bc.not = icmp eq i32 %cmp.bc, 0
%res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
ret <32 x i8> %res
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Always using the elementcount of ResVT before legalization does seem to be better, yes. Thanks.
An AnyOf reduction (aka vector.reduce.or) with a fixed-width vector is canonicalized to a bitcast of the mask vector to an integer of the same overall size, which is then compared against zero.
If the scalar result of the bitcast is smaller than the element size of vectors being selected, we often end up with suboptimal codegen. This fixes the main cases, removing scalarized code.