Conversation

cmc-rep (Contributor) commented Nov 28, 2025

No description provided.

llvmbot (Member) commented Nov 28, 2025

@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-amdgpu

Author: Gang Chen (cmc-rep)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/169946.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+4-33)
  • (modified) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll (-4)
  • (added) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll (+108)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c28314f6ab124..3ba901c33dbcc 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1084,39 +1084,10 @@ bool Vectorizer::isSafeToMove(
     if (!IsLoadChain && isInvariantLoad(I))
       continue;
 
-    // If I is in the chain, we can tell whether it aliases ChainIt by checking
-    // what offset ChainIt accesses.  This may be better than AA is able to do.
-    //
-    // We should really only have duplicate offsets for stores (the duplicate
-    // loads should be CSE'ed), but in case we have a duplicate load, we'll
-    // split the chain so we don't have to handle this case specially.
-    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
-      // I and ChainElem overlap if:
-      //   - I and ChainElem have the same offset, OR
-      //   - I's offset is less than ChainElem's, but I touches past the
-      //     beginning of ChainElem, OR
-      //   - ChainElem's offset is less than I's, but ChainElem touches past the
-      //     beginning of I.
-      const APInt &IOffset = OffsetIt->second;
-      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
-      if (IOffset == ChainElemOffset ||
-          (IOffset.sle(ChainElemOffset) &&
-           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
-          (ChainElemOffset.sle(IOffset) &&
-           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
-        LLVM_DEBUG({
-          // Double check that AA also sees this alias.  If not, we probably
-          // have a bug.
-          ModRefInfo MR =
-              BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
-          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
-          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
-        });
-        return false; // We found an aliasing instruction; bail.
-      }
-
-      continue; // We're confident there's no alias.
-    }
+    // Allow on-chain aliasing because write-order is preserved when stores are
+    // vectorized.
+    if (ChainOffsets.count(I))
+      continue;
 
     LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
     ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
index 57da5976b3cfa..6f3c2fc5f387e 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -10,11 +10,7 @@ define amdgpu_kernel void @no_crash(i32 %arg) {
 ; GCN-SAME: i32 [[ARG:%.*]]) {
 ; GCN-NEXT:    [[TEMP2:%.*]] = add i32 [[ARG]], 14
 ; GCN-NEXT:    [[TEMP3:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0:[0-9]+]], i32 0, i32 [[TEMP2]]
-; GCN-NEXT:    [[TEMP4:%.*]] = add i32 [[ARG]], 15
-; GCN-NEXT:    [[TEMP5:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0]], i32 0, i32 [[TEMP4]]
 ; GCN-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(3) [[TEMP3]], align 4
-; GCN-NEXT:    store i32 0, ptr addrspace(3) [[TEMP5]], align 4
-; GCN-NEXT:    store i32 0, ptr addrspace(3) [[TEMP5]], align 4
 ; GCN-NEXT:    ret void
 ;
   %temp2 = add i32 %arg, 14
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
new file mode 100644
index 0000000000000..cd3e3bded681a
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
+; CHECK-LABEL: define void @onevec(
+; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
+; CHECK-NEXT:    store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
+; CHECK-NEXT:    store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
+; CHECK-NEXT:    store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  store <1 x i32> %sd0, ptr %ptr, align 4
+  store i32 %sd1, ptr %ptr, align 4
+
+  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
+  store i32 %sd2, ptr %gep1, align 4
+  store <1 x i32> %sd3, ptr %gep1, align 4
+
+  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
+  store <1 x i32> %sd4, ptr %gep2, align 4
+  store <1 x i32> %sd5, ptr %gep2, align 4
+  ret void
+}
+
+define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %sd0, ptr %ptr, align 4
+  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+  store <2 x i32> %sd1, ptr %gep1, align 4
+  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
+  store <2 x i32> %sd2, ptr %gep2, align 4
+  %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
+  store i32 %sd3, ptr %gep3, align 4
+  ret void
+}
+
+define void @vect_zext_bitcast_i8_st4_to_i32_idx(ptr addrspace(1) %arg1, i32 %base, i32 %sd1, i32 %sd2, i32 %sd25, i32 %sd3, i32 %sd4) {
+; CHECK-LABEL: define void @vect_zext_bitcast_i8_st4_to_i32_idx(
+; CHECK-SAME: ptr addrspace(1) [[ARG1:%.*]], i32 [[BASE:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], i32 [[SD25:%.*]], i32 [[SD3:%.*]], i32 [[SD4:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = add nuw i32 [[BASE]], 0
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[SD1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SD2]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[ADD25:%.*]] = add nuw i32 [[BASE]], 6
+; CHECK-NEXT:    [[ZEXT25:%.*]] = zext i32 [[ADD25]] to i64
+; CHECK-NEXT:    [[GEP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT25]]
+; CHECK-NEXT:    store i32 [[SD25]], ptr addrspace(1) [[GEP25]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = add nuw i32 [[BASE]], 8
+; CHECK-NEXT:    [[ZEXT3:%.*]] = zext i32 [[ADD3]] to i64
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[SD3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SD4]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT:    ret void
+;
+  %add1 = add nuw i32 %base, 0
+  %zext1 = zext i32 %add1 to i64
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext1
+  store i32 %sd1, ptr addrspace(1) %gep1, align 4
+  %add2 = add nuw i32 %base, 4
+  %zext2 = zext i32 %add2 to i64
+  %gep2 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext2
+  store i32 %sd2, ptr addrspace(1) %gep2, align 4
+
+  ; A store with 2-byte overlap breaks continuity.
+  %add25 = add nuw i32 %base, 6
+  %zext25 = zext i32 %add25 to i64
+  %gep25 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext25
+  store i32 %sd25, ptr addrspace(1) %gep25, align 4
+
+  %add3 = add nuw i32 %base, 8
+  %zext3 = zext i32 %add3 to i64
+  %gep3 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext3
+  store i32 %sd3, ptr addrspace(1) %gep3, align 4
+  %add4 = add nuw i32 %base, 12
+  %zext4 = zext i32 %add4 to i64
+  %gep4 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext4
+  store i32 %sd4, ptr addrspace(1) %gep4, align 4
+  ret void
+}
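
In effect, the simplified check in isSafeToMove no longer splits a chain that contains duplicate or overlapping store offsets: the whole chain stays vectorizable, and because elements are inserted in chain order, the last write to a given lane supplies its value. Below is a minimal IR sketch of that behavior, modeled on the @test case above; the names @before, @after, %p, %a, %b, %c are illustrative, and the exact instruction sequence the pass emits may differ.

  ; Input: what a late transform might leave behind. The second store to %p
  ; makes the first one dead.
  define void @before(ptr %p, i32 %a, i32 %b, i32 %c) {
    store i32 %a, ptr %p, align 4
    %q = getelementptr inbounds i8, ptr %p, i32 4
    store i32 %b, ptr %q, align 4
    store i32 %c, ptr %p, align 4      ; overwrites %a
    ret void
  }

  ; After -passes=load-store-vectorizer (sketch): a single <2 x i32> store,
  ; where lane 0 takes %c because it is written last in the chain.
  define void @after(ptr %p, i32 %a, i32 %b, i32 %c) {
    %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
    %v1 = insertelement <2 x i32> %v0, i32 %c, i32 0
    %v2 = insertelement <2 x i32> %v1, i32 %b, i32 1
    store <2 x i32> %v2, ptr %p, align 4
    ret void
  }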

github-actions bot commented Nov 28, 2025

🐧 Linux x64 Test Results

  • 186684 tests passed
  • 4887 tests skipped

@cmc-rep cmc-rep force-pushed the lsv-redundant-stores branch from 6b29c2d to 7595483 Compare November 28, 2025 19:28
@cmc-rep cmc-rep force-pushed the lsv-redundant-stores branch from a2e0a85 to e50e04f Compare November 29, 2025 21:55
@cmc-rep cmc-rep requested review from arsenm and dakersnar November 29, 2025 22:43
arsenm (Contributor) commented Nov 30, 2025

Why? The vectorizer should not be attempting to handle patterns that should already have been optimized away.

cmc-rep (Contributor, Author) commented Nov 30, 2025

> Why? The vectorizer should not be attempting to handle patterns that should already have been optimized away.

We have a downstream need for this: a major IR transformation runs after the last DSE pass and creates new dead-store opportunities. We either need to add another DSE pass somewhere, or give the vectorizer this DSE-like capability. The overhead of adding it on top of what we already added for redundant loads is minimal.
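
For illustration, the kind of pattern the late transform leaves behind looks roughly like the sketch below (placeholder names %p, %x, %y, %z; see @onevec and @test in the new test file for the cases actually exercised):

  define void @redundant(ptr %p, i32 %x, i32 %y, i32 %z) {
    store i32 %x, ptr %p, align 4    ; dead: overwritten by the next store
    store i32 %y, ptr %p, align 4
    %q = getelementptr inbounds i8, ptr %p, i32 4
    store i32 %z, ptr %q, align 4
    ret void
  }

Previously the duplicate offset made isSafeToMove report an in-chain alias and bail out of the chain; with this change the chain is vectorized into a single store whose first lane carries %y, which is the same result a DSE run followed by vectorization would produce.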
