Skip to content

Commit 7595483

Browse files
committed
[LoadStoreVectorizer] Allow redundant stores
1 parent 442f853 commit 7595483

File tree

3 files changed

+124
-56
lines changed

3 files changed

+124
-56
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 16 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -316,14 +316,12 @@ class Vectorizer {
316316
/// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317317
/// instructions.
318318
///
319-
/// The map ChainElemOffsets must contain all of the elements in
320-
/// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321-
/// address. It's ok if it contains additional entries.
319+
/// The map ChainSet must contain all of the elements in
320+
/// [ChainBegin, ChainElem]. It's ok if it contains additional entries.
322321
template <bool IsLoadChain>
323-
bool isSafeToMove(
324-
Instruction *ChainElem, Instruction *ChainBegin,
325-
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
326-
BatchAAResults &BatchAA);
322+
bool isSafeToMove(Instruction *ChainElem, Instruction *ChainBegin,
323+
const DenseSet<Instruction *> &ChainSet,
324+
BatchAAResults &BatchAA);
327325

328326
/// Merges the equivalence classes if they have underlying objects that differ
329327
/// by one level of indirection (i.e., one is a getelementptr and the other is
@@ -540,9 +538,9 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
540538
// We know that elements in the chain with nonoverlapping offsets can't
541539
// alias, but AA may not be smart enough to figure this out. Use a
542540
// hashtable.
543-
DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
541+
DenseSet<Instruction *> ChainSet;
544542
for (const auto &E : C)
545-
ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
543+
ChainSet.insert(E.Inst);
546544

547545
// Across a single invocation of this function the IR is not changing, so
548546
// using a batched Alias Analysis is safe and can reduce compile time.
@@ -573,8 +571,8 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
573571
SmallVector<ChainElem, 1> NewChain;
574572
NewChain.emplace_back(*ChainBegin);
575573
for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
576-
if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
577-
ChainOffsets, BatchAA)) {
574+
if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst, ChainSet,
575+
BatchAA)) {
578576
LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
579577
<< *ChainIt->Inst << " into " << *ChainBegin->Inst
580578
<< "\n");
@@ -1037,10 +1035,9 @@ bool Vectorizer::vectorizeChain(Chain &C) {
10371035
}
10381036

10391037
template <bool IsLoadChain>
1040-
bool Vectorizer::isSafeToMove(
1041-
Instruction *ChainElem, Instruction *ChainBegin,
1042-
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1043-
BatchAAResults &BatchAA) {
1038+
bool Vectorizer::isSafeToMove(Instruction *ChainElem, Instruction *ChainBegin,
1039+
const DenseSet<Instruction *> &ChainSet,
1040+
BatchAAResults &BatchAA) {
10441041
LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
10451042
<< *ChainBegin << ")\n");
10461043

@@ -1066,10 +1063,6 @@ bool Vectorizer::isSafeToMove(
10661063
return BasicBlock::iterator(ChainBegin);
10671064
}());
10681065

1069-
const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1070-
const unsigned ChainElemSize =
1071-
DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1072-
10731066
for (; BBIt != BBItEnd; ++BBIt) {
10741067
Instruction *I = &*BBIt;
10751068

@@ -1084,39 +1077,10 @@ bool Vectorizer::isSafeToMove(
10841077
if (!IsLoadChain && isInvariantLoad(I))
10851078
continue;
10861079

1087-
// If I is in the chain, we can tell whether it aliases ChainIt by checking
1088-
// what offset ChainIt accesses. This may be better than AA is able to do.
1089-
//
1090-
// We should really only have duplicate offsets for stores (the duplicate
1091-
// loads should be CSE'ed), but in case we have a duplicate load, we'll
1092-
// split the chain so we don't have to handle this case specially.
1093-
if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1094-
// I and ChainElem overlap if:
1095-
// - I and ChainElem have the same offset, OR
1096-
// - I's offset is less than ChainElem's, but I touches past the
1097-
// beginning of ChainElem, OR
1098-
// - ChainElem's offset is less than I's, but ChainElem touches past the
1099-
// beginning of I.
1100-
const APInt &IOffset = OffsetIt->second;
1101-
unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1102-
if (IOffset == ChainElemOffset ||
1103-
(IOffset.sle(ChainElemOffset) &&
1104-
(IOffset + IElemSize).sgt(ChainElemOffset)) ||
1105-
(ChainElemOffset.sle(IOffset) &&
1106-
(ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1107-
LLVM_DEBUG({
1108-
// Double check that AA also sees this alias. If not, we probably
1109-
// have a bug.
1110-
ModRefInfo MR =
1111-
BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1112-
assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1113-
dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1114-
});
1115-
return false; // We found an aliasing instruction; bail.
1116-
}
1117-
1118-
continue; // We're confident there's no alias.
1119-
}
1080+
// Allow on-chain aliasing because write-order is preserved when stores are
1081+
// vectorized.
1082+
if (ChainSet.count(I))
1083+
continue;
11201084

11211085
LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
11221086
ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));

llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@ define amdgpu_kernel void @no_crash(i32 %arg) {
1010
; GCN-SAME: i32 [[ARG:%.*]]) {
1111
; GCN-NEXT: [[TEMP2:%.*]] = add i32 [[ARG]], 14
1212
; GCN-NEXT: [[TEMP3:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0:[0-9]+]], i32 0, i32 [[TEMP2]]
13-
; GCN-NEXT: [[TEMP4:%.*]] = add i32 [[ARG]], 15
14-
; GCN-NEXT: [[TEMP5:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0]], i32 0, i32 [[TEMP4]]
1513
; GCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(3) [[TEMP3]], align 4
16-
; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
17-
; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
1814
; GCN-NEXT: ret void
1915
;
2016
%temp2 = add i32 %arg, 14
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
3+
4+
define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
5+
; CHECK-LABEL: define void @onevec(
6+
; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
7+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
8+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
9+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
10+
; CHECK-NEXT: store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
11+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
12+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
13+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
14+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
15+
; CHECK-NEXT: store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
16+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
17+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
18+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
19+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
20+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
21+
; CHECK-NEXT: store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
22+
; CHECK-NEXT: ret void
23+
;
24+
store <1 x i32> %sd0, ptr %ptr, align 4
25+
store i32 %sd1, ptr %ptr, align 4
26+
27+
%gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
28+
store i32 %sd2, ptr %gep1, align 4
29+
store <1 x i32> %sd3, ptr %gep1, align 4
30+
31+
%gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
32+
store <1 x i32> %sd4, ptr %gep2, align 4
33+
store <1 x i32> %sd5, ptr %gep2, align 4
34+
ret void
35+
}
36+
37+
define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
38+
; CHECK-LABEL: define void @test(
39+
; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
40+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
41+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
42+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
43+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
44+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
45+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
46+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
47+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
48+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
49+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
50+
; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
51+
; CHECK-NEXT: ret void
52+
;
53+
store i32 %sd0, ptr %ptr, align 4
54+
%gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
55+
store <2 x i32> %sd1, ptr %gep1, align 4
56+
%gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
57+
store <2 x i32> %sd2, ptr %gep2, align 4
58+
%gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
59+
store i32 %sd3, ptr %gep3, align 4
60+
ret void
61+
}
62+
63+
define void @vect_zext_bitcast_i8_st4_to_i32_idx(ptr addrspace(1) %arg1, i32 %base, i32 %sd1, i32 %sd2, i32 %sd25, i32 %sd3, i32 %sd4) {
64+
; CHECK-LABEL: define void @vect_zext_bitcast_i8_st4_to_i32_idx(
65+
; CHECK-SAME: ptr addrspace(1) [[ARG1:%.*]], i32 [[BASE:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], i32 [[SD25:%.*]], i32 [[SD3:%.*]], i32 [[SD4:%.*]]) {
66+
; CHECK-NEXT: [[ADD1:%.*]] = add nuw i32 [[BASE]], 0
67+
; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[ADD1]] to i64
68+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT1]]
69+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[SD1]], i32 0
70+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SD2]], i32 1
71+
; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[GEP1]], align 4
72+
; CHECK-NEXT: [[ADD25:%.*]] = add nuw i32 [[BASE]], 6
73+
; CHECK-NEXT: [[ZEXT25:%.*]] = zext i32 [[ADD25]] to i64
74+
; CHECK-NEXT: [[GEP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT25]]
75+
; CHECK-NEXT: store i32 [[SD25]], ptr addrspace(1) [[GEP25]], align 4
76+
; CHECK-NEXT: [[ADD3:%.*]] = add nuw i32 [[BASE]], 8
77+
; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[ADD3]] to i64
78+
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT3]]
79+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[SD3]], i32 0
80+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SD4]], i32 1
81+
; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr addrspace(1) [[GEP3]], align 4
82+
; CHECK-NEXT: ret void
83+
;
84+
%add1 = add nuw i32 %base, 0
85+
%zext1 = zext i32 %add1 to i64
86+
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext1
87+
store i32 %sd1, ptr addrspace(1) %gep1, align 4
88+
%add2 = add nuw i32 %base, 4
89+
%zext2 = zext i32 %add2 to i64
90+
%gep2 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext2
91+
store i32 %sd2, ptr addrspace(1) %gep2, align 4
92+
93+
; A store with 2-byte overlap breaks continuity.
94+
%add25 = add nuw i32 %base, 6
95+
%zext25 = zext i32 %add25 to i64
96+
%gep25 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext25
97+
store i32 %sd25, ptr addrspace(1) %gep25, align 4
98+
99+
%add3 = add nuw i32 %base, 8
100+
%zext3 = zext i32 %add3 to i64
101+
%gep3 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext3
102+
store i32 %sd3, ptr addrspace(1) %gep3, align 4
103+
%add4 = add nuw i32 %base, 12
104+
%zext4 = zext i32 %add4 to i64
105+
%gep4 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext4
106+
store i32 %sd4, ptr addrspace(1) %gep4, align 4
107+
ret void
108+
}

0 commit comments

Comments
 (0)