[LoadStoreVectorizer] Allow redundant stores #169946
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Gang Chen (cmc-rep)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/169946.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c28314f6ab124..3ba901c33dbcc 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1084,39 +1084,10 @@ bool Vectorizer::isSafeToMove(
if (!IsLoadChain && isInvariantLoad(I))
continue;
- // If I is in the chain, we can tell whether it aliases ChainIt by checking
- // what offset ChainIt accesses. This may be better than AA is able to do.
- //
- // We should really only have duplicate offsets for stores (the duplicate
- // loads should be CSE'ed), but in case we have a duplicate load, we'll
- // split the chain so we don't have to handle this case specially.
- if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
- // I and ChainElem overlap if:
- // - I and ChainElem have the same offset, OR
- // - I's offset is less than ChainElem's, but I touches past the
- // beginning of ChainElem, OR
- // - ChainElem's offset is less than I's, but ChainElem touches past the
- // beginning of I.
- const APInt &IOffset = OffsetIt->second;
- unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
- if (IOffset == ChainElemOffset ||
- (IOffset.sle(ChainElemOffset) &&
- (IOffset + IElemSize).sgt(ChainElemOffset)) ||
- (ChainElemOffset.sle(IOffset) &&
- (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
- LLVM_DEBUG({
- // Double check that AA also sees this alias. If not, we probably
- // have a bug.
- ModRefInfo MR =
- BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
- assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
- dbgs() << "LSV: Found alias in chain: " << *I << "\n";
- });
- return false; // We found an aliasing instruction; bail.
- }
-
- continue; // We're confident there's no alias.
- }
+ // Allow on-chain aliasing because write-order is preserved when stores are
+ // vectorized.
+ if (ChainOffsets.count(I))
+ continue;
LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
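
For reference, the deleted check rejected any pair of chain members whose byte ranges overlap: same offset, or either member starting at or before the other and extending past its start. A minimal standalone sketch of that predicate, with plain integers in place of APInt and a hypothetical function name:

```cpp
#include <cstdint>

// Overlap test between chain member I at IOffset (IElemSize bytes) and
// ChainElem at ChainElemOffset (ChainElemSize bytes), mirroring the
// sle/sgt comparisons in the removed code.
static bool chainMembersOverlap(int64_t IOffset, int64_t IElemSize,
                                int64_t ChainElemOffset,
                                int64_t ChainElemSize) {
  return IOffset == ChainElemOffset ||
         (IOffset <= ChainElemOffset &&
          IOffset + IElemSize > ChainElemOffset) ||
         (ChainElemOffset <= IOffset &&
          ChainElemOffset + ChainElemSize > IOffset);
}
```

The patch stops treating such on-chain overlaps as a reason to bail: per the new comment, the vectorized store preserves write order, so a later chain element simply overwrites an earlier one.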
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
index 57da5976b3cfa..6f3c2fc5f387e 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -10,11 +10,7 @@ define amdgpu_kernel void @no_crash(i32 %arg) {
; GCN-SAME: i32 [[ARG:%.*]]) {
; GCN-NEXT: [[TEMP2:%.*]] = add i32 [[ARG]], 14
; GCN-NEXT: [[TEMP3:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0:[0-9]+]], i32 0, i32 [[TEMP2]]
-; GCN-NEXT: [[TEMP4:%.*]] = add i32 [[ARG]], 15
-; GCN-NEXT: [[TEMP5:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0]], i32 0, i32 [[TEMP4]]
; GCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(3) [[TEMP3]], align 4
-; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
-; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
; GCN-NEXT: ret void
;
%temp2 = add i32 %arg, 14
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
new file mode 100644
index 0000000000000..cd3e3bded681a
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
+; CHECK-LABEL: define void @onevec(
+; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ store <1 x i32> %sd0, ptr %ptr, align 4
+ store i32 %sd1, ptr %ptr, align 4
+
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
+ store i32 %sd2, ptr %gep1, align 4
+ store <1 x i32> %sd3, ptr %gep1, align 4
+
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
+ store <1 x i32> %sd4, ptr %gep2, align 4
+ store <1 x i32> %sd5, ptr %gep2, align 4
+ ret void
+}
+
+define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
+; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ store i32 %sd0, ptr %ptr, align 4
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ store <2 x i32> %sd1, ptr %gep1, align 4
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
+ store <2 x i32> %sd2, ptr %gep2, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
+ store i32 %sd3, ptr %gep3, align 4
+ ret void
+}
+
+define void @vect_zext_bitcast_i8_st4_to_i32_idx(ptr addrspace(1) %arg1, i32 %base, i32 %sd1, i32 %sd2, i32 %sd25, i32 %sd3, i32 %sd4) {
+; CHECK-LABEL: define void @vect_zext_bitcast_i8_st4_to_i32_idx(
+; CHECK-SAME: ptr addrspace(1) [[ARG1:%.*]], i32 [[BASE:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], i32 [[SD25:%.*]], i32 [[SD3:%.*]], i32 [[SD4:%.*]]) {
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw i32 [[BASE]], 0
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[ADD1]] to i64
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT1]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[SD1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SD2]], i32 1
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[ADD25:%.*]] = add nuw i32 [[BASE]], 6
+; CHECK-NEXT: [[ZEXT25:%.*]] = zext i32 [[ADD25]] to i64
+; CHECK-NEXT: [[GEP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT25]]
+; CHECK-NEXT: store i32 [[SD25]], ptr addrspace(1) [[GEP25]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nuw i32 [[BASE]], 8
+; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[ADD3]] to i64
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT3]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[SD3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SD4]], i32 1
+; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: ret void
+;
+ %add1 = add nuw i32 %base, 0
+ %zext1 = zext i32 %add1 to i64
+ %gep1 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext1
+ store i32 %sd1, ptr addrspace(1) %gep1, align 4
+ %add2 = add nuw i32 %base, 4
+ %zext2 = zext i32 %add2 to i64
+ %gep2 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext2
+ store i32 %sd2, ptr addrspace(1) %gep2, align 4
+
+ ; A store with 2-byte overlap breaks continuity.
+ %add25 = add nuw i32 %base, 6
+ %zext25 = zext i32 %add25 to i64
+ %gep25 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext25
+ store i32 %sd25, ptr addrspace(1) %gep25, align 4
+
+ %add3 = add nuw i32 %base, 8
+ %zext3 = zext i32 %add3 to i64
+ %gep3 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext3
+ store i32 %sd3, ptr addrspace(1) %gep3, align 4
+ %add4 = add nuw i32 %base, 12
+ %zext4 = zext i32 %add4 to i64
+ %gep4 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext4
+ store i32 %sd4, ptr addrspace(1) %gep4, align 4
+ ret void
+}
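
To make the continuity comment in the last test concrete: the five i32 stores cover byte ranges [0,4), [4,8), [6,10), [8,12), and [12,16), and a chain stays contiguous only while each element starts exactly where the previous one ends. A small sketch of that splitting rule (simplified; the pass's actual splitChainByContiguity operates on real chains):

```cpp
#include <cstdint>
#include <vector>

struct Elem { int64_t Offset; int64_t Size; };

// Start a new sub-chain whenever an element does not begin exactly at the
// end of the previous one (gap or overlap alike).
static std::vector<std::vector<Elem>>
splitByContiguity(const std::vector<Elem> &Chain) {
  std::vector<std::vector<Elem>> Out;
  for (const Elem &E : Chain) {
    if (Out.empty() ||
        Out.back().back().Offset + Out.back().back().Size != E.Offset)
      Out.emplace_back();
    Out.back().push_back(E);
  }
  return Out;
}
```

On offsets {0, 4, 6, 8, 12} with 4-byte elements this yields {0, 4}, {6}, and {8, 12}, matching the two `<2 x i32>` stores plus the lone `i32` store in the CHECK lines above.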
Why? The vectorizer should not be attempting to handle patterns that should already have been optimized away.
We have a downstream need for this: a major IR transform runs after the last DSE invocation, creating new dead-store cases that DSE no longer cleans up.
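
A hypothetical C-level picture of that situation (illustration only, not code from the PR): when a late IR transform materializes stores like these after DSE has already run, the dead first store survives into the vectorizer, which previously refused to vectorize the chain.

```cpp
// Hypothetical example: the first store to p[0] is dead, and DSE would
// normally delete it; if it is created after the last DSE run, the
// vectorizer can now fold it into the wide store (last write wins, as in
// the @onevec test above) instead of bailing on the chain.
void redundantStores(int *p, int a, int b) {
  p[0] = a; // dead: overwritten by the next store
  p[0] = b;
  p[1] = b;
}
```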