Skip to content

Commit 7595483

Browse files
committed
[LoadStoreVectorizer] Allow redundant stores
1 parent 442f853 commit 7595483

File tree

3 files changed

+124
-56
lines changed

3 files changed

+124
-56
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 16 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -316,14 +316,12 @@ class Vectorizer {
316316
/// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317317
/// instructions.
318318
///
319-
/// The map ChainElemOffsets must contain all of the elements in
320-
/// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321-
/// address. It's ok if it contains additional entries.
319+
/// The map ChainSet must contain all of the elements in
320+
/// [ChainBegin, ChainElem]. It's ok if it contains additional entries.
322321
template <bool IsLoadChain>
323-
bool isSafeToMove(
324-
Instruction *ChainElem, Instruction *ChainBegin,
325-
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
326-
BatchAAResults &BatchAA);
322+
bool isSafeToMove(Instruction *ChainElem, Instruction *ChainBegin,
323+
const DenseSet<Instruction *> &ChainSet,
324+
BatchAAResults &BatchAA);
327325

328326
/// Merges the equivalence classes if they have underlying objects that differ
329327
/// by one level of indirection (i.e., one is a getelementptr and the other is
@@ -540,9 +538,9 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
540538
// We know that elements in the chain with nonoverlapping offsets can't
541539
// alias, but AA may not be smart enough to figure this out. Use a
542540
// hashtable.
543-
DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
541+
DenseSet<Instruction *> ChainSet;
544542
for (const auto &E : C)
545-
ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
543+
ChainSet.insert(E.Inst);
546544

547545
// Across a single invocation of this function the IR is not changing, so
548546
// using a batched Alias Analysis is safe and can reduce compile time.
@@ -573,8 +571,8 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
573571
SmallVector<ChainElem, 1> NewChain;
574572
NewChain.emplace_back(*ChainBegin);
575573
for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
576-
if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
577-
ChainOffsets, BatchAA)) {
574+
if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst, ChainSet,
575+
BatchAA)) {
578576
LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
579577
<< *ChainIt->Inst << " into " << *ChainBegin->Inst
580578
<< "\n");
@@ -1037,10 +1035,9 @@ bool Vectorizer::vectorizeChain(Chain &C) {
10371035
}
10381036

10391037
template <bool IsLoadChain>
1040-
bool Vectorizer::isSafeToMove(
1041-
Instruction *ChainElem, Instruction *ChainBegin,
1042-
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1043-
BatchAAResults &BatchAA) {
1038+
bool Vectorizer::isSafeToMove(Instruction *ChainElem, Instruction *ChainBegin,
1039+
const DenseSet<Instruction *> &ChainSet,
1040+
BatchAAResults &BatchAA) {
10441041
LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
10451042
<< *ChainBegin << ")\n");
10461043

@@ -1066,10 +1063,6 @@ bool Vectorizer::isSafeToMove(
10661063
return BasicBlock::iterator(ChainBegin);
10671064
}());
10681065

1069-
const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1070-
const unsigned ChainElemSize =
1071-
DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1072-
10731066
for (; BBIt != BBItEnd; ++BBIt) {
10741067
Instruction *I = &*BBIt;
10751068

@@ -1084,39 +1077,10 @@ bool Vectorizer::isSafeToMove(
10841077
if (!IsLoadChain && isInvariantLoad(I))
10851078
continue;
10861079

1087-
// If I is in the chain, we can tell whether it aliases ChainIt by checking
1088-
// what offset ChainIt accesses. This may be better than AA is able to do.
1089-
//
1090-
// We should really only have duplicate offsets for stores (the duplicate
1091-
// loads should be CSE'ed), but in case we have a duplicate load, we'll
1092-
// split the chain so we don't have to handle this case specially.
1093-
if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1094-
// I and ChainElem overlap if:
1095-
// - I and ChainElem have the same offset, OR
1096-
// - I's offset is less than ChainElem's, but I touches past the
1097-
// beginning of ChainElem, OR
1098-
// - ChainElem's offset is less than I's, but ChainElem touches past the
1099-
// beginning of I.
1100-
const APInt &IOffset = OffsetIt->second;
1101-
unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1102-
if (IOffset == ChainElemOffset ||
1103-
(IOffset.sle(ChainElemOffset) &&
1104-
(IOffset + IElemSize).sgt(ChainElemOffset)) ||
1105-
(ChainElemOffset.sle(IOffset) &&
1106-
(ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1107-
LLVM_DEBUG({
1108-
// Double check that AA also sees this alias. If not, we probably
1109-
// have a bug.
1110-
ModRefInfo MR =
1111-
BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1112-
assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1113-
dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1114-
});
1115-
return false; // We found an aliasing instruction; bail.
1116-
}
1117-
1118-
continue; // We're confident there's no alias.
1119-
}
1080+
// Allow on-chain aliasing because write-order is preserved when stores are
1081+
// vectorized.
1082+
if (ChainSet.count(I))
1083+
continue;
11201084

11211085
LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
11221086
ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));

llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@ define amdgpu_kernel void @no_crash(i32 %arg) {
1010
; GCN-SAME: i32 [[ARG:%.*]]) {
1111
; GCN-NEXT: [[TEMP2:%.*]] = add i32 [[ARG]], 14
1212
; GCN-NEXT: [[TEMP3:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0:[0-9]+]], i32 0, i32 [[TEMP2]]
13-
; GCN-NEXT: [[TEMP4:%.*]] = add i32 [[ARG]], 15
14-
; GCN-NEXT: [[TEMP5:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0]], i32 0, i32 [[TEMP4]]
1513
; GCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(3) [[TEMP3]], align 4
16-
; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
17-
; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
1814
; GCN-NEXT: ret void
1915
;
2016
%temp2 = add i32 %arg, 14
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
3+
4+
define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
5+
; CHECK-LABEL: define void @onevec(
6+
; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
7+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
8+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
9+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
10+
; CHECK-NEXT: store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
11+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
12+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
13+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
14+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
15+
; CHECK-NEXT: store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
16+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
17+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
18+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
19+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
20+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
21+
; CHECK-NEXT: store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
22+
; CHECK-NEXT: ret void
23+
;
24+
store <1 x i32> %sd0, ptr %ptr, align 4
25+
store i32 %sd1, ptr %ptr, align 4
26+
27+
%gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
28+
store i32 %sd2, ptr %gep1, align 4
29+
store <1 x i32> %sd3, ptr %gep1, align 4
30+
31+
%gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
32+
store <1 x i32> %sd4, ptr %gep2, align 4
33+
store <1 x i32> %sd5, ptr %gep2, align 4
34+
ret void
35+
}
36+
37+
define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
38+
; CHECK-LABEL: define void @test(
39+
; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
40+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
41+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
42+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
43+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
44+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
45+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
46+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
47+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
48+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
49+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
50+
; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
51+
; CHECK-NEXT: ret void
52+
;
53+
store i32 %sd0, ptr %ptr, align 4
54+
%gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
55+
store <2 x i32> %sd1, ptr %gep1, align 4
56+
%gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
57+
store <2 x i32> %sd2, ptr %gep2, align 4
58+
%gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
59+
store i32 %sd3, ptr %gep3, align 4
60+
ret void
61+
}
62+
63+
define void @vect_zext_bitcast_i8_st4_to_i32_idx(ptr addrspace(1) %arg1, i32 %base, i32 %sd1, i32 %sd2, i32 %sd25, i32 %sd3, i32 %sd4) {
64+
; CHECK-LABEL: define void @vect_zext_bitcast_i8_st4_to_i32_idx(
65+
; CHECK-SAME: ptr addrspace(1) [[ARG1:%.*]], i32 [[BASE:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], i32 [[SD25:%.*]], i32 [[SD3:%.*]], i32 [[SD4:%.*]]) {
66+
; CHECK-NEXT: [[ADD1:%.*]] = add nuw i32 [[BASE]], 0
67+
; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[ADD1]] to i64
68+
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT1]]
69+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[SD1]], i32 0
70+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SD2]], i32 1
71+
; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[GEP1]], align 4
72+
; CHECK-NEXT: [[ADD25:%.*]] = add nuw i32 [[BASE]], 6
73+
; CHECK-NEXT: [[ZEXT25:%.*]] = zext i32 [[ADD25]] to i64
74+
; CHECK-NEXT: [[GEP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT25]]
75+
; CHECK-NEXT: store i32 [[SD25]], ptr addrspace(1) [[GEP25]], align 4
76+
; CHECK-NEXT: [[ADD3:%.*]] = add nuw i32 [[BASE]], 8
77+
; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[ADD3]] to i64
78+
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT3]]
79+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[SD3]], i32 0
80+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SD4]], i32 1
81+
; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr addrspace(1) [[GEP3]], align 4
82+
; CHECK-NEXT: ret void
83+
;
84+
%add1 = add nuw i32 %base, 0
85+
%zext1 = zext i32 %add1 to i64
86+
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext1
87+
store i32 %sd1, ptr addrspace(1) %gep1, align 4
88+
%add2 = add nuw i32 %base, 4
89+
%zext2 = zext i32 %add2 to i64
90+
%gep2 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext2
91+
store i32 %sd2, ptr addrspace(1) %gep2, align 4
92+
93+
; A store with 2-byte overlap breaks continuity.
94+
%add25 = add nuw i32 %base, 6
95+
%zext25 = zext i32 %add25 to i64
96+
%gep25 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext25
97+
store i32 %sd25, ptr addrspace(1) %gep25, align 4
98+
99+
%add3 = add nuw i32 %base, 8
100+
%zext3 = zext i32 %add3 to i64
101+
%gep3 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext3
102+
store i32 %sd3, ptr addrspace(1) %gep3, align 4
103+
%add4 = add nuw i32 %base, 12
104+
%zext4 = zext i32 %add4 to i64
105+
%gep4 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext4
106+
store i32 %sd4, ptr addrspace(1) %gep4, align 4
107+
ret void
108+
}

0 commit comments

Comments
 (0)