-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Fix verifier error when waterfall call target is in AV register #168017
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Fix verifier error when waterfall call target is in AV register #168017
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: This isn't an ideal fix; technically this should be an optimization path we shouldn't need to go down. The base path where a copy will be inserted is still broken. The lit test changes are mostly regressions to be fixed later. Patch is 95.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168017.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index eafb579b1a2ee..3f0d26e661137 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8178,26 +8178,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
Register NewDstReg = Inst.getOperand(1).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
- MRI.clearKillFlags(NewDstReg);
- Inst.getOperand(0).setReg(DstReg);
- Inst.eraseFromParent();
- // Legalize t16 operand since replaceReg is called after addUsersToVALU
- for (MachineOperand &MO :
- make_early_inc_range(MRI.use_operands(NewDstReg))) {
- legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
+ if (const TargetRegisterClass *CommonRC =
+ RI.getCommonSubClass(NewDstRC, SrcRC)) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge. Since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ MRI.clearKillFlags(NewDstReg);
+ Inst.getOperand(0).setReg(DstReg);
+
+ if (!MRI.constrainRegClass(NewDstReg, CommonRC))
+ llvm_unreachable("failed to constrain register");
+
+ Inst.eraseFromParent();
+ // Legalize t16 operand since replaceReg is called after addUsersToVALU
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI.use_operands(NewDstReg))) {
+ legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ }
+
+ return;
}
- return;
}
// If this is a v2s copy between 16bit and 32bit reg,
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 196958b74442f..ae53bdff7c251 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -10733,15 +10733,16 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -11000,15 +11001,16 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -19023,15 +19025,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
@@ -19282,15 +19285,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
index aede91b76f441..a13f3513c660e 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
@@ -43,26 +43,25 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge
; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_lshr_b32 s6, s32, 6
; CHECK-NEXT: v_mov_b32_e32 v7, v2
; CHECK-NEXT: v_mov_b32_e32 v6, v1
; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32
; CHECK-NEXT: .LBB1_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_u32_e32 v8, 1, v3
-; CHECK-NEXT: v_lshl_add_u32 v5, v3, 2, v1
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v8
-; CHECK-NEXT: v_mov_b32_e32 v2, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
-; CHECK-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_lshl_add_u32 v2, v3, 2, v1
+; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: v_add_u32_e32 v2, 1, v3
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT: v_mov_b32_e32 v3, v4
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %done
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dword v[6:7], v0, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index fe432e9d7594d..331a29b3f4a93 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -101,39 +101,39 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader
@@ -160,39 +160,39 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader
@@ -333,6 +333,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
@@ -365,7 +366,6 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
@@ -393,6 +393,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
@@ -425,7 +426,6 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader
@@ -559,39 +559,39 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT: v_accvg...
[truncated]
|
ronlieb
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This resolves the verifier issues I am seeing.
Fixes another verifier error after introducing AV registers. Also fixes not clearing the subregister index if there was one.
This isn't an ideal fix; technically this should be an optimization path we shouldn't need to go down. The base path where a copy will be inserted is still broken. The lit test changes are mostly regressions to be fixed later.
6da68f8 to
9398eaf
Compare
557be13 to
ab51862
Compare

This isn't an ideal fix; technically this should be an optimization path
we shouldn't need to go down. The base path where a copy will be inserted
is still broken.
The lit test changes are mostly regressions to be fixed later.