Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
b766215
[mlir][amdgpu] Add make_dma_descriptor op
amd-eochoalo Nov 26, 2025
3c31d68
[mlir][amdgpu] Add tensor load store operation
amd-eochoalo Nov 25, 2025
cb116ea
[mlir][amdgpu] Lower amdgpu.make_dma_base.
amd-eochoalo Nov 25, 2025
3ee5464
Update documentation
amd-eochoalo Nov 27, 2025
7aa7699
[amdgpu][mlir] make_dma_base add type information.
amd-eochoalo Nov 27, 2025
9f37e60
[mlir][amdgpu] Add AllElementTypesMatch attribute to make_dma_base
amd-eochoalo Nov 27, 2025
3a42775
[mlir][amdgpu] verify element type sizes for make_dma_base
amd-eochoalo Nov 27, 2025
c0cd803
[mlir][amdgpu] Lower make_dma_descriptor
amd-eochoalo Nov 26, 2025
2973181
Folding
amd-eochoalo Nov 28, 2025
bf0600b
const SmallVector<T>& to ArrayRef<T>
amd-eochoalo Dec 2, 2025
0aa6fe1
change expression for 48-bits
amd-eochoalo Dec 2, 2025
1c987e8
Merge branch 'main' into eochoa/2025-12-02/merging-last
amd-eochoalo Dec 2, 2025
f482bbb
int to int64_t
amd-eochoalo Dec 2, 2025
ffd5685
format
amd-eochoalo Dec 2, 2025
5d45a72
revert exposing utility functions
amd-eochoalo Dec 2, 2025
f187e76
Avoid or if possible
amd-eochoalo Dec 2, 2025
661931c
Use a single constant
amd-eochoalo Dec 2, 2025
a1a82f8
explicit type to auto
amd-eochoalo Dec 2, 2025
ac543c2
Remove unnecessary braces
amd-eochoalo Dec 2, 2025
5672371
Use log2_32
amd-eochoalo Dec 3, 2025
1549f5b
!isDynamic -> isStatic
amd-eochoalo Dec 3, 2025
dc76238
Dot at end of comments
amd-eochoalo Dec 3, 2025
8ee42b1
C-array and for-loop
amd-eochoalo Dec 3, 2025
535f8ce
Delete superfluous empty line
amd-eochoalo Dec 3, 2025
be6560d
Assert type conversion succeeded.
amd-eochoalo Dec 3, 2025
2092acb
Use getIntOrFloatBitWidth
amd-eochoalo Dec 3, 2025
ea45349
use getIntOrFloatBitWidth
amd-eochoalo Dec 3, 2025
0634350
Add documentation.
amd-eochoalo Dec 3, 2025
2fafa28
documentation
amd-eochoalo Dec 3, 2025
dab96a5
General nit against ints
amd-eochoalo Dec 3, 2025
9b7e059
one-liner getElementTypeWidth
amd-eochoalo Dec 3, 2025
c04e41a
Use getMixedValues
amd-eochoalo Dec 3, 2025
eeb008a
Better messages in assertion
amd-eochoalo Dec 3, 2025
7955fe0
correction about rank2
amd-eochoalo Dec 3, 2025
e154dd4
Use pattern matcher
amd-eochoalo Dec 3, 2025
7a9d429
format
amd-eochoalo Dec 3, 2025
46d3e64
Propoagte type from getTypeWidth and int64_t
amd-eochoalo Dec 3, 2025
203ef02
Remove maybeUpdateDynamicIndexList
amd-eochoalo Dec 4, 2025
3fdf187
Use getIntOrFloatBitWidth
amd-eochoalo Dec 4, 2025
2039294
Undef -> Poison
amd-eochoalo Dec 5, 2025
b5313a3
Allow tensors less than rank 2
amd-eochoalo Dec 5, 2025
5f4ad77
Allow tensors with rank less than 2
amd-eochoalo Dec 5, 2025
91b2e96
Add TODOs
amd-eochoalo Dec 5, 2025
097a0e8
Fix atomicBarrierAddress calculation.
amd-eochoalo Dec 5, 2025
6b62d0e
Merge branch 'main' into eochoa/2025-12-02/merging-last
amd-eochoalo Dec 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 190 additions & 10 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}

//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//

// Base class for types defined in the AMDGPU dialect. Forwards `name` and
// `traits` to TypeDef (bound to AMDGPU_Dialect) and uses `typeMnemonic` as the
// type's mnemonic.
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
: TypeDef<AMDGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
}

//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//

def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let summary = "Pair of base addresses that move data between LDS and global storage.";
let description = [{
Expand All @@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let assemblyFormat = "`<` $elementType `>`";
}

// Type with mnemonic `tdm_descriptor` that declares no parameters and no
// custom assembly format. It is the result type of make_dma_descriptor and the
// operand type of tensor_load_to_lds / tensor_store_from_lds.
def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
let summary = "Descriptors used in tensor store/load operations.";
let description = [{
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];

}

//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1219,17 +1228,15 @@ def AMDGPU_ScaledMFMAOp :
}

def AMDGPU_MakeDmaBaseOp :
AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["src", "dst"]>]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to read from">:$src,
Variadic<Index>:$srcIndices,
Variadic<Index>:$src_indices,
Arg<AnyMemRef, "buffer to write to">:$dst,
Variadic<Index>:$dstIndices)>,
Variadic<Index>:$dst_indices)>,
Results<(outs AMDGPU_TDMBaseType: $base)> {

// TODO:
// * Add verifiers such that one of the memrefs is from LDS and the other global.
// * Add verifiers to make sure that the type is in the correct direction.
// * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.

let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
Expand All @@ -1240,11 +1247,184 @@ def AMDGPU_MakeDmaBaseOp :
This operation creates a value corresponding to the tensor descriptor (D#) group 0
found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.

For example:

```mlir
%base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
%descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
```

to

```mlir
// pseudo-code
%global_base = llvm.extractvalue %global_memref[1]
%global_address = llvm.get_element_ptr ...

%lds_base = llvm.extractvalue %lds_memref[1]
%lds_address = llvm.get_element_ptr ...

// Definition of %base
%undef = llvm.mlir.undef : vector<4xi32>
%v0 = llvm.insertelement %15, %undef[0] : vector<4xi32>
%v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32>
%v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32>
%base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32>

rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
```

These tensor DMA operations were introduced in gfx1250.
}];

let assemblyFormat = [{
$src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
$src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
}];

let hasVerifier = 1;
}

// Builds the opaque !amdgpu.tdm_descriptor value from a TDM base plus the
// tensor geometry (global sizes/strides, shared tile sizes) and the optional
// padding, atomic-barrier and iteration operands.
//
// Sizes and strides follow the "dynamic index list" convention: each
// `*_static_*` array has one entry per dimension, and an entry holding the
// ShapedType dynamic sentinel means "take the next value from the matching
// `*_dynamic_*` operand list".
def AMDGPU_MakeDmaDescriptorOp :
    AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
    Arguments<(ins
      AMDGPU_TDMBaseType: $base,
      Variadic<Index>: $global_dynamic_sizes,
      DenseI64ArrayAttr: $global_static_sizes,
      Variadic<Index>: $global_dynamic_strides,
      DenseI64ArrayAttr: $global_static_strides,
      Variadic<Index>: $shared_dynamic_sizes,
      DenseI64ArrayAttr: $shared_static_sizes,
      Optional<Index>: $pad_amount,
      Optional<Index>: $pad_interval,
      Optional<AnyMemRef>: $atomic_barrier_address,
      Variadic<Index>: $atomic_barrier_indices,
      Optional<Index>: $global_increment,
      Optional<Index>: $lds_increment,
      Optional<Index>: $iteration_count)>,
    Results<(outs AMDGPU_TDMDescriptorType: $desc)> {

  let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
  let description = [{
    Make all descriptor groups needed by tensor memory operations.

    The $base operand corresponds to the base pair addresses, one must be an address in LDS
    while the other must be a global memory location.

    $global_{static/dynamic}_sizes determine the size of the tensor.
    $global_{static/dynamic}_strides determine the strides of the tensor.
    $shared_{static/dynamic}_sizes determine the size of the tile.

    Padding can be applied to the LDS address when copying from memory to LDS,
    but not when copying from LDS to memory.
    The values in the padded target addresses remain the same as before the operation was applied.

    2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
    $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
    $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
    $iteration_count determines how many times to iterate.

    ```mlir
      // Example of moving a two-dimensional tensor to LDS.
      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
      %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor

      // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
      %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
    ```
  }];

  // The three optional groups are each anchored (`^`) on their first operand:
  // the group is printed/parsed only when that operand is present.
  let assemblyFormat = [{
    $base
    `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
    `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
    `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
    ( `padShared` `(` $pad_amount^ `every` $pad_interval`)` )?
    ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
                      `:` type($atomic_barrier_address) `)`)?
    ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
    attr-dict `:` qualified(type($base)) `->` type(results)
  }];

  let extraClassDeclaration = [{
    /// Returns the number of entries in the global static size list, which
    /// holds one entry per tensor dimension (static or dynamic).
    int getRank() {
      return getGlobalStaticSizes().size();
    }

    /// Returns the bit width of the base's element type. The element type
    /// must be an integer or floating-point type.
    int getElementTypeWidth() {
      Type elementType = getBase().getType().getElementType();
      return elementType.getIntOrFloatBitWidth();
    }

    /// Interleaves `statics` and `dynamics` into a single list: every entry
    /// of `statics` that is the dynamic sentinel is replaced by the next
    /// value of `dynamics` (consumed in order); every other entry becomes an
    /// index attribute.
    SmallVector<OpFoldResult> getMixedList(ValueRange dynamics, ArrayRef<int64_t> statics) {
      SmallVector<OpFoldResult> result;
      result.reserve(statics.size());
      unsigned nextDynamic = 0;
      OpBuilder b(getContext());
      for (int64_t staticElem : statics) {
        if (ShapedType::isDynamic(staticElem))
          result.push_back(dynamics[nextDynamic++]);
        else
          result.push_back(b.getIndexAttr(staticElem));
      }
      return result;
    }

    /// Global tensor sizes as a mixed static/dynamic list.
    SmallVector<OpFoldResult> getMixedGlobalSizes() {
      return getMixedList(getGlobalDynamicSizes(), getGlobalStaticSizes());
    }

    /// Global tensor strides as a mixed static/dynamic list.
    SmallVector<OpFoldResult> getMixedGlobalStrides() {
      return getMixedList(getGlobalDynamicStrides(), getGlobalStaticStrides());
    }

    /// Shared (tile) sizes as a mixed static/dynamic list.
    SmallVector<OpFoldResult> getMixedSharedSizes() {
      return getMixedList(getSharedDynamicSizes(), getSharedStaticSizes());
    }
  }];

  let hasVerifier = 1;
  let hasFolder = 1;
}

// Op taking a single TDM descriptor operand and declaring no results here.
// It is annotated with both MemWrite and MemRead memory effects.
def AMDGPU_TensorLoadToLDSOp :
AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
let summary = "Load tensors from global memory to LDS.";
let description = [{
Load tensors of up to five dimensions from global memory to LDS.

The operation is fully described by the descriptor operand.
}];

// Textual form: `amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor`.
let assemblyFormat = [{
$desc attr-dict `:` qualified(type($desc))
}];
}

// Op taking a single TDM descriptor operand and declaring no results here.
// It is annotated with both MemWrite and MemRead memory effects.
def AMDGPU_TensorStoreFromLDSOp :
AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
let summary = "Store tensors from LDS to global memory.";
let description = [{
Store tensors of up to five dimensions from LDS to global memory.

The operation is fully described by the descriptor operand.
}];

// Textual form: `amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor`.
let assemblyFormat = [{
$desc attr-dict `:` qualified(type($desc))
}];
}

Expand Down
5 changes: 5 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
IntegerAttr m, IntegerAttr n, IntegerAttr k) {
printMNKDimensionList(printer, m, n, k);
}

// Utility functions for querying the address space.
bool hasGlobalMemorySpace(Attribute memorySpace);
bool hasWorkgroupMemorySpace(Attribute memorySpace);
bool hasFatRawBufferMemorySpace(Attribute memorySpace);
} // namespace mlir::amdgpu

#define GET_ATTRDEF_CLASSES
Expand Down
Loading