diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 6ac84c646e3ae..ec9f449c35dc4 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1227,7 +1227,7 @@ def AMDGPU_ScaledMFMAOp : } def AMDGPU_MakeDmaBaseOp : - AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, + AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["global", "lds"]>]>, Arguments<(ins Arg:$global, Variadic:$global_indices, Arg:$lds, @@ -1293,8 +1293,8 @@ def AMDGPU_MakeDmaDescriptorOp : DenseI64ArrayAttr: $global_static_strides, Variadic: $shared_dynamic_sizes, DenseI64ArrayAttr: $shared_static_sizes, - Optional: $pad, - Optional: $pad_every, + Optional: $pad_amount, + Optional: $pad_interval, Optional: $atomic_barrier_address, Variadic: $atomic_barrier_indices, Optional: $global_increment, @@ -1316,6 +1316,10 @@ def AMDGPU_MakeDmaDescriptorOp : Padding can be applied to the LDS address when copying from memory to LDS, but not when copying from LDS to memory. The values in the padded target addresses remain the same as before the operation was applied. + $pad_interval must be a power of two contained in [2, 256]. + $pad_amount must be a value contained in [1, 128]. + + $atomic_barrier_address must be aligned to 8 bytes. 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count. $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type. @@ -1330,7 +1334,7 @@ def AMDGPU_MakeDmaDescriptorOp : // Example of moving a two dimension tensor to LDS where padding is applied after every integer. %base = amdgpu.make_dma_base %global[0, 0], %lds[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base - %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad pad_every %pad_every) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount pad_every %pad_interval) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor ``` }]; @@ -1340,14 +1344,37 @@ def AMDGPU_MakeDmaDescriptorOp : `globalSize` custom($global_dynamic_sizes, $global_static_sizes) `globalStride` custom($global_dynamic_strides, $global_static_strides) `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) - ( `padShared` `(` $pad^ `every` $pad_every `)` )? + ( `padShared` `(` $pad_amount^ `every` $pad_interval `)` )? ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` `:` type($atomic_barrier_address) `)`)? ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? 
attr-dict `:` qualified(type($base)) `->` type(results) }]; + let extraClassDeclaration = [{ + int64_t getRank() { + return getGlobalStaticSizes().size(); + } + + unsigned getElementTypeWidth() { + return getBase().getType().getElementType().getIntOrFloatBitWidth(); + } + + SmallVector getMixedGlobalSizes() { + return getMixedValues(getGlobalStaticSizes(), getGlobalDynamicSizes(), getContext()); + } + + SmallVector getMixedGlobalStrides() { + return getMixedValues(getGlobalStaticStrides(), getGlobalDynamicStrides(), getContext()); + } + + SmallVector getMixedSharedSizes() { + return getMixedValues(getSharedStaticSizes(), getSharedDynamicSizes(), getContext()); + } + }]; + let hasVerifier = 1; + let hasFolder = 1; } #endif // AMDGPU diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index a85973c2493ee..6cf0d5450d5a7 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Pass/Pass.h" @@ -2326,6 +2327,7 @@ struct AMDGPUMakeDmaBaseLowering Value c3 = createI32Constant(rewriter, loc, 3); Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); + assert(v4i32 && "expected type conversion to succeed"); Value result = LLVM::PoisonOp::create(rewriter, loc, v4i32); result = LLVM::InsertElementOp::create(rewriter, loc, result, c1, c0); result = LLVM::InsertElementOp::create(rewriter, loc, result, @@ -2339,6 +2341,344 @@ struct AMDGPUMakeDmaBaseLowering } }; +struct AMDGPUMakeDmaDescriptorLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + AMDGPUMakeDmaDescriptorLowering(const LLVMTypeConverter &converter, + Chipset chipset) + : ConvertOpToLLVMPattern(converter), + chipset(chipset) {} + Chipset chipset; + + Value getDGroup0(OpAdaptor adaptor) const { return adaptor.getBase(); } + + Value setValueAtOffset(ConversionPatternRewriter &rewriter, Location loc, + Value accumulator, Value value, int64_t shift) const { + shift = shift % 32; + Value shiftAmount; + if (shift != 0) { + shiftAmount = createI32Constant(rewriter, loc, shift % 32); + value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount); + } + + if (matchPattern(accumulator, mlir::m_Zero())) + return value; + + return LLVM::OrOp::create(rewriter, loc, accumulator, value); + } + + Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, ArrayRef consts) const { + // Compute data_size. 
+ unsigned elementTypeWidthInBits = op.getElementTypeWidth(); + assert( + llvm::is_contained({8, 16, 32, 64}, elementTypeWidthInBits) && + "expected type width to be 8, 16, 32, or 64."); + int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8); + return createI32Constant(rewriter, loc, dataSize << 16); + } + + Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, ArrayRef consts) const { + bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr; + if (!atomic_barrier_enable) + return sgpr0; + + return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18); + } + + Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, ArrayRef consts) const { + bool iterate_enable = adaptor.getGlobalIncrement() != nullptr; + if (!iterate_enable) + return sgpr0; + + // TODO: In future PR, add other required fields for iteration. + return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19); + } + + Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, ArrayRef consts) const { + bool pad_enable = op.getPadAmount() != nullptr; + if (!pad_enable) + return sgpr0; + + return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20); + } + + Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, ArrayRef consts) const { + bool pad_enable = op.getPadAmount() != nullptr; + if (!pad_enable) + return sgpr0; + + IntegerType i32 = rewriter.getI32Type(); + Value padInterval = adaptor.getPadInterval(); + // pre-condition: padInterval can be a power of two between 2 and 256. + padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32, + padInterval, false); + padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]); + // post-condition: padInterval can be a value between 0 and 7. + return setValueAtOffset(rewriter, loc, sgpr0, padInterval, 22); + } + + Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr0, ArrayRef consts) const { + bool pad_enable = op.getPadAmount() != nullptr; + if (!pad_enable) + return sgpr0; + + Value padAmount = adaptor.getPadAmount(); + // pre-condition: padAmount is a value between 1-128. + padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]); + // post-condition: padAmount is a value between 0-127. + return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25); + } + + Value setAtomicBarrierAddress(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + Location loc, Value sgpr1, + ArrayRef consts) const { + bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr; + if (!atomic_barrier_enable) + return sgpr1; + + Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress(); + auto barrierAddressTy = + cast(op.getAtomicBarrierAddress().getType()); + ValueRange atomicBarrierIndices = adaptor.getAtomicBarrierIndices(); + atomicBarrierAddress = + getStridedElementPtr(rewriter, loc, barrierAddressTy, + atomicBarrierAddress, atomicBarrierIndices); + IntegerType i32 = rewriter.getI32Type(); + // pre-condition: atomicBarrierAddress is aligned to 8 bytes which implies + // that the 3 LSBs are zero. 
+ atomicBarrierAddress = + LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress); + atomicBarrierAddress = + LLVM::LShrOp::create(rewriter, loc, atomicBarrierAddress, consts[3]); + Value mask = createI32Constant(rewriter, loc, 0xFFFF); + atomicBarrierAddress = + LLVM::AndOp::create(rewriter, loc, atomicBarrierAddress, mask); + return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32); + } + + std::pair setTensorDim0(MakeDmaDescriptorOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + Location loc, Value sgpr1, Value sgpr2, + ArrayRef consts) const { + SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); + OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back(); + Value tensorDim0; + if (auto attr = dyn_cast(tensorDim0OpFoldResult)) + tensorDim0 = + createI32Constant(rewriter, loc, cast(attr).getInt()); + else + tensorDim0 = cast(tensorDim0OpFoldResult); + + Value c16 = createI32Constant(rewriter, loc, 16); + Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16); + sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48); + sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16); + return {sgpr1, sgpr2}; + } + + std::pair setTensorDim1(MakeDmaDescriptorOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + Location loc, Value sgpr2, Value sgpr3, + ArrayRef consts) const { + // TODO: Generalize to setTensorDimX. + SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); + OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1); + Value tensorDim1; + if (auto attr = dyn_cast(tensorDim1OpFoldResult)) + tensorDim1 = + createI32Constant(rewriter, loc, cast(attr).getInt()); + else + tensorDim1 = cast(tensorDim1OpFoldResult); + + Value c16 = createI32Constant(rewriter, loc, 16); + Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16); + sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80); + sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16); + return {sgpr2, sgpr3}; + } + + Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr, ArrayRef consts, size_t dimX, + int64_t offset) const { + SmallVector mixedSharedSizes = op.getMixedSharedSizes(); + + if (mixedSharedSizes.size() <= dimX) + return sgpr; + + OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX); + Value tileDimX; + if (auto attr = dyn_cast(tileDimXOpFoldResult)) + tileDimX = + createI32Constant(rewriter, loc, cast(attr).getInt()); + else + tileDimX = cast(tileDimXOpFoldResult); + + return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset); + } + + Value setTileDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr3, ArrayRef consts) const { + return setTileDimX(op, adaptor, rewriter, loc, sgpr3, consts, 0, 112); + } + + Value setTileDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr4, ArrayRef consts) const { + return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 1, 128); + } + + Value setTileDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr4, ArrayRef consts) const { + return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 2, 144); + } + + std::pair + setTensorDimXStride(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + 
Value sgprY, Value sgprZ, ArrayRef consts, + size_t dimX, int64_t offset) const { + SmallVector mixedGlobalStrides = op.getMixedGlobalStrides(); + + if (mixedGlobalStrides.size() <= dimX) + return {sgprY, sgprZ}; + + OpFoldResult tensorDimXStrideOpFoldResult = + *(mixedGlobalStrides.rbegin() + dimX); + Value tensorDimXStride; + if (auto attr = dyn_cast(tensorDimXStrideOpFoldResult)) + tensorDimXStride = + createI64Constant(rewriter, loc, cast(attr).getInt()); + else + tensorDimXStride = cast(tensorDimXStrideOpFoldResult); + + constexpr int64_t first48bits = (1ll << 48) - 1; + Value mask = createI64Constant(rewriter, loc, first48bits); + tensorDimXStride = + LLVM::AndOp::create(rewriter, loc, mask, tensorDimXStride); + IntegerType i32 = rewriter.getI32Type(); + Value tensorDimXStrideLow = + LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride); + + int64_t shift = (offset % 32) == 0 ? 32 : offset % 32; + Value shiftVal = createI64Constant(rewriter, loc, shift); + Value tensorDimXStrideHigh = + LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal); + tensorDimXStrideHigh = + LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh); + + sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset); + sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh, + offset + shift); + return {sgprY, sgprZ}; + } + + std::pair + setTensorDim0Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr5, Value sgpr6, ArrayRef consts) const { + return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts, + 0, 160); + } + + std::pair + setTensorDim1Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + Value sgpr5, Value sgpr6, ArrayRef consts) const { + return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts, + 1, 208); + } + + Value getDGroup1(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, Location loc, + ArrayRef consts) const { + Value sgprs[8]; + for (int64_t i = 0; i < 8; i++) { + sgprs[i] = consts[0]; + } + + sgprs[0] = setDataSize(op, adaptor, rewriter, loc, sgprs[0], consts); + sgprs[0] = setAtomicBarrier(op, adaptor, rewriter, loc, sgprs[0], consts); + sgprs[0] = setIterateEnable(op, adaptor, rewriter, loc, sgprs[0], consts); + sgprs[0] = setPadEnable(op, adaptor, rewriter, loc, sgprs[0], consts); + sgprs[0] = setPadInterval(op, adaptor, rewriter, loc, sgprs[0], consts); + sgprs[0] = setPadAmount(op, adaptor, rewriter, loc, sgprs[0], consts); + + sgprs[1] = + setAtomicBarrierAddress(op, adaptor, rewriter, loc, sgprs[1], consts); + std::tie(sgprs[1], sgprs[2]) = + setTensorDim0(op, adaptor, rewriter, loc, sgprs[1], sgprs[2], consts); + std::tie(sgprs[2], sgprs[3]) = + setTensorDim1(op, adaptor, rewriter, loc, sgprs[2], sgprs[3], consts); + + sgprs[3] = setTileDim0(op, adaptor, rewriter, loc, sgprs[3], consts); + sgprs[4] = setTileDim1(op, adaptor, rewriter, loc, sgprs[4], consts); + sgprs[4] = setTileDim2(op, adaptor, rewriter, loc, sgprs[4], consts); + std::tie(sgprs[5], sgprs[6]) = setTensorDim0Stride( + op, adaptor, rewriter, loc, sgprs[5], sgprs[6], consts); + std::tie(sgprs[6], sgprs[7]) = setTensorDim1Stride( + op, adaptor, rewriter, loc, sgprs[6], sgprs[7], consts); + + IntegerType i32 = rewriter.getI32Type(); + Type v8i32 = this->typeConverter->convertType(VectorType::get(8, i32)); + assert(v8i32 && "expected type conversion to succeed"); + Value dgroup1 = 
LLVM::PoisonOp::create(rewriter, loc, v8i32); + + for (auto [sgpr, constant] : llvm::zip_equal(sgprs, consts)) { + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr, constant); + } + + return dgroup1; + } + + LogicalResult + matchAndRewrite(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (chipset < kGfx1250) + return op->emitOpError( + "make_dma_descriptor is only supported on gfx1250"); + + if (op.getRank() > 2) + return op->emitOpError("unimplemented"); + + Location loc = op.getLoc(); + + IntegerType i32 = rewriter.getI32Type(); + Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); + assert(v4i32 && "expected type conversion to succeed"); + + SmallVector consts; + for (int64_t i = 0; i < 8; i++) + consts.push_back(createI32Constant(rewriter, loc, i)); + + Value dgroup0 = this->getDGroup0(adaptor); + Value dgroup1 = this->getDGroup1(op, adaptor, rewriter, loc, consts); + + SmallVector results = {dgroup0, dgroup1}; + rewriter.replaceOpWithMultiple(op, {results}); + return success(); + } +}; + struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { using Base::Base; @@ -2357,6 +2697,7 @@ struct ConvertAMDGPUToROCDLPass Type i32 = IntegerType::get(type.getContext(), 32); return converter.convertType(VectorType::get(4, i32)); }); + populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset); LLVMConversionTarget target(getContext()); target.addIllegalDialect<::mlir::amdgpu::AMDGPUDialect>(); @@ -2412,6 +2753,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering, - AMDGPUMakeDmaBaseLowering>(converter, chipset); + AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter, + chipset); patterns.add(converter); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index f78eca621da52..cf74f671db216 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -692,15 +692,14 @@ LogicalResult TransposeLoadOp::verify() { }; auto validNumElems = kValidLoadSizeMap.find(elementTypeSize); - if (validNumElems == kValidLoadSizeMap.end()) { + if (validNumElems == kValidLoadSizeMap.end()) return emitOpError("Unsupported element type size for transpose load: ") << elementTypeSize << " bits"; - } - if (numElements != validNumElems->second) { + + if (numElements != validNumElems->second) return emitOpError( "Transferring type size mismatch: expected num of elements: ") << validNumElems->second; - } return success(); } @@ -710,16 +709,24 @@ LogicalResult TransposeLoadOp::verify() { //===----------------------------------------------------------------------===// LogicalResult MakeDmaBaseOp::verify() { - MemRefType ldsType = cast(getLds().getType()); - MemRefType globalType = cast(getGlobal().getType()); - if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace())) { + + auto ldsType = cast(getLds().getType()); + auto globalType = cast(getGlobal().getType()); + if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace())) return emitOpError( "lds memref must have workgroup address space attribute."); - } - if (!hasGlobalMemorySpace(globalType.getMemorySpace())) { + if (!hasGlobalMemorySpace(globalType.getMemorySpace())) return emitOpError( "global memref must 
have global address space attribute."); - } + + Type elementType = ldsType.getElementType(); + unsigned width = elementType.getIntOrFloatBitWidth(); + + if (!llvm::is_contained({8, 16, 32, 64}, width)) + return emitOpError( + "element type must be 1, 2, 4, or 8 bytes long but type was ") + << width << " bits long."; + return success(); } @@ -730,37 +737,75 @@ LogicalResult MakeDmaBaseOp::verify() { LogicalResult MakeDmaDescriptorOp::verify() { ArrayRef globalStaticStrides = getGlobalStaticStrides(); - if (globalStaticStrides.empty()) { + if (globalStaticStrides.empty()) return emitOpError("strides must not be empty."); - } - if (globalStaticStrides.back() != 1) { + if (globalStaticStrides.back() != 1) return emitOpError("strides for the innermost dimension must be 1."); - } ArrayRef globalStaticSizes = getGlobalStaticSizes(); size_t rank = globalStaticSizes.size(); - if (rank != globalStaticStrides.size()) { + if (rank > 5) + return emitOpError("tensor and tile must be at most of rank 5."); + if (rank != globalStaticStrides.size()) return emitOpError("strides and sizes must have same rank."); - } ArrayRef sharedStaticSizes = getSharedStaticSizes(); - if (rank != sharedStaticSizes.size()) { + if (rank != sharedStaticSizes.size()) return emitOpError("tensor must have same rank as tile."); - } + + unsigned elementTypeWidth = getElementTypeWidth(); + if (!llvm::is_contained({8, 16, 32, 64}, elementTypeWidth)) + return emitOpError( + "element type width must be 1, 2, 4 or 8 bytes, but was ") + << elementTypeWidth << " bits long"; if (Value atomicBarrierAddress = getAtomicBarrierAddress()) { - MemRefType atomicBarrierAddressType = + auto atomicBarrierAddressType = cast(atomicBarrierAddress.getType()); bool barrierInLDS = hasWorkgroupMemorySpace(atomicBarrierAddressType.getMemorySpace()); - if (!barrierInLDS) { + if (!barrierInLDS) return emitOpError("atomic barrier address must be in LDS."); - } } return success(); } +OpFoldResult MakeDmaDescriptorOp::fold(FoldAdaptor adaptor) { + SmallVector mixedGlobalSizes(getMixedGlobalSizes()); + SmallVector mixedGlobalStrides(getMixedGlobalStrides()); + SmallVector mixedSharedSizes(getMixedSharedSizes()); + + if (failed(foldDynamicIndexList(mixedGlobalSizes, /*onlyNonNegative=*/true, + /*onlyNonZero=*/true)) && + failed(foldDynamicIndexList(mixedGlobalStrides, /*onlyNonNegative=*/true, + /*onlyNonZero=*/true)) && + failed(foldDynamicIndexList(mixedSharedSizes, /*onlyNonNegative=*/true, + /*onlyNonZero=*/true))) + return nullptr; + + SmallVector dynamicGlobalSizes, dynamicGlobalStrides, + dynamicSharedSizes; + SmallVector staticGlobalSizes, staticGlobalStrides, + staticSharedSizes; + + dispatchIndexOpFoldResults(mixedGlobalSizes, dynamicGlobalSizes, + staticGlobalSizes); + setGlobalStaticSizes(staticGlobalSizes); + getGlobalDynamicSizesMutable().assign(dynamicGlobalSizes); + + dispatchIndexOpFoldResults(mixedGlobalStrides, dynamicGlobalStrides, + staticGlobalStrides); + setGlobalStaticStrides(staticGlobalStrides); + getGlobalDynamicStridesMutable().assign(dynamicGlobalStrides); + + dispatchIndexOpFoldResults(mixedSharedSizes, dynamicSharedSizes, + staticSharedSizes); + setSharedStaticSizes(staticSharedSizes); + getSharedDynamicSizesMutable().assign(dynamicSharedSizes); + return getResult(); +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir 
b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index d0ec69d6fea6e..5a65689ec1f93 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -169,6 +169,30 @@ func.func @amdgpu.scaled_ext_packed_matrix_invalid_dst_elem_type(%v: vector<16xf #gpu_lds_addrspace = 3 #amdgpu_fat_buffer_addrspace = 7 +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xf32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_base' op failed to verify that all of {global, lds} have same element type}} + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xf32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + return %0 : !amdgpu.tdm_base +} + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi7, #gpu_global_addrspace>, %smem: memref<8xi7,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_base' op element type must be 1, 2, 4, or 8 bytes long but type was 7 bits long.}} + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi7, #gpu_global_addrspace>, memref<8xi7, #gpu_lds_addrspace> -> !amdgpu.tdm_base + return %0 : !amdgpu.tdm_base +} + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { @@ -210,3 +234,129 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> func.return %0 : !amdgpu.tdm_base } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_descriptor { + // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]] + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) + // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32) + // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32) + // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32) + // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32) + + // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant(131072 : i32) + + // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]] + + // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]] + // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]] + + // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: 
%[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]] + // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]] + + // CHECK-DAG: %[[SGPR4:.+]] = llvm.mlir.constant(128 : i32) + + // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 + // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]] + // CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32 + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]] + // CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32 + + // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64) + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 + // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]] + // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]] + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64 + // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]] + // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32 + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]] + // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]] + + // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32> + // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32] + // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32] + // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32] + // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32] + // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32] + // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32] + // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32] + // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32] + + // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor + %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %descriptor : !amdgpu.tdm_descriptor +} + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + +// CHECK-LABEL: func @make_dma_descriptor_atomic_barrier +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: {{.*}}, %[[IDX:.+]]: index) +func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base, %barrier : memref<8xi32, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor { + // CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64 + // CHECK-DAG: %[[BARRIER_MEMREF_DESC:.+]] = builtin.unrealized_conversion_cast %[[BARRIER]] + // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast 
%[[BASE]] + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) + // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32) + // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32) + // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32) + // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32) + + // CHECK-DAG: %[[SGPR0_0:.+]] = llvm.mlir.constant(131072 : i32) + + // CHECK-DAG: %[[ATOMIC_BARRIER_ENABLE_OFFSET:.+]] = llvm.mlir.constant(18 : i32) + // CHECK: %[[ATOMIC_BARRIER_ENABLE_FIELD:.+]] = llvm.shl %[[C1]], %[[ATOMIC_BARRIER_ENABLE_OFFSET]] + // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_0]], %[[ATOMIC_BARRIER_ENABLE_FIELD]] + + // CHECK: %[[ATOMIC_BARRIER_ALIGNED_PTR:.+]] = llvm.extractvalue %[[BARRIER_MEMREF_DESC]][1] + // CHECK: %[[ATOMIC_BARRIER_ADDR:.+]] = llvm.getelementptr %[[ATOMIC_BARRIER_ALIGNED_PTR]][%[[INDEX]] + // CHECK: %[[ATOMIC_BARRIER_I32:.+]] = llvm.ptrtoint %[[ATOMIC_BARRIER_ADDR]] : !llvm.ptr<3> to i32 + // CHECK: %[[ATOMIC_BARRIER_NO_3_LSB:.+]] = llvm.lshr %[[ATOMIC_BARRIER_I32]], %[[C3]] + // CHECK: %[[MASK:.+]] = llvm.mlir.constant(65535 : i32) + // CHECK: %[[ATOMIC_BARRIER:.+]] = llvm.and %[[ATOMIC_BARRIER_NO_3_LSB]], %[[MASK]] + + // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SGPR1_0:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]] + // CHECK: %[[SGPR1:.+]] = llvm.or %[[ATOMIC_BARRIER]], %[[SGPR1_0]] + + // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32> + // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32] + // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32] + + %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] + globalStride [64, 1] + sharedSize [128, 64] + atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu_lds_addrspace>) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %descriptor : !amdgpu.tdm_descriptor +} diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir new file mode 100644 index 0000000000000..9d43c9940f8e0 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir @@ -0,0 +1,19 @@ +// RUN: mlir-opt --canonicalize %s | FileCheck %s + +// CHECK-LABEL: @make_dma_descriptor_fold +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[IDX:.+]]: index) +func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base, %idx: index) -> !amdgpu.tdm_descriptor { + %c64 = arith.constant 64 : index + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + %0 = amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [64, 64] + globalSize [%c64, %c64] + // CHECK-SAME: globalStride [64, 1] + globalStride [%c64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [%c64, %c64] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %0 : !amdgpu.tdm_descriptor +} diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index b915bfa324c77..6308ea9a6a096 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -373,7 +373,7 @@ func.func 
@make_dma_base_invalid_addressspace(%idx: index, %smem : memref<8xi32, func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op atomic barrier address must be in LDS.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1] sharedSize [0] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor } // ----- @@ -382,7 +382,7 @@ func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base, %barrier: // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -392,7 +392,7 @@ func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} - amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -402,7 +402,7 @@ func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base< // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} - amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -412,6 +412,7 @@ func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_ // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} - amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } + diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 0eccd0a7430bc..e32fd41f91dc8 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -700,45 +700,45 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] : 
!amdgpu.tdm_base -> !amdgpu.tdm_descriptor - sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) padShared(%idx every %idx) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32, #gpu.address_space>) atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu.address_space>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] iterate %idx, %idx, %idx : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor
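
A small hand-written sketch (not part of the patch) that chains the new ops end to end, for reviewers who want to try the descriptor path: it builds a `make_dma_base`, wraps it in a `make_dma_descriptor` that uses the optional `padShared` and `atomicBarrier` clauses together, and issues the `tensor_load_to_lds` shown in the op documentation. The function name, memref shapes, address-space aliases, and the particular pad constants are my own choices; I am also assuming the two optional clauses may be combined, since they are independent optional groups in the assembly format. Constants are picked to respect the documented constraints ($pad_amount in [1, 128], $pad_interval a power of two in [2, 256], barrier in LDS).

```mlir
// Sketch only: mirrors the syntax used in the tests above; names and shapes
// are illustrative, not taken from the patch.
#gpu_global_addrspace = 1
#gpu_lds_addrspace = 3

func.func @tdm_copy_sketch(%mem: memref<128x64xi32, #gpu_global_addrspace>,
                           %smem: memref<128x64xi32, #gpu_lds_addrspace>,
                           %barrier: memref<8xi32, #gpu_lds_addrspace>,
                           %idx: index) {
  %c0 = arith.constant 0 : index
  // pad_amount must be in [1, 128]; pad_interval a power of two in [2, 256].
  %pad_amount = arith.constant 1 : index
  %pad_interval = arith.constant 16 : index

  // Base: global source and LDS destination with matching element types.
  %base = amdgpu.make_dma_base %mem[%c0, %c0], %smem[%c0, %c0]
      : memref<128x64xi32, #gpu_global_addrspace>, memref<128x64xi32, #gpu_lds_addrspace>
      -> !amdgpu.tdm_base

  // Descriptor: 128x64 tensor, innermost stride 1, padded LDS writes, with an
  // 8-byte-aligned atomic barrier living in LDS.
  %desc = amdgpu.make_dma_descriptor %base
      globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64]
      padShared(%pad_amount every %pad_interval)
      atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu_lds_addrspace>)
      : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor

  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
  return
}
```

When lowered with the pattern added in this patch, the pad and barrier fields of this sketch end up in sgpr0/sgpr1 of the second descriptor group (pad enable at bit 20, pad interval at bit 22, pad amount at bit 25, barrier enable at bit 18, barrier address in sgpr1), matching the CHECK lines in gfx1250.mlir.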