From eb787b3dcd93251132857ea6fcc19c7b844285ab Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk
Date: Tue, 18 Feb 2025 12:32:05 +0100
Subject: [PATCH] Pipeline fixes

Fix pipeline + update syntax
---
 lib/TPP/DefaultPipeline.cpp                        | 11 +++++++----
 lib/TPP/GPU/GpuPipeline.cpp                        |  1 +
 lib/TPP/GPU/GpuToCuda.cpp                          |  4 ++--
 lib/TPP/Transforms/ToBlockLayoutAndBack.cpp        |  3 +++
 test/GPU/CUDA/Integration/gpu-printf.mlir          |  2 +-
 test/GPU/set-spirv-abi-attr.mlir                   |  2 +-
 test/Passes/DefaultPipeline/default-pipeline.mlir  |  2 --
 .../lower-packs-and-unpacks-without-transpose.mlir |  4 ++--
 8 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp
index a6515607f..d17b033a0 100644
--- a/lib/TPP/DefaultPipeline.cpp
+++ b/lib/TPP/DefaultPipeline.cpp
@@ -195,22 +195,25 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
     pm.addPass(createSCFToControlFlowPass());
     if (defParallel)
       pm.addPass(createConvertOpenMPToLLVMPass());
-    pm.addPass(createConvertMathToLLVMPass());
 
     pm.addNestedPass<func::FuncOp>(createGpuAsyncRegionPass());
     pm.addPass(createGpuToLLVMConversionPass());
 
     GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
     gpuModuleToBinaryPassOptions.compilationTarget = "fatbin";
     pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
 
+    pm.addPass(createConvertMathToLLVMPass());
     pm.addPass(createAsyncToAsyncRuntimePass());
     pm.addPass(createAsyncRuntimeRefCountingPass());
     pm.addPass(createConvertAsyncToLLVMPass());
+    pm.addPass(createConvertIndexToLLVMPass());
     pm.addPass(createConvertFuncToLLVMPass());
-    pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-    pm.addNestedPass<func::FuncOp>(createCSEPass());
+    pm.addPass(createArithToLLVMConversionPass());
+    pm.addPass(createConvertControlFlowToLLVMPass());
+    pm.addPass(createUBToLLVMConversionPass());
+    pm.addPass(createCanonicalizerPass());
+    pm.addPass(createCSEPass());
     pm.addPass(createReconcileUnrealizedCastsPass());
 
     // Anything useful has been lowered by now.
diff --git a/lib/TPP/GPU/GpuPipeline.cpp b/lib/TPP/GPU/GpuPipeline.cpp
index 4a0118fc3..99227eadb 100644
--- a/lib/TPP/GPU/GpuPipeline.cpp
+++ b/lib/TPP/GPU/GpuPipeline.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/Pipelines/Passes.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
diff --git a/lib/TPP/GPU/GpuToCuda.cpp b/lib/TPP/GPU/GpuToCuda.cpp
index 3c1e156c0..2c7edfa46 100644
--- a/lib/TPP/GPU/GpuToCuda.cpp
+++ b/lib/TPP/GPU/GpuToCuda.cpp
@@ -77,6 +77,7 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertFuncToLLVMPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createArithToLLVMConversionPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertIndexToLLVMPass());
+    pm.addNestedPass<gpu::GPUModuleOp>(createUBToLLVMConversionPass());
 
     GpuNVVMAttachTargetOptions nvvmTargetOptions;
     nvvmTargetOptions.triple = gpuTriple;
@@ -85,12 +86,11 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
     pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
 
     // Create CUDA kernels.
-    pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
 
-    // Cleanup IR.
+    // // Cleanup IR.
     pm.addPass(createCanonicalizerPass());
     pm.addPass(createCSEPass());
 #endif // TPP_CUDA_ENABLE
diff --git a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
index b63506c16..a5fb6f68d 100644
--- a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
+++ b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
@@ -832,6 +832,7 @@ struct SimplifyAndCanonicalizePack
 void mlir::tpp::populateSimplifyPacking(RewritePatternSet &patterns) {
   MLIRContext *ctx = patterns.getContext();
   linalg::populateSimplifyPackAndUnpackPatterns(patterns);
+  linalg::populateFoldPackUnpackIntoTensorEmptyPatterns(patterns);
   tensor::populateFoldTensorEmptyPatterns(patterns);
   linalg::PackOp::getCanonicalizationPatterns(patterns, ctx);
   linalg::UnPackOp::getCanonicalizationPatterns(patterns, ctx);
@@ -849,6 +850,8 @@ void mlir::tpp::populateSimplifyPacking(RewritePatternSet &patterns) {
       patterns, [](OpOperand *operand) {
         return isa(operand->get().getDefiningOp());
       });
+  ctx->getLoadedDialect()->getCanonicalizationPatterns(
+      patterns);
   ctx->getLoadedDialect()->getCanonicalizationPatterns(
       patterns);
   patterns.add<SimplifyAndCanonicalizePack>(ctx);
diff --git a/test/GPU/CUDA/Integration/gpu-printf.mlir b/test/GPU/CUDA/Integration/gpu-printf.mlir
index 3002aaa09..9373a4a7f 100644
--- a/test/GPU/CUDA/Integration/gpu-printf.mlir
+++ b/test/GPU/CUDA/Integration/gpu-printf.mlir
@@ -9,7 +9,7 @@ module attributes {gpu.container_module} {
       %0 = gpu.thread_id x
       %csti8 = arith.constant 2 : i8
       %cstf32 = arith.constant 3.0 : f32
-      gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32 : index, i8, f32
+      gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32 : index, i8, f32
       gpu.return
     }
   }
diff --git a/test/GPU/set-spirv-abi-attr.mlir b/test/GPU/set-spirv-abi-attr.mlir
index 6496bb48e..89f24cc65 100644
--- a/test/GPU/set-spirv-abi-attr.mlir
+++ b/test/GPU/set-spirv-abi-attr.mlir
@@ -17,7 +17,7 @@ module attributes {gpu.container_module} {
      %b1 = gpu.block_id y
      %t0 = gpu.thread_id x
      %t1 = gpu.thread_id y
-     gpu.printf "Block (%lld, %lld, 1) - Thread (%lld, %lld, 1)\n" %b0, %b1, %t0, %t1 : index, index, index, index
+     gpu.printf "Block (%lld, %lld, 1) - Thread (%lld, %lld, 1)\n", %b0, %b1, %t0, %t1 : index, index, index, index
      gpu.return
    }
  }
diff --git a/test/Passes/DefaultPipeline/default-pipeline.mlir b/test/Passes/DefaultPipeline/default-pipeline.mlir
index fd8118a86..18e300f22 100644
--- a/test/Passes/DefaultPipeline/default-pipeline.mlir
+++ b/test/Passes/DefaultPipeline/default-pipeline.mlir
@@ -9,8 +9,6 @@ func.func @matmul(%A: tensor<4x8xf32>,
 // CHECK: llvm.func @xsmm_gemm_invoke
 // CHECK: llvm.func @xsmm_gemm_dispatch
 // CHECK: llvm.func @matmul(%[[ARG0:.+]]: !llvm.ptr,
-// CHECK: llvm.insertvalue
-// CHECK: llvm.mlir.constant
 // CHECK: llvm.call @xsmm_gemm_dispatch
 // CHECK: llvm.call @xsmm_gemm_invoke
 // CHECK: llvm.return
diff --git a/test/Passes/lower-packs-and-unpacks-without-transpose.mlir b/test/Passes/lower-packs-and-unpacks-without-transpose.mlir
index 00f982d72..1d9d48438 100644
--- a/test/Passes/lower-packs-and-unpacks-without-transpose.mlir
+++ b/test/Passes/lower-packs-and-unpacks-without-transpose.mlir
@@ -137,14 +137,14 @@ module {
   // CHECK: %[[M_ROUNDED_UP:.*]] = affine.apply {{.*}}()[%[[M_DUP]], %[[M]]]
   // CHECK: %[[ARG0_PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[M_ROUNDED_UP]], 0]
   // CHECK: %[[M_PADDED:.*]] = tensor.dim %[[ARG0_PADDED]], %[[C0]]
-  // CHECK: %[[NUM_CHUNKS_PADDED_M:.*]] = arith.divui %[[M_PADDED]], %[[C32]]
+  // CHECK: %[[NUM_CHUNKS_PADDED_M:.*]] = arith.divsi %[[M_PADDED]], %[[C32]]
   // CHECK: %[[EXP0:.+]] = tensor.expand_shape %[[ARG0_PADDED]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[NUM_CHUNKS_PADDED_M]], 32, 16, 32] : tensor<?x512xf32> into tensor<?x32x16x32xf32>
   // CHECK: %[[M_ARG1:.*]] = tensor.dim %[[ARG1]], %[[C0]]
   // CHECK: %[[M_ARG1_DUP:.*]] = tensor.dim %[[ARG1]], %[[C0]]
   // CHECK: %[[M_ARG1_ROUNDED_UP:.*]] = affine.apply {{.*}}()[%[[M_ARG1_DUP]], %[[M_ARG1]]]
   // CHECK: %[[ARG1_PADDED:.*]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[M_ARG1_ROUNDED_UP]], 0]
   // CHECK: %[[M_ARG1_PADDED:.*]] = tensor.dim %[[ARG1_PADDED]], %[[C0]]
-  // CHECK: %[[NUM_CHUNKS_PADDED_M_ARG1:.*]] = arith.divui %[[M_ARG1_PADDED]], %[[C32]]
+  // CHECK: %[[NUM_CHUNKS_PADDED_M_ARG1:.*]] = arith.divsi %[[M_ARG1_PADDED]], %[[C32]]
   // CHECK: %[[EXP1:.+]] = tensor.expand_shape %[[ARG1_PADDED]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[NUM_CHUNKS_PADDED_M_ARG1]], 32, 8, 32] : tensor<?x256xf32> into tensor<?x32x8x32xf32>
   // CHECK: %[[RES:.+]] = linalg.generic {{.*}} ins(%[[EXP0]], %[[CST]] : tensor<?x32x16x32xf32>, tensor<8x16x32x32xf32>) outs(%[[EXP1]] : tensor<?x32x8x32xf32>)
   // CHECK: %[[COL:.+]] = tensor.collapse_shape %[[RES]] {{\[}}[0, 1], [2, 3]{{\]}} : tensor<?x32x8x32xf32> into tensor<?x256xf32>