
Pipeline fixes
Fix pipeline + update syntax
adam-smnk committed Feb 18, 2025
1 parent 88f1c8a commit eb787b3
Showing 8 changed files with 17 additions and 12 deletions.
11 changes: 7 additions & 4 deletions lib/TPP/DefaultPipeline.cpp
@@ -195,22 +195,25 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
pm.addPass(createSCFToControlFlowPass());
if (defParallel)
  pm.addPass(createConvertOpenMPToLLVMPass());
+pm.addPass(createConvertMathToLLVMPass());

pm.addNestedPass<func::FuncOp>(createGpuAsyncRegionPass());
pm.addPass(createGpuToLLVMConversionPass());
GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
gpuModuleToBinaryPassOptions.compilationTarget = "fatbin";
pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
-pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createAsyncToAsyncRuntimePass());
pm.addPass(createAsyncRuntimeRefCountingPass());
pm.addPass(createConvertAsyncToLLVMPass());
+pm.addPass(createConvertIndexToLLVMPass());

pm.addPass(createConvertFuncToLLVMPass());

-pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-pm.addNestedPass<func::FuncOp>(createCSEPass());
+pm.addPass(createArithToLLVMConversionPass());
+pm.addPass(createConvertControlFlowToLLVMPass());
+pm.addPass(createUBToLLVMConversionPass());
+pm.addPass(createCanonicalizerPass());
+pm.addPass(createCSEPass());
pm.addPass(createReconcileUnrealizedCastsPass());

// Anything useful has been lowered by now.
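Net effect: math lowering moves ahead of the GPU-to-LLVM steps, index lowering is added, and the final arith/controlflow/ub conversions plus cleanups now run module-wide instead of nested under func.func. A minimal standalone sketch of the reordered tail (assumptions: upstream MLIR pass constructors as named above; lowerTail and its includes are illustrative, not part of this commit):

    #include "mlir/Conversion/Passes.h"   // conversion pass constructors
    #include "mlir/IR/BuiltinOps.h"
    #include "mlir/Pass/PassManager.h"
    #include "mlir/Transforms/Passes.h"   // canonicalizer, CSE

    static mlir::LogicalResult lowerTail(mlir::ModuleOp module) {
      mlir::PassManager pm(module.getContext());
      pm.addPass(mlir::createArithToLLVMConversionPass());
      pm.addPass(mlir::createConvertControlFlowToLLVMPass());
      pm.addPass(mlir::createUBToLLVMConversionPass()); // lowers the ub dialect
      pm.addPass(mlir::createCanonicalizerPass());
      pm.addPass(mlir::createCSEPass());
      // Reconcile must run last, once every dialect is already in LLVM form.
      pm.addPass(mlir::createReconcileUnrealizedCastsPass());
      return pm.run(module);
    }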
1 change: 1 addition & 0 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -11,6 +11,7 @@
#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/Pipelines/Passes.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
4 changes: 2 additions & 2 deletions lib/TPP/GPU/GpuToCuda.cpp
@@ -77,6 +77,7 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
pm.addNestedPass<gpu::GPUModuleOp>(createConvertFuncToLLVMPass());
pm.addNestedPass<gpu::GPUModuleOp>(createArithToLLVMConversionPass());
pm.addNestedPass<gpu::GPUModuleOp>(createConvertIndexToLLVMPass());
+pm.addNestedPass<gpu::GPUModuleOp>(createUBToLLVMConversionPass());

GpuNVVMAttachTargetOptions nvvmTargetOptions;
nvvmTargetOptions.triple = gpuTriple;
@@ -85,12 +86,11 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));

// Create CUDA kernels.
-pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());

// Cleanup IR.
// Cleanup IR.
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
#endif // TPP_CUDA_ENABLE
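Note that the new UB conversion is scheduled with addNestedPass<gpu::GPUModuleOp>, so it rewrites only the code inside gpu.module kernels and leaves the host module to the default pipeline above. A minimal sketch of that idiom (pass names as in the diff; the wrapper function is illustrative):

    #include "mlir/Conversion/Passes.h"
    #include "mlir/Dialect/GPU/IR/GPUDialect.h" // gpu::GPUModuleOp
    #include "mlir/Pass/PassManager.h"

    void addKernelOnlyUBLowering(mlir::PassManager &pm) {
      // Runs only on gpu.module ops nested in the top-level module.
      pm.addNestedPass<mlir::gpu::GPUModuleOp>(
          mlir::createUBToLLVMConversionPass());
    }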
3 changes: 3 additions & 0 deletions lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
@@ -832,6 +832,7 @@ struct SimplifyAndCanonicalizePack
void mlir::tpp::populateSimplifyPacking(RewritePatternSet &patterns) {
MLIRContext *ctx = patterns.getContext();
linalg::populateSimplifyPackAndUnpackPatterns(patterns);
+linalg::populateFoldPackUnpackIntoTensorEmptyPatterns(patterns);
tensor::populateFoldTensorEmptyPatterns(patterns);
linalg::PackOp::getCanonicalizationPatterns(patterns, ctx);
linalg::UnPackOp::getCanonicalizationPatterns(patterns, ctx);
@@ -849,6 +850,8 @@ void mlir::tpp::populateSimplifyPacking(RewritePatternSet &patterns) {
patterns, [](OpOperand *operand) {
return isa<tensor::ExpandShapeOp>(operand->get().getDefiningOp());
});
+ctx->getLoadedDialect<linalg::LinalgDialect>()->getCanonicalizationPatterns(
+    patterns);
ctx->getLoadedDialect<tensor::TensorDialect>()->getCanonicalizationPatterns(
patterns);
patterns.add<FoldUnPackIntoInsertSlice>(ctx);
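populateSimplifyPacking only collects patterns; a driver still has to apply them. A minimal sketch of how the enlarged set would typically be consumed (assuming MLIR's greedy driver, applyPatternsGreedily, formerly applyPatternsAndFoldGreedily; the wrapper is illustrative):

    #include "mlir/IR/PatternMatch.h"
    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    mlir::LogicalResult simplifyPacking(mlir::Operation *op) {
      mlir::RewritePatternSet patterns(op->getContext());
      mlir::tpp::populateSimplifyPacking(patterns); // declared in TPP headers
      // Apply to a fixpoint; fails if the driver does not converge.
      return mlir::applyPatternsGreedily(op, std::move(patterns));
    }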
2 changes: 1 addition & 1 deletion test/GPU/CUDA/Integration/gpu-printf.mlir
@@ -9,7 +9,7 @@ module attributes {gpu.container_module} {
%0 = gpu.thread_id x
%csti8 = arith.constant 2 : i8
%cstf32 = arith.constant 3.0 : f32
gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32 : index, i8, f32
gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32 : index, i8, f32
gpu.return
}
}
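This test and set-spirv-abi-attr.mlir below track the same upstream syntax update: gpu.printf now expects a comma between the format string and the argument list, i.e. gpu.printf "fmt\n", %x : index instead of gpu.printf "fmt\n" %x : index.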
2 changes: 1 addition & 1 deletion test/GPU/set-spirv-abi-attr.mlir
@@ -17,7 +17,7 @@ module attributes {gpu.container_module} {
%b1 = gpu.block_id y
%t0 = gpu.thread_id x
%t1 = gpu.thread_id y
gpu.printf "Block (%lld, %lld, 1) - Thread (%lld, %lld, 1)\n" %b0, %b1, %t0, %t1 : index, index, index, index
gpu.printf "Block (%lld, %lld, 1) - Thread (%lld, %lld, 1)\n", %b0, %b1, %t0, %t1 : index, index, index, index
gpu.return
}
}
2 changes: 0 additions & 2 deletions test/Passes/DefaultPipeline/default-pipeline.mlir
@@ -9,8 +9,6 @@ func.func @matmul(%A: tensor<4x8xf32>,
// CHECK: llvm.func @xsmm_gemm_invoke
// CHECK: llvm.func @xsmm_gemm_dispatch
// CHECK: llvm.func @matmul(%[[ARG0:.+]]: !llvm.ptr,
-// CHECK: llvm.insertvalue
-// CHECK: llvm.mlir.constant
// CHECK: llvm.call @xsmm_gemm_dispatch
// CHECK: llvm.call @xsmm_gemm_invoke
// CHECK: llvm.return
4 changes: 2 additions & 2 deletions test/Passes/lower-packs-and-unpacks-without-transpose.mlir
@@ -137,14 +137,14 @@ module {
// CHECK: %[[M_ROUNDED_UP:.*]] = affine.apply {{.*}}()[%[[M_DUP]], %[[M]]]
// CHECK: %[[ARG0_PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[M_ROUNDED_UP]], 0]
// CHECK: %[[M_PADDED:.*]] = tensor.dim %[[ARG0_PADDED]], %[[C0]]
-// CHECK: %[[NUM_CHUNKS_PADDED_M:.*]] = arith.divui %[[M_PADDED]], %[[C32]]
+// CHECK: %[[NUM_CHUNKS_PADDED_M:.*]] = arith.divsi %[[M_PADDED]], %[[C32]]
// CHECK: %[[EXP0:.+]] = tensor.expand_shape %[[ARG0_PADDED]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[NUM_CHUNKS_PADDED_M]], 32, 16, 32] : tensor<?x512xf32> into tensor<?x32x16x32xf32>
// CHECK: %[[M_ARG1:.*]] = tensor.dim %[[ARG1]], %[[C0]]
// CHECK: %[[M_ARG1_DUP:.*]] = tensor.dim %[[ARG1]], %[[C0]]
// CHECK: %[[M_ARG1_ROUNDED_UP:.*]] = affine.apply {{.*}}()[%[[M_ARG1_DUP]], %[[M_ARG1]]]
// CHECK: %[[ARG1_PADDED:.*]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[M_ARG1_ROUNDED_UP]], 0]
// CHECK: %[[M_ARG1_PADDED:.*]] = tensor.dim %[[ARG1_PADDED]], %[[C0]]
-// CHECK: %[[NUM_CHUNKS_PADDED_M_ARG1:.*]] = arith.divui %[[M_ARG1_PADDED]], %[[C32]]
+// CHECK: %[[NUM_CHUNKS_PADDED_M_ARG1:.*]] = arith.divsi %[[M_ARG1_PADDED]], %[[C32]]
// CHECK: %[[EXP1:.+]] = tensor.expand_shape %[[ARG1_PADDED]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[NUM_CHUNKS_PADDED_M_ARG1]], 32, 8, 32] : tensor<?x256xf32> into tensor<?x32x8x32xf32>
// CHECK: %[[RES:.+]] = linalg.generic {{.*}} ins(%[[EXP0]], %[[CST]] : tensor<?x32x16x32xf32>, tensor<8x16x32x32xf32>) outs(%[[EXP1]] : tensor<?x32x8x32xf32>)
// CHECK: %[[COL:.+]] = tensor.collapse_shape %[[RES]] {{\[}}[0, 1], [2, 3]{{\]}} : tensor<?x32x8x32xf32> into tensor<?x256xf32>
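The two CHECK updates track an upstream lowering change: the chunk count along the padded dimension is now computed with signed division (arith.divsi) instead of unsigned (arith.divui). The value is identical here, since the padded size is a non-negative multiple of 32, so only the spelled op changes.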
