From b005ec275247f68c2f8c1a94414928055b2f0d53 Mon Sep 17 00:00:00 2001 From: Vivian Zhang Date: Fri, 21 Feb 2025 12:37:07 -0800 Subject: [PATCH] Bump IREE to 055ce1f (#1124) The main change is to update `tensor.pack/unpack` to `linalg.pack/unpack`, following the upstream change https://github.com/llvm/llvm-project/pull/123902. --- .../AMDAIEBufferizeToAllocation.cpp | 2 +- .../Transforms/AMDAIEFuseProducerIntoLoop.cpp | 12 +-- .../Transforms/AMDAIEPackAndTranspose.cpp | 6 +- .../Transforms/AMDAIETileAndFuse.cpp | 8 +- .../iree-amd-aie/Transforms/Passes.cpp | 2 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 2 +- .../test/bufferize_to_allocation.mlir | 84 +++++++++---------- .../bufferize_to_allocation_pack_or_copy.mlir | 36 ++++---- .../test/create_reference_to_allocation.mlir | 12 +-- .../test/fuse_consumer_into_loop.mlir | 72 ++++++++-------- .../test/fuse_producer_into_loop.mlir | 64 +++++++------- .../test/pack_and_transpose_level1.mlir | 12 +-- .../test/pack_and_transpose_level2.mlir | 24 +++--- .../test/propagate_data_layout.mlir | 34 ++++---- third_party/iree | 2 +- 15 files changed, 186 insertions(+), 186 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp index 969fa9f34..afaa8a569 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp @@ -76,7 +76,7 @@ static FailureOr<SmallVector<Value>> getPackOrCopyOperands( uint32_t currentLevel{0}; Operation *currentOp = input.value().getDefiningOp(); while (currentLevel < depthLevel && currentOp != nullptr) { - if (dyn_cast<tensor::PackOp>(currentOp)) { + if (dyn_cast<linalg::PackOp>(currentOp)) { currentLevel++; if (currentLevel == depthLevel) break; } else if (dyn_cast<linalg::CopyOp>(currentOp)) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp index 2c6366718..a53ced50b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp @@ -21,7 +21,7 @@ namespace { /// A utility function specific to this pass which, given a value `operand`, /// traverses the def-chain till it finds a tensor.extract_slice. Currently, -/// the two producer ops that are allowed in the def-chain are tensor.pack and +/// the two producer ops that are allowed in the def-chain are linalg.pack and /// linalg.copy ops. The 2 cases where it successfully finds and returns an /// extract_slice (SLICE) are: /// @@ -39,7 +39,7 @@ namespace { static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp( Value operand) { // Roll back through all the pack or copy ops immediately preceding `operand`. - while (isa_and_present<tensor::PackOp, linalg::CopyOp>( + while (isa_and_present<linalg::PackOp, linalg::CopyOp>( operand.getDefiningOp())) { operand = operand.getDefiningOp()->getOperand(0); } @@ -49,7 +49,7 @@ static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp( if (!sliceOp) return failure(); // Case 1 outlined above.
- if (isa_and_present<tensor::PackOp, linalg::CopyOp>( + if (isa_and_present<linalg::PackOp, linalg::CopyOp>( sliceOp.getSource().getDefiningOp())) { return sliceOp; } @@ -60,7 +60,7 @@ static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp( LoopLikeOpInterface loop = dyn_cast<LoopLikeOpInterface>(parent); if (!loop) return failure(); Operation *operandParent = loop.getTiedLoopInit(blkArg)->getOwner(); - if (isa_and_present<tensor::PackOp, linalg::CopyOp>(operandParent)) + if (isa_and_present<linalg::PackOp, linalg::CopyOp>(operandParent)) return sliceOp; } @@ -110,7 +110,7 @@ void AMDAIEFuseProducerIntoLoopPass::runOnOperation() { LoopLikeOpInterface loops = cast<LoopLikeOpInterface>(scfLoopOp); // Based on the `fuseDepth`, we would greedily fuse the producers of a linalg - // computation op. Currently, we are limiting the producers to tensor.pack or + // computation op. Currently, we are limiting the producers to linalg.pack or // linalg.copy ops. for (unsigned depth = 1; depth <= fuseDepth; depth++) { // Search the last compute op in the loop and its producer slices. @@ -153,7 +153,7 @@ void AMDAIEFuseProducerIntoLoopPass::runOnOperation() { // Case where operand of a generic op is a pack/copy op which is in a // different block than the generic's block. - else if (isa_and_present<tensor::PackOp, linalg::CopyOp>( + else if (isa_and_present<linalg::PackOp, linalg::CopyOp>( operand.getDefiningOp())) { Operation *parent = operand.getDefiningOp(); Block *genericBlock = genericOp->getBlock(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp index 62544391e..2ee70ee9e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp @@ -106,9 +106,9 @@ void AMDAIEPackAndTransposePass::runOnOperation() { } // Step 3. Pack Transpose - SmallVector<tensor::PackOp> packOps = packResult->packOps; + SmallVector<linalg::PackOp> packOps = packResult->packOps; linalg::LinalgOp packedOp = packResult->packedLinalgOp; - SmallVector<tensor::UnPackOp> unpackOps = packResult->unPackOps; + SmallVector<linalg::UnPackOp> unpackOps = packResult->unPackOps; if (packOps.size() != 3 || !packedOp || unpackOps.empty()) { funcOp->emitOpError("failed to get correct pack and unpack ops"); @@ -122,7 +122,7 @@ for (auto [index, unpackEmpty, innerPerm, outerPerm] : llvm::zip(packIndices, unpackArr, innerPermArr, outerPermArr)) { - tensor::UnPackOp unpackOp; + linalg::UnPackOp unpackOp; if (unpackEmpty) { unpackOp = unpackOps.back(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index bb05de0f1..c267a239f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -178,8 +178,8 @@ static bool isTilingReductionDimension(TilingInterface consumerOp, } static bool consumerToSkip(TilingInterface op) { - if (isa<linalg::CopyOp>(op) || isa<tensor::PackOp>(op) || - isa<tensor::UnPackOp>(op)) + if (isa<linalg::CopyOp>(op) || isa<linalg::PackOp>(op) || + isa<linalg::UnPackOp>(op)) return true; return false; } @@ -279,7 +279,7 @@ void AMDAIETileAndFusePass::runOnOperation() { TilingInterface consumerOp; funcOp->walk([&](TilingInterface op) { // Find the next consumer op if it does not have loops OR it is from - // the skip ops list which currently contains linalg.copy and tensor.unpack. + // the skip ops list which currently contains linalg.copy and linalg.unpack.
if (op.getLoopIteratorTypes().empty() || consumerToSkip(op)) return WalkResult::advance(); @@ -356,7 +356,7 @@ void AMDAIETileAndFusePass::runOnOperation() { bool fusableOp = TypeSwitch(originalProducer.getOwner()) // List ops that shouldnt be fused. - .Case([](Operation *) { return false; }) // Fuse all Linalg ops (can be generalized later) .Default([&](Operation *op) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index de7f4d103..caa707a6f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -794,7 +794,7 @@ void addMLIRAIELoweringPasses(OpPassManager &pm) { pm.addPass(createCanonicalizerPass()); pm.addPass(createConvertLinalgToLoopsPass()); pm.addPass(createLowerAffinePass()); - pm.addPass(createConvertSCFToCFPass()); + pm.addPass(createSCFToControlFlowPass()); { OpPassManager &devicePM = pm.nest(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 550920c0b..21c2624de 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -354,7 +354,7 @@ def AMDAIEFuseProducerIntoLoop : let description = [{ Greedily fuse the producers of a linalg computation op based on the `fuseDepth`. Currently, the two producer ops that are allowed in the defining op chain are - tensor.pack and linalg.copy ops. + linalg.pack and linalg.copy ops. }]; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFuseProducerIntoLoopPass()"; let options = [ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir index a4d359f29..c3e3d8b7e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir @@ -14,14 +14,14 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x %c0 = arith.constant 0 : index %5 = tensor.empty() : tensor<1024x512xi32> %6 = tensor.empty() : tensor<16x32x64x64xi32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> %7 = tensor.empty() : tensor<32x8x64x64xi32> - %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> + %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> %8 = tensor.empty() : tensor<16x8x64x64xi32> %9 = tensor.empty() : tensor<16x32x16x8x4x8xi32> - %pack_1 = tensor.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32> + %pack_1 = linalg.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32> %10 = tensor.empty() : tensor<32x8x8x8x8x8xi32> - %pack_2 = tensor.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : 
tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32> + %pack_2 = linalg.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32> %11 = tensor.empty() : tensor<16x8x16x8x4x8xi32> %12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<16x8x16x8x4x8xi32>) -> tensor<16x8x16x8x4x8xi32> %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_1, %pack_2 : tensor<16x32x16x8x4x8xi32>, tensor<32x8x8x8x8x8xi32>) outs(%12 : tensor<16x8x16x8x4x8xi32>) { @@ -30,48 +30,48 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x %15 = arith.addi %out, %14 : i32 linalg.yield %15 : i32 } -> tensor<16x8x16x8x4x8xi32> - %unpack = tensor.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32> - %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> + %unpack = linalg.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32> + %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> return %unpack_3 : tensor<1024x512xi32> } // LINALG-INPUT-OUTPUT-NOT: memref.alloc -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT-NOT: memref.alloc -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32> // LINALG-INPUT-OUTPUT: bufferization.to_tensor -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32> // LINALG-INPUT-OUTPUT: bufferization.to_tensor -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32> // LINALG-INPUT-OUTPUT: bufferization.to_tensor // LINALG-INPUT-OUTPUT: linalg.fill // LINALG-INPUT-OUTPUT: linalg.generic // LINALG-INPUT-NOT: memref.alloc -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT-NOT: memref.alloc -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32> // LINALG-INPUT: bufferization.to_tensor -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32> // LINALG-INPUT: bufferization.to_tensor -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT-NOT: memref.alloc // LINALG-INPUT: linalg.fill // LINALG-INPUT: linalg.generic // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32> // LINALG-OUTPUT: bufferization.to_tensor // LINALG-OUTPUT: linalg.fill @@ -79,14 +79,14 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x // PACK-INPUT: memref.alloc() : 
memref<16x32x64x64xi32, 1 : i32> // PACK-INPUT: bufferization.to_tensor -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT: memref.alloc() : memref<32x8x64x64xi32, 1 : i32> // PACK-INPUT: bufferization.to_tensor -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT-NOT: memref.alloc -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT-NOT: memref.alloc -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT-NOT: memref.alloc // PACK-INPUT: linalg.fill // PACK-INPUT: linalg.generic @@ -105,14 +105,14 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024 %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg4] [512, 64] [1, 1] : tensor<512x1024xi8> to tensor<512x64xi8> %extracted_slice_1 = tensor.extract_slice %0[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32> %2 = tensor.empty() : tensor<1x16x64x32xi8> - %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8> + %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8> %3 = tensor.empty() : tensor<16x1x32x64xi8> - %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8> + %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8> %4 = tensor.empty() : tensor<1x1x64x64xi32> %5 = tensor.empty() : tensor<1x16x4x16x4x8xi8> - %pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8> + %pack_3 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8> %6 = tensor.empty() : tensor<16x1x8x4x8x8xi8> - %pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8> + %pack_4 = linalg.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8> %7 = tensor.empty() : tensor<1x1x8x16x4x8xi32> %8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<1x1x8x16x4x8xi32>) -> tensor<1x1x8x16x4x8xi32> %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<1x16x4x16x4x8xi8>, tensor<16x1x8x4x8x8xi8>) outs(%8 : tensor<1x1x8x16x4x8xi32>) { @@ -125,17 +125,17 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024 } -> tensor<1x1x8x16x4x8xi32> %extracted_slice_5 = tensor.extract_slice %arg2[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32> %extracted_slice_6 = tensor.extract_slice %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32> - %pack_7 = tensor.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> - %pack_8 = tensor.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> - %pack_9 = tensor.pack %pack_7 
outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> - %pack_10 = tensor.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> + %pack_7 = linalg.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> + %pack_8 = linalg.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> + %pack_9 = linalg.pack %pack_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> + %pack_10 = linalg.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> %10 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%9, %pack_10 : tensor<1x1x8x16x4x8xi32>, tensor<1x1x8x16x4x8xi32>) outs(%pack_9 : tensor<1x1x8x16x4x8xi32>) { ^bb0(%in: i32, %in_12: i32, %out: i32): %11 = arith.addi %in, %in_12 : i32 linalg.yield %11 : i32 } -> tensor<1x1x8x16x4x8xi32> - %unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_11 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_11 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_11 into %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -143,31 +143,31 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024 return %1 : tensor<1024x1024xi32> } -// ELEMENTWISE-INPUT-COUNT-4: tensor.pack +// ELEMENTWISE-INPUT-COUNT-4: linalg.pack // ELEMENTWISE-INPUT: linalg.fill // ELEMENTWISE-INPUT: linalg.generic // ELEMENTWISE-INPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32> // ELEMENTWISE-INPUT: bufferization.to_tensor -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT: linalg.generic -// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: linalg.fill // ELEMENTWISE-INPUT-OUTPUT: linalg.generic // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32> // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor -// 
ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32> // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor -// ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: linalg.generic diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir index a8d2b59a9..7d08dbe6f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir @@ -5,11 +5,11 @@ // CHECK: scf.forall // CHECK: %[[ALLOC_0:.+]] = memref.alloc() : memref<4x8x32x64xbf16, 1 : i32> // CHECK: %[[TO_TENSOR_0:.+]] = bufferization.to_tensor %[[ALLOC_0]] -// CHECK: %[[PACK_0:.+]] = tensor.pack +// CHECK: %[[PACK_0:.+]] = linalg.pack // CHECK-SAME: into %[[TO_TENSOR_0]] // CHECK: %[[ALLOC_1:.+]] = memref.alloc() : memref<4x8x64x32xbf16, 1 : i32> // CHECK: %[[TO_TENSOR_1:.+]] = bufferization.to_tensor %[[ALLOC_1]] -// CHECK: %[[PACK_1:.+]] = tensor.pack +// CHECK: %[[PACK_1:.+]] = linalg.pack // CHECK-SAME: into %[[TO_TENSOR_1]] // CHECK: scf.forall // CHECK: %[[SLICE_0:.+]] = tensor.extract_slice %[[PACK_0]] @@ -17,9 +17,9 @@ // CHECK: linalg.fill // CHECK: scf.for // CHECK: %[[SLICE_2:.+]] = tensor.extract_slice %[[SLICE_0]] -// CHECK: %[[PACK_2:.+]] = tensor.pack %[[SLICE_2]] +// CHECK: %[[PACK_2:.+]] = linalg.pack %[[SLICE_2]] // CHECK: %[[SLICE_3:.+]] = tensor.extract_slice %[[SLICE_1]] -// CHECK: %[[PACK_3:.+]] = tensor.pack %[[SLICE_3]] +// CHECK: %[[PACK_3:.+]] = linalg.pack %[[SLICE_3]] // CHECK: linalg.generic func.func @matmul_tensor_extract_slice() { %c1 = arith.constant 1 : index @@ -37,9 +37,9 @@ func.func @matmul_tensor_extract_slice() { %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 128] [1, 1] : tensor<512x4096xbf16> to tensor<512x128xbf16> %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<512x4096xf32> to tensor<128x128xf32> %7 = tensor.empty() : tensor<4x8x32x64xbf16> - %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<128x512xbf16> -> tensor<4x8x32x64xbf16> + %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<128x512xbf16> -> tensor<4x8x32x64xbf16> %8 = tensor.empty() : tensor<4x8x64x32xbf16> - %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x128xbf16> -> tensor<4x8x64x32xbf16> + %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x128xbf16> -> tensor<4x8x64x32xbf16> %alloc = memref.alloc() : memref<4x4x32x32xf32, 1 : i32> %9 = bufferization.to_tensor %alloc restrict writable : memref<4x4x32x32xf32, 1 : i32> to tensor<4x4x32x32xf32> %10 = tensor.empty() : tensor<4x4x8x8x4x4xf32> @@ -53,10 +53,10 @@ func.func @matmul_tensor_extract_slice() { %15 = scf.for %arg6 = %c0 to %c8 step %c1 iter_args(%arg7 = %14) -> (tensor<2x2x8x8x4x4xf32>) { %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg6, 0, 0] [2, 1, 32, 64] [1, 1, 1, 1] : tensor<2x8x32x64xbf16> to tensor<2x1x32x64xbf16> %extracted_slice_8 = tensor.extract_slice 
%12[0, %arg6, 0, 0, 0, 0] [2, 1, 8, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x8x8x4x8xbf16> to tensor<2x1x8x8x4x8xbf16> - %pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_8 : tensor<2x1x32x64xbf16> -> tensor<2x1x8x8x4x8xbf16> + %pack_9 = linalg.pack %extracted_slice_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_8 : tensor<2x1x32x64xbf16> -> tensor<2x1x8x8x4x8xbf16> %extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, %arg6, 0, 0] [2, 1, 64, 32] [1, 1, 1, 1] : tensor<2x8x64x32xbf16> to tensor<2x1x64x32xbf16> %extracted_slice_11 = tensor.extract_slice %13[0, %arg6, 0, 0, 0, 0] [2, 1, 8, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<2x8x8x8x8x4xbf16> to tensor<2x1x8x8x8x4xbf16> - %pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<2x1x64x32xbf16> -> tensor<2x1x8x8x8x4xbf16> + %pack_12 = linalg.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<2x1x64x32xbf16> -> tensor<2x1x8x8x8x4xbf16> %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_9, %pack_12 : tensor<2x1x8x8x4x8xbf16>, tensor<2x1x8x8x8x4xbf16>) outs(%arg7 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config, packing_config = #amdaie.packing_config} { ^bb0(%in: bf16, %in_13: bf16, %out: f32): %17 = arith.extf %in : bf16 to f32 @@ -71,8 +71,8 @@ func.func @matmul_tensor_extract_slice() { tensor.parallel_insert_slice %15 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [2, 2, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> into tensor<4x4x8x8x4x4xf32> } } {mapping = [#gpu.block, #gpu.block]} - %unpack = tensor.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<4x4x8x8x4x4xf32> -> tensor<4x4x32x32xf32> - %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<4x4x32x32xf32> -> tensor<128x128xf32> + %unpack = linalg.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<4x4x8x8x4x4xf32> -> tensor<4x4x32x32xf32> + %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<4x4x32x32xf32> -> tensor<128x128xf32> memref.dealloc %alloc : memref<4x4x32x32xf32, 1 : i32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xf32> into tensor<512x4096xf32> @@ -89,12 +89,12 @@ func.func @matmul_tensor_extract_slice() { // CHECK: bufferization.to_tensor // CHECK: linalg.copy // CHECK-NOT: memref.alloc -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: memref.alloc() : memref<4x1x32x32xi32, 1 : i32> // CHECK: bufferization.to_tensor // CHECK: linalg.copy // CHECK-NOT: memref.alloc -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: linalg.generic #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map1 = affine_map<(d0, d1, 
d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d5, d4, d7, d8)> @@ -109,16 +109,16 @@ func.func @copy_pack_matmul(%arg0: tensor<4x1x32x32xi32>, %arg1: tensor<4x1x32x3 %4 = tensor.empty() : tensor<4x1x4x8x4x8xi32> %5 = tensor.empty() : tensor<4x4x8x8x4x4xi32> %6 = linalg.copy ins(%arg0 : tensor<4x1x32x32xi32>) outs(%0 : tensor<4x1x32x32xi32>) -> tensor<4x1x32x32xi32> - %pack = tensor.pack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %3 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> + %pack = linalg.pack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %3 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> %7 = linalg.copy ins(%arg1 : tensor<4x1x32x32xi32>) outs(%1 : tensor<4x1x32x32xi32>) -> tensor<4x1x32x32xi32> - %pack_0 = tensor.pack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> + %pack_0 = linalg.pack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> %8 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<4x1x4x8x4x8xi32>, tensor<4x1x4x8x4x8xi32>) outs(%5 : tensor<4x4x8x8x4x4xi32>) { ^bb0(%in: i32, %in_1: i32, %out: i32): %9 = arith.muli %in, %in_1 : i32 %10 = arith.addi %out, %9 : i32 linalg.yield %10 : i32 } -> tensor<4x4x8x8x4x4xi32> - %unpack = tensor.unpack %8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> + %unpack = linalg.unpack %8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> return %unpack : tensor<4x4x32x32xi32> } @@ -133,9 +133,9 @@ func.func @pack_error(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512xi32 %c0 = arith.constant 0 : index %5 = tensor.empty() : tensor<1024x512xi32> %6 = tensor.empty() : tensor<16x32x64x64xi32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> %7 = tensor.empty() : tensor<32x8x64x64xi32> - %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> + %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> %8 = tensor.empty() : tensor<16x8x64x64xi32> %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<16x8x64x64xi32>) -> tensor<16x8x64x64xi32> // expected-error @+2 {{could not fetch operands to bufferize}} @@ -146,6 +146,6 @@ func.func @pack_error(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512xi32 %15 = arith.addi %out, %14 : i32 linalg.yield %15 : i32 } -> tensor<16x8x64x64xi32> - %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> + %unpack = linalg.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> return %unpack : tensor<1024x512xi32> } diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir index 6d9307c83..a70432956 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir @@ -129,22 +129,22 @@ func.func @matmul_example(%arg0: tensor<128x256xi32>, %arg1: tensor<256x128xi32> %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %8] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32> %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %9 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32> to tensor<2x1x32x32xi32> - %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32> + %pack = linalg.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32> %extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%8, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32> %alloc_7 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> %10 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xi32, 1 : i32> to tensor<1x2x32x32xi32> - %pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32> + %pack_8 = linalg.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32> %11 = scf.forall (%arg7, %arg8) in (2, 2) shared_outs(%arg9 = %arg6) -> (tensor<2x2x8x8x4x4xi32>) { %extracted_slice_9 = tensor.extract_slice %pack[%arg7, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32> %extracted_slice_10 = tensor.extract_slice %5[%arg7, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32> %alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %12 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32> to tensor<1x1x4x8x4x8xi32> - %pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32> + %pack_12 = linalg.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32> %extracted_slice_13 = tensor.extract_slice %pack_8[0, %arg8, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32> %extracted_slice_14 = tensor.extract_slice %6[0, %arg8, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32> %alloc_15 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> %13 = bufferization.to_tensor %alloc_15 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32> to tensor<1x1x8x4x8x4xi32> - %pack_16 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32> + %pack_16 = linalg.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32> 
%extracted_slice_17 = tensor.extract_slice %arg9[%arg7, %arg8, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32> %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_16 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xi32>) { ^bb0(%in: i32, %in_18: i32, %out: i32): @@ -162,8 +162,8 @@ func.func @matmul_example(%arg0: tensor<128x256xi32>, %arg1: tensor<256x128xi32> memref.dealloc %alloc_7 : memref<1x2x32x32xi32, 1 : i32> scf.yield %11 : tensor<2x2x8x8x4x4xi32> } - %unpack = tensor.unpack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32> - %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32> + %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32> memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32> memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32> scf.forall.in_parallel { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir index b8d91fc27..837c2eaae 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir @@ -22,11 +22,11 @@ // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV0]]) // CHECK-DAG: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK-DAG: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK-DAG: %[[TILED_UNPACK:.*]] = linalg.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_UNPACK]] // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -72,8 +72,8 @@ module { } %3 = tensor.empty() : tensor<64x64xi32> %4 = tensor.empty() : tensor<1x1x64x64xi32> - %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> 
tensor<64x64xi32> + %unpack = linalg.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -115,11 +115,11 @@ module { // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV0]]) // CHECK-DAG: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK-DAG: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK-DAG: %[[TILED_UNPACK:.*]] = linalg.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK-DAG: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_ELEM]], %[[YIELD_UNPACK]] // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -171,8 +171,8 @@ module { %7 = arith.addi %in, %in_1 : i32 linalg.yield %7 : i32 } -> tensor<1x1x8x16x4x8xi32> - %unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -222,13 +222,13 @@ func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> { // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK: %[[TILED_UNPACK:.*]] = linalg.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice 
%[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] // CHECK: } // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -272,8 +272,8 @@ module { } %3 = tensor.empty() : tensor<64x64xi32> %4 = tensor.empty() : tensor<1x1x64x64xi32> - %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -313,14 +313,14 @@ module { // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK: %[[TILED_UNPACK:.*]] = linalg.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] // CHECK: } // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -370,8 +370,8 @@ module { %7 = arith.addi %in, %in_1 : i32 linalg.yield %7 : i32 } -> tensor<1x1x8x16x4x8xi32> - %unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> 
tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -407,14 +407,14 @@ func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> { // CHECK: %[[MATMUL:.*]] = linalg.generic // CHECK: scf.yield %[[MATMUL]] // CHECK: } -// CHECK: %[[FUSED_UNPACK:.*]] = tensor.unpack %[[FOR]] +// CHECK: %[[FUSED_UNPACK:.*]] = linalg.unpack %[[FOR]] // CHECK-SAME: tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32> // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[FOR]] // CHECK: tensor.parallel_insert_slice %[[FUSED_UNPACK]] // CHECK: } // CHECK: } -// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FORALL]]#1 +// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[FORALL]]#1 // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[UNPACK]] // CHECK: } @@ -447,8 +447,8 @@ module { } %2 = tensor.empty() : tensor<4x4x32x32xi32> %3 = tensor.empty() : tensor<128x128xi32> - %unpack = tensor.unpack %1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %3 : tensor<4x4x32x32xi32> -> tensor<128x128xi32> + %unpack = linalg.unpack %1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %3 : tensor<4x4x32x32xi32> -> tensor<128x128xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[0, 0] [128, 128] [1, 1] : tensor<128x128xi32> into tensor<128x128xi32> } @@ -467,7 +467,7 @@ module { // CHECK-SAME: { // CHECK: %[[MATMUL:.+]] = linalg.generic // CHECK-DAG: %[[EXTRACT_SLICE_1:.+]] = tensor.extract_slice %[[UNPACK_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] -// CHECK-DAG: %[[UNPACK:.+]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]] +// CHECK-DAG: %[[UNPACK:.+]] = linalg.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]] // CHECK: scf.forall.in_parallel { // CHECK-DAG: tensor.parallel_insert_slice %[[MATMUL]] into %[[MATMUL_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: tensor.parallel_insert_slice %[[UNPACK]] into %[[UNPACK_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] @@ -477,7 +477,7 @@ module { // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#0 into %[[MATMUL_OUT]][%[[ARG0]], %[[ARG1]], 0, 0, 0, 0] [4, 4, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#1 into %[[UNPACK_OUT]][%[[ARG0]], %[[ARG1]], 0, 0] [4, 4, 32, 32] [1, 1, 1, 1] // CHECK: } -// CHECK: tensor.unpack %[[FORALL_0]]#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]] +// CHECK: linalg.unpack %[[FORALL_0]]#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]] #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)> #map2 = affine_map<(d0, d1, d2, d3, d4, 
d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> @@ -498,9 +498,9 @@ module { %extracted_slice_4 = tensor.extract_slice %4[0, %arg1] [512, 256] [1, 1] : tensor<512x4096xbf16> to tensor<512x256xbf16> %extracted_slice_5 = tensor.extract_slice %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<512x4096xf32> to tensor<256x256xf32> %7 = bufferization.to_tensor %alloc_3 restrict writable : memref<8x8x32x64xbf16, 1 : i32> to tensor<8x8x32x64xbf16> - %pack = tensor.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16> + %pack = linalg.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16> %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<8x8x64x32xbf16, 1 : i32> to tensor<8x8x64x32xbf16> - %pack_6 = tensor.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16> + %pack_6 = linalg.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16> %9 = bufferization.to_tensor %alloc_1 restrict writable : memref<8x8x32x32xf32, 1 : i32> to tensor<8x8x32x32xf32> %10 = tensor.empty() : tensor<8x8x8x8x4x4xf32> %11 = scf.forall (%arg3, %arg4) = (0, 0) to (8, 8) step (4, 4) shared_outs(%arg5 = %10) -> (tensor<8x8x8x8x4x4xf32>) { @@ -512,10 +512,10 @@ module { %13 = scf.forall (%arg6, %arg7) in (4, 4) shared_outs(%arg8 = %12) -> (tensor<4x4x8x8x4x4xf32>) { %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[%arg6, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : tensor<4x1x32x64xbf16> to tensor<1x1x32x64xbf16> %14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x8x4x8xbf16, 2 : i32> to tensor<1x1x8x8x4x8xbf16> - %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16> + %pack_13 = linalg.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16> %extracted_slice_14 = tensor.extract_slice %extracted_slice_11[%arg7, 0, 0, 0] [1, 1, 64, 32] [1, 1, 1, 1] : tensor<4x1x64x32xbf16> to tensor<1x1x64x32xbf16> %15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x8x8x4xbf16, 2 : i32> to tensor<1x1x8x8x8x4xbf16> - %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16> + %pack_15 = linalg.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16> %extracted_slice_16 = tensor.extract_slice %arg8[%arg6, %arg7, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> %16 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x8x4x8xbf16>, tensor<1x1x8x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) { ^bb0(%in: bf16, %in_17: bf16, %out: f32): @@ -533,8 +533,8 @@ module { tensor.parallel_insert_slice %13 into %arg5[%arg3, %arg4, 0, 0, 0, 
0] [4, 4, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> into tensor<8x8x8x8x4x4xf32>
 }
 } {mapping = [#gpu.block, #gpu.block]}
- %unpack = tensor.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<8x8x8x8x4x4xf32> -> tensor<8x8x32x32xf32>
- %unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xf32> -> tensor<256x256xf32>
+ %unpack = linalg.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<8x8x8x8x4x4xf32> -> tensor<8x8x32x32xf32>
+ %unpack_7 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xf32> -> tensor<256x256xf32>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<256x256xf32> into tensor<512x4096xf32>
 }
@@ -562,7 +562,7 @@ module {
 // CHECK-SAME: ins(%[[MATMUL]] : tensor<1x1x8x8x4x4xf32>)
 // CHECK: arith.truncf
 // CHECK: %[[EXTRACT_SLICE_1:.+]] = tensor.extract_slice %[[UNPACK_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1]
-// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ELEMWISE]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]]
+// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ELEMWISE]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]]
 // CHECK: scf.forall.in_parallel {
 // CHECK-DAG: tensor.parallel_insert_slice %[[MATMUL]] into %[[MATMUL_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: tensor.parallel_insert_slice %[[ELEMWISE]] into %[[ELEMWISE_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
@@ -574,7 +574,7 @@ module {
 // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#1 into %[[ELEMWISE_OUT]][%[[ARG0]], %[[ARG1]], 0, 0, 0, 0] [4, 4, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#2 into %[[UNPACK_OUT]][%[[ARG0]], %[[ARG1]], 0, 0] [4, 4, 32, 32] [1, 1, 1, 1]
 // CHECK: }
-// CHECK: tensor.unpack %[[FORALL_0]]#2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]]
+// CHECK: linalg.unpack %[[FORALL_0]]#2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]]
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
@@ -595,9 +595,9 @@ module {
 %extracted_slice_4 = tensor.extract_slice %1[0, %arg1] [512, 256] [1, 1] : tensor<512x4096xbf16> to tensor<512x256xbf16>
 %extracted_slice_5 = tensor.extract_slice %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<512x4096xbf16> to tensor<256x256xbf16>
 %4 = bufferization.to_tensor %alloc_3 restrict writable : memref<8x8x32x64xbf16, 1 : i32> to tensor<8x8x32x64xbf16>
- %pack = tensor.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %4 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16>
+ %pack = linalg.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %4 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16>
 %5 = bufferization.to_tensor %alloc_2 restrict writable : memref<8x8x64x32xbf16, 1 : i32> to tensor<8x8x64x32xbf16>
- %pack_6 = tensor.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %5 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16>
+ %pack_6 = linalg.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %5 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16>
 %6 = bufferization.to_tensor %alloc_1 restrict writable : memref<8x8x32x32xbf16, 1 : i32> to tensor<8x8x32x32xbf16>
 %7 = tensor.empty() : tensor<8x8x8x8x4x4xbf16>
 %8 = tensor.empty() : tensor<8x8x8x8x4x4xf32>
@@ -610,10 +610,10 @@ module {
 %12 = scf.forall (%arg6, %arg7) in (4, 4) shared_outs(%arg8 = %11) -> (tensor<4x4x8x8x4x4xf32>) {
 %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[%arg6, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : tensor<4x1x32x64xbf16> to tensor<1x1x32x64xbf16>
 %13 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x8x4x8xbf16, 2 : i32> to tensor<1x1x8x8x4x8xbf16>
- %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16>
+ %pack_13 = linalg.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16>
 %extracted_slice_14 = tensor.extract_slice %extracted_slice_11[%arg7, 0, 0, 0] [1, 1, 64, 32] [1, 1, 1, 1] : tensor<4x1x64x32xbf16> to tensor<1x1x64x32xbf16>
 %14 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x8x8x4xbf16, 2 : i32> to tensor<1x1x8x8x8x4xbf16>
- %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %14 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16>
+ %pack_15 = linalg.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %14 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16>
 %extracted_slice_16 = tensor.extract_slice %arg8[%arg6, %arg7, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32>
 %15 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x8x4x8xbf16>, tensor<1x1x8x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) {
 ^bb0(%in: bf16, %in_17: bf16, %out: f32):
@@ -636,8 +636,8 @@ module {
 %11 = arith.truncf %in : f32 to bf16
 linalg.yield %11 : bf16
 } -> tensor<8x8x8x8x4x4xbf16>
- %unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %6 : tensor<8x8x8x8x4x4xbf16> -> tensor<8x8x32x32xbf16>
- %unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xbf16> -> tensor<256x256xbf16>
+ %unpack = linalg.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %6 : tensor<8x8x8x8x4x4xbf16> -> tensor<8x8x32x32xbf16>
+ %unpack_7 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xbf16> -> tensor<256x256xbf16>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<256x256xbf16> into tensor<512x4096xbf16>
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir
index 925fbd2ae..9ed41e3a4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir
@@ -14,9 +14,9 @@ func.func @fuse_pack_into_for(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x5
 %c0_i32 = arith.constant 0 : i32
 %c0 = arith.constant 0 : index
 %15 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
- %pack_8 = tensor.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
+ %pack_8 = linalg.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
 %16 = tensor.empty() : tensor<1x1x4x64x8x8xi32>
- %pack_9 = tensor.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x512x32xi32> -> tensor<1x1x4x64x8x8xi32>
+ %pack_9 = linalg.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x512x32xi32> -> tensor<1x1x4x64x8x8xi32>
 %17 = tensor.empty() : tensor<1x1x4x8x4x8xi32>
 %18 = linalg.fill ins(%c0_i32 : i32) outs(%17 : tensor<1x1x4x8x4x8xi32>) -> tensor<1x1x4x8x4x8xi32>
 %19 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %18) -> (tensor<1x1x4x8x4x8xi32>) {
@@ -38,10 +38,10 @@ func.func @fuse_pack_into_for(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x5
 // DEPTH-1: {
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
-// DEPTH-1: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
+// DEPTH-1: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x4x64x8x8xi32> to tensor<1x1x4x4x8x8xi32>
-// DEPTH-1: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
+// DEPTH-1: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
 // DEPTH-1: linalg.generic
 // DEPTH-1: }
@@ -63,15 +63,15 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3] [2048, 64] [1, 1] : tensor<2048x2048xi32> to tensor<2048x64xi32>
 %extracted_slice_1 = tensor.extract_slice %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
 %2 = tensor.empty() : tensor<1x64x64x32xi32>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x2048xi32> -> tensor<1x64x64x32xi32>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x2048xi32> -> tensor<1x64x64x32xi32>
 %3 = tensor.empty() : tensor<64x1x32x64xi32>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<2048x64xi32> -> tensor<64x1x32x64xi32>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<2048x64xi32> -> tensor<64x1x32x64xi32>
 %alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
 %4 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x64xi32, 1 : i32> to tensor<1x1x64x64xi32>
 %5 = tensor.empty() : tensor<1x64x4x16x4x8xi32>
- %pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x64x64x32xi32> -> tensor<1x64x4x16x4x8xi32>
+ %pack_3 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x64x64x32xi32> -> tensor<1x64x4x16x4x8xi32>
 %6 = tensor.empty() : tensor<64x1x16x4x8x4xi32>
- %pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %6 : tensor<64x1x32x64xi32> -> tensor<64x1x16x4x8x4xi32>
+ %pack_4 = linalg.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %6 : tensor<64x1x32x64xi32> -> tensor<64x1x16x4x8x4xi32>
 %alloc_5 = memref.alloc() : memref<1x1x16x16x4x4xi32, 2 : i32>
 %7 = bufferization.to_tensor %alloc_5 restrict writable : memref<1x1x16x16x4x4xi32, 2 : i32> to tensor<1x1x16x16x4x4xi32>
 %8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<1x1x16x16x4x4xi32>) -> tensor<1x1x16x16x4x4xi32>
@@ -86,8 +86,8 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 } -> tensor<1x1x16x16x4x4xi32>
 scf.yield %10 : tensor<1x1x16x16x4x4xi32>
 }
- %unpack = tensor.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
- %unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
+ %unpack_6 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 memref.dealloc %alloc : memref<1x1x64x64xi32, 1 : i32>
 memref.dealloc %alloc_5 : memref<1x1x16x16x4x4xi32, 2 : i32>
 scf.forall.in_parallel {
@@ -102,10 +102,10 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 // DEPTH-1: {
 // DEPTH-1: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x64x32xi32> to tensor<1x1x64x32xi32>
 // DEPTH-1: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// DEPTH-1: %[[PACK_1:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// DEPTH-1: %[[PACK_1:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // DEPTH-1: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x32x64xi32> to tensor<1x1x32x64xi32>
 // DEPTH-1: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// DEPTH-1: %[[PACK_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// DEPTH-1: %[[PACK_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // DEPTH-1: linalg.generic {{.*}} ins(%[[PACK_1]], %[[PACK_2]] :
 // DEPTH-1: }
@@ -114,14 +114,14 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 // DEPTH-2: {
 // DEPTH-2: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<64x2048xi32> to tensor<64x32xi32>
 // DEPTH-2: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x64x32xi32> to tensor<1x1x64x32xi32>
-// DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
+// DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
 // DEPTH-2: %[[PACK_1_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = tensor.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = linalg.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // DEPTH-2: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<2048x64xi32> to tensor<32x64xi32>
 // DEPTH-2: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x32x64xi32> to tensor<1x1x32x64xi32>
-// DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
+// DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
 // DEPTH-2: %[[PACK_2_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = tensor.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = linalg.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // DEPTH-2: linalg.generic {{.*}} ins(%[[PACK_1_DEPTH_1]], %[[PACK_2_DEPTH_1]] :
 // DEPTH-2: }
@@ -156,14 +156,14 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 %10 = affine.apply #map(%arg5)
 %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %10] [64, 32] [1, 1] : tensor<64x2048xi32> to tensor<64x32xi32>
 %extracted_slice_5 = tensor.extract_slice %2[0, %arg5, 0, 0] [1, 1, 64, 32] [1, 1, 1, 1] : tensor<1x64x64x32xi32> to tensor<1x1x64x32xi32>
- %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %extracted_slice_5 : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
+ %pack = linalg.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %extracted_slice_5 : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
 %extracted_slice_6 = tensor.extract_slice %5[0, %arg5, 0, 0, 0, 0] [1, 1, 4, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x64x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
- %pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+ %pack_7 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 %extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%10, 0] [32, 64] [1, 1] : tensor<2048x64xi32> to tensor<32x64xi32>
 %extracted_slice_9 = tensor.extract_slice %3[%arg5, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : tensor<64x1x32x64xi32> to tensor<1x1x32x64xi32>
- %pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %extracted_slice_9 : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
+ %pack_10 = linalg.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %extracted_slice_9 : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
 %extracted_slice_11 = tensor.extract_slice %6[%arg5, 0, 0, 0, 0, 0] [1, 1, 16, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<64x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
- %pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+ %pack_12 = linalg.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 %11 = scf.forall (%arg7, %arg8) in (1, 1) shared_outs(%arg9 = %arg6) -> (tensor<1x1x16x16x4x4xi32>) {
 %extracted_slice_13 = tensor.extract_slice %pack_7[%arg7, 0, 0, 0, 0, 0] [1, 1, 4, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
 %extracted_slice_14 = tensor.extract_slice %pack_12[0, %arg8, 0, 0, 0, 0] [1, 1, 16, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
@@ -180,8 +180,8 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 } {mapping = [#gpu.block, #gpu.block]}
 scf.yield %11 : tensor<1x1x16x16x4x4xi32>
 }
- %unpack = tensor.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
- %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
+ %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 memref.dealloc %alloc : memref<1x1x64x64xi32, 1 : i32>
 memref.dealloc %alloc_2 : memref<1x1x16x16x4x4xi32, 2 : i32>
 scf.forall.in_parallel {
@@ -198,10 +198,10 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 // FORALL-DEPTH-1: {
 // FORALL-DEPTH-1: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x64x32xi32> to tensor<1x1x64x32xi32>
 // FORALL-DEPTH-1: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// FORALL-DEPTH-1: %[[PACK_1:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// FORALL-DEPTH-1: %[[PACK_1:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // FORALL-DEPTH-1: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x32x64xi32> to tensor<1x1x32x64xi32>
 // FORALL-DEPTH-1: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// FORALL-DEPTH-1: %[[PACK_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// FORALL-DEPTH-1: %[[PACK_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // FORALL-DEPTH-1: linalg.generic {{.*}} ins(%[[PACK_1]], %[[PACK_2]] :
 // FORALL-DEPTH-1: }
 // FORALL-DEPTH-1: }
@@ -213,14 +213,14 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 // FORALL-DEPTH-2: {
 // FORALL-DEPTH-2: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<64x32xi32> to tensor<64x32xi32>
 // FORALL-DEPTH-2: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x64x32xi32> to tensor<1x1x64x32xi32>
-// FORALL-DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
+// FORALL-DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
 // FORALL-DEPTH-2: %[[PACK_1_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// FORALL-DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = tensor.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// FORALL-DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = linalg.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // FORALL-DEPTH-2: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<32x64xi32> to tensor<32x64xi32>
 // FORALL-DEPTH-2: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x32x64xi32> to tensor<1x1x32x64xi32>
-// FORALL-DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
+// FORALL-DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
 // FORALL-DEPTH-2: %[[PACK_2_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// FORALL-DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = tensor.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// FORALL-DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = linalg.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // FORALL-DEPTH-2: linalg.generic {{.*}} ins(%[[PACK_1_DEPTH_1]], %[[PACK_2_DEPTH_1]] :
 // FORALL-DEPTH-2: }
 // FORALL-DEPTH-2: }
@@ -382,9 +382,9 @@ func.func @pack_without_slice(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x3
 %c0_i32 = arith.constant 0 : i32
 %c0 = arith.constant 0 : index
 %15 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
- %pack_8 = tensor.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
+ %pack_8 = linalg.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
 %16 = tensor.empty() : tensor<1x1x4x4x8x8xi32>
- %pack_10 = tensor.pack %arg1 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
+ %pack_10 = linalg.pack %arg1 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
 %17 = tensor.empty() : tensor<1x1x4x8x4x8xi32>
 %18 = linalg.fill ins(%c0_i32 : i32) outs(%17 : tensor<1x1x4x8x4x8xi32>) -> tensor<1x1x4x8x4x8xi32>
@@ -403,7 +403,7 @@ func.func @pack_without_slice(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x3
 // DEPTH-1-LABEL: pack_without_slice
 // DEPTH-1: scf.for
-// DEPTH-1-DAG: %[[PACK_1:.*]] = tensor.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
-// DEPTH-1-DAG: %[[PACK_2:.*]] = tensor.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
+// DEPTH-1-DAG: %[[PACK_1:.*]] = linalg.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
+// DEPTH-1-DAG: %[[PACK_2:.*]] = linalg.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
 // DEPTH-1: linalg.generic
 // DEPTH-1-SAME: ins(%[[PACK_2]], %[[PACK_1]]
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
index 053f9f7ed..5173b4c6d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
@@ -9,9 +9,9 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0 : tensor<
 %c0_i32 = arith.constant 0 : i32
 %0 = tensor.empty() : tensor<16x256xi32>
 %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<16x256xi32>) -> tensor<16x256xi32>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %{{.*}} : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %{{.*}} : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %{{.*}} : tensor<16x256xi32> -> tensor<1x1x16x256xi32>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %{{.*}} : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %{{.*}} : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %{{.*}} : tensor<16x256xi32> -> tensor<1x1x16x256xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %2 = linalg.matmul {lowering_config = #config, packing_config = #packingConfig} ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>) outs(%1 : tensor<16x256xi32>) -> tensor<16x256xi32>
@@ -29,9 +29,9 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
 %c0_i32 = arith.constant 0 : i32
 %0 = tensor.empty() : tensor<256x1024xi32>
 %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<256x512xi32> -> tensor<4x16x64x32xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %{{.*}} : tensor<1024x512xi32> -> tensor<16x16x64x32xi32>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %{{.*}} : tensor<256x1024xi32> -> tensor<4x16x64x64xi32>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<256x512xi32> -> tensor<4x16x64x32xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %{{.*}} : tensor<1024x512xi32> -> tensor<16x16x64x32xi32>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %{{.*}} : tensor<256x1024xi32> -> tensor<4x16x64x64xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %2 = linalg.matmul_transpose_b {lowering_config = #config, packing_config = #packingConfig} ins(%arg0, %arg1 : tensor<256x512xi32>, tensor<1024x512xi32>) outs(%1 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir
index 23f0f9acc..162075365 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir
@@ -15,14 +15,14 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0: tensor<1
 %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3] [256, 256] [1, 1] : tensor<256x256xi8> to tensor<256x256xi8>
 %extracted_slice_1 = tensor.extract_slice %arg4[%arg2, %arg3] [16, 256] [1, 1] : tensor<16x256xi32> to tensor<16x256xi32>
 %2 = tensor.empty() : tensor<1x2x16x128xi8>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %2 : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %2 : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
 %3 = tensor.empty() : tensor<2x1x128x256xi8>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %3 : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %3 : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
 %4 = tensor.empty() : tensor<1x1x16x256xi32>
 %5 = linalg.fill ins(%c0_i32 : i32) outs(%4 : tensor<1x1x16x256xi32>) -> tensor<1x1x16x256xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x2x16x128xi8> -> tensor<1x2x16x4x4x8xi8>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<2x1x128x256xi8> -> tensor<2x1x32x16x8x8xi8>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x16x256xi32> -> tensor<1x1x32x4x4x8xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x2x16x128xi8> -> tensor<1x2x16x4x4x8xi8>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<2x1x128x256xi8> -> tensor<2x1x32x16x8x8xi8>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x16x256xi32> -> tensor<1x1x32x4x4x8xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %6 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x2x16x128xi8>, tensor<2x1x128x256xi8>) outs(%5 : tensor<1x1x16x256xi32>) attrs = {lowering_config = #config, packing_config = #packingConfig} {
@@ -33,7 +33,7 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0: tensor<1
 %10 = arith.addi %out, %9 : i32
 linalg.yield %10 : i32
 } -> tensor<1x1x16x256xi32>
- %unpack = tensor.unpack %6 inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %extracted_slice_1 : tensor<1x1x16x256xi32> -> tensor<16x256xi32>
+ %unpack = linalg.unpack %6 inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %extracted_slice_1 : tensor<1x1x16x256xi32> -> tensor<16x256xi32>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack into %arg4[%arg2, %arg3] [16, 256] [1, 1] : tensor<16x256xi32> into tensor<16x256xi32>
 }
@@ -59,13 +59,13 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
 %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, 0] [64, 512] [1, 1] : tensor<1024x512xi32> to tensor<64x512xi32>
 %extracted_slice_1 = tensor.extract_slice %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<256x1024xi32> to tensor<64x64xi32>
 %2 = tensor.empty() : tensor<1x16x64x32xi32>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
 %3 = tensor.empty() : tensor<1x1x64x64xi32>
 %4 = linalg.fill ins(%c0_i32 : i32) outs(%3 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %{{.*}} : tensor<1x1x64x64xi32> -> tensor<1x1x16x16x4x4xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %{{.*}} : tensor<1x1x64x64xi32> -> tensor<1x1x16x16x4x4xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %5 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x16x64x32xi32>, tensor<1x16x64x32xi32>) outs(%4 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #config, packing_config = #packingConfig} {
@@ -74,7 +74,7 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
 %7 = arith.addi %out, %6 : i32
 linalg.yield %7 : i32
 } -> tensor<1x1x64x64xi32>
- %unpack = tensor.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack into %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<256x1024xi32>
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir
index cbf64e2b4..2c62c8871 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir
@@ -8,9 +8,9 @@ func.func @matmul_static(%arg0: tensor<1x4x16x64xi32>, %arg1: tensor<4x1x64x64xi32>) -> tensor<1x1x16x64xi32> {
 %c0_i32 = arith.constant 0 : i32
 %0 = tensor.empty() : tensor<1x4x8x4x4x8xi32>
- %pack = tensor.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
+ %pack = linalg.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
 %1 = tensor.empty() : tensor<4x1x8x8x8x8xi32>
- %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %1 : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
+ %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %1 : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
 %2 = tensor.empty() : tensor<1x1x8x4x4x8xi32>
 %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1x1x8x4x4x8xi32>) -> tensor<1x1x8x4x4x8xi32>
 %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<1x4x8x4x4x8xi32>, tensor<4x1x8x8x8x8xi32>) outs(%3 : tensor<1x1x8x4x4x8xi32>) {
@@ -20,7 +20,7 @@ func.func @matmul_static(%arg0: tensor<1x4x16x64xi32>, %arg1: tensor<4x1x64x64xi
 linalg.yield %7 : i32
 } -> tensor<1x1x8x4x4x8xi32>
 %empty = tensor.empty() : tensor<1x1x16x64xi32>
- %unpack = tensor.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %empty : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
+ %unpack = linalg.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %empty : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
 %empty2 = tensor.empty() : tensor<1x1x16x64xi32>
 %fill = linalg.fill ins(%c0_i32 : i32) outs(%empty2 : tensor<1x1x16x64xi32>) -> tensor<1x1x16x64xi32>
 %5 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%unpack: tensor<1x1x16x64xi32>) outs(%fill : tensor<1x1x16x64xi32>) {
@@ -33,14 +33,14 @@ func.func @matmul_static(%arg0: tensor<1x4x16x64xi32>, %arg1: tensor<4x1x64x64xi
 }
 // CHECK-LABEL: matmul_static
-// CHECK: %[[PACK_0:.*]] = tensor.pack {{.*}} : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
-// CHECK: %[[PACK_1:.*]] = tensor.pack {{.*}} : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
+// CHECK: %[[PACK_0:.*]] = linalg.pack {{.*}} : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
+// CHECK: %[[PACK_1:.*]] = linalg.pack {{.*}} : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
 // CHECK: %[[FILL:.*]] = linalg.fill {{.*}} -> tensor<1x1x8x4x4x8xi32>
 // CHECK: %[[MATMUL_0:.*]] = linalg.generic {{.*}} ins(%[[PACK_0]], %[[PACK_1]] : tensor<1x4x8x4x4x8xi32>, tensor<4x1x8x8x8x8xi32>) outs(%[[FILL]] : tensor<1x1x8x4x4x8xi32>)
-// CHECK-NOT: tensor.unpack
-// CHECK-NOT: tensor.pack
+// CHECK-NOT: linalg.unpack
+// CHECK-NOT: linalg.pack
 // CHECK: %[[MATMUL_1:.*]] = linalg.generic {{.*}} ins(%[[MATMUL_0]] : tensor<1x1x8x4x4x8xi32>) outs(%[[FILL]] : tensor<1x1x8x4x4x8xi32>)
-// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[MATMUL_1:.*]] {{.*}} : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
+// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[MATMUL_1:.*]] {{.*}} : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
 // -----
@@ -54,12 +54,12 @@ func.func @matmul_elementwise_1024x1024x512_i8xi8xi32(%arg0: tensor<1024x512xi8>
 %extracted_slice_1 = tensor.extract_slice %0[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
 %2 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
 %3 = tensor.empty() : tensor<1x16x64x32xi8>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %3 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %3 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
 %4 = tensor.empty() : tensor<16x1x64x32xi8>
 %5 = tensor.empty() : tensor<16x1x32x64xi8>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %5 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %5 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
 %6 = tensor.empty() : tensor<1x1x64x64xi32>
- %pack_3 = tensor.pack %2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+ %pack_3 = linalg.pack %2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
 %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x16x64x32xi8>, tensor<16x1x32x64xi8>) outs(%pack_3 : tensor<1x1x64x64xi32>) {
 ^bb0(%in: i8, %in_6: i8, %out: i32):
 %9 = arith.extsi %in : i8 to i32
@@ -68,7 +68,7 @@ func.func @matmul_elementwise_1024x1024x512_i8xi8xi32(%arg0: tensor<1024x512xi8>
 %12 = arith.addi %out, %11 : i32
 linalg.yield %12 : i32
 } -> tensor<1x1x64x64xi32>
- %unpack = tensor.unpack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %2 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %2 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 %extracted_slice_4 = tensor.extract_slice %arg2[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
 %extracted_slice_5 = tensor.extract_slice %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
 %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack, %extracted_slice_4 : tensor<64x64xi32>, tensor<64x64xi32>) outs(%extracted_slice_5 : tensor<64x64xi32>) {
@@ -84,12 +84,12 @@ func.func @matmul_elementwise_1024x1024x512_i8xi8xi32(%arg0: tensor<1024x512xi8>
 }
 // CHECK-LABEL: matmul_elementwise_1024x1024x512_i8xi8xi32
-// CHECK: %[[PACK_0:.*]] = tensor.pack {{.*}} : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
-// CHECK: %[[PACK_1:.*]] = tensor.pack {{.*}} : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
+// CHECK: %[[PACK_0:.*]] = linalg.pack {{.*}} : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
+// CHECK: %[[PACK_1:.*]] = linalg.pack {{.*}} : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
 // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<1x1x64x64xi32>
 // CHECK: %[[FILL:.*]] = linalg.fill {{.*}} -> tensor<1x1x64x64xi32>
 // CHECK: %[[MATMUL:.*]] = linalg.generic {{.*}} ins(%[[PACK_0]], %[[PACK_1]] : tensor<1x16x64x32xi8>, tensor<16x1x32x64xi8>) outs(%[[FILL]] : tensor<1x1x64x64xi32>)
-// CHECK-NOT: tensor.unpack
-// CHECK: %[[PACK_2:.*]] = tensor.pack {{.*}} : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+// CHECK-NOT: linalg.unpack
+// CHECK: %[[PACK_2:.*]] = linalg.pack {{.*}} : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
 // CHECK: %[[ELEMENT:.*]] = linalg.generic {{.*}} ins(%[[MATMUL]], %[[PACK_2]] : tensor<1x1x64x64xi32>, tensor<1x1x64x64xi32>) outs(%[[EMPTY]] : tensor<1x1x64x64xi32>)
-// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[ELEMENT:.*]] {{.*}} : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[ELEMENT:.*]] {{.*}} : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
diff --git a/third_party/iree b/third_party/iree
index 756e9e661..055ce1f80 160000
--- a/third_party/iree
+++ b/third_party/iree
@@ -1 +1 @@
-Subproject commit 756e9e66138129df1bb28a2f2fac06058f976bcf
+Subproject commit 055ce1f80c87f9087035db5e300668553d2871e2