From b005ec275247f68c2f8c1a94414928055b2f0d53 Mon Sep 17 00:00:00 2001 From: Vivian Zhang Date: Fri, 21 Feb 2025 12:37:07 -0800 Subject: [PATCH] Bump IREE to 055ce1f (#1124) The main change is to update `tensor.pack/unpack` to `linalg.pack/unpack`, following the upstream change https://github.com/llvm/llvm-project/pull/123902. --- .../AMDAIEBufferizeToAllocation.cpp | 2 +- .../Transforms/AMDAIEFuseProducerIntoLoop.cpp | 12 +-- .../Transforms/AMDAIEPackAndTranspose.cpp | 6 +- .../Transforms/AMDAIETileAndFuse.cpp | 8 +- .../iree-amd-aie/Transforms/Passes.cpp | 2 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 2 +- .../test/bufferize_to_allocation.mlir | 84 +++++++++---------- .../bufferize_to_allocation_pack_or_copy.mlir | 36 ++++---- .../test/create_reference_to_allocation.mlir | 12 +-- .../test/fuse_consumer_into_loop.mlir | 72 ++++++++-------- .../test/fuse_producer_into_loop.mlir | 64 +++++++------- .../test/pack_and_transpose_level1.mlir | 12 +-- .../test/pack_and_transpose_level2.mlir | 24 +++--- .../test/propagate_data_layout.mlir | 34 ++++---- third_party/iree | 2 +- 15 files changed, 186 insertions(+), 186 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp index 969fa9f34..afaa8a569 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp @@ -76,7 +76,7 @@ static FailureOr<SmallVector<Value>> getPackOrCopyOperands( uint32_t currentLevel{0}; Operation *currentOp = input.value().getDefiningOp(); while (currentLevel < depthLevel && currentOp != nullptr) { - if (dyn_cast<tensor::PackOp>(currentOp)) { + if (dyn_cast<linalg::PackOp>(currentOp)) { currentLevel++; if (currentLevel == depthLevel) break; } else if (dyn_cast<linalg::CopyOp>(currentOp)) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp index 2c6366718..a53ced50b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseProducerIntoLoop.cpp @@ -21,7 +21,7 @@ namespace { /// A utility function specific to this pass which, given a value `operand`, /// traverses the def-chain till it finds a tensor.extract_slice. Currently, -/// the two producer ops that are allowed in the def-chain are tensor.pack and +/// the two producer ops that are allowed in the def-chain are linalg.pack and /// linalg.copy ops. The 2 cases where it successfully finds and returns an /// extract_slice (SLICE) are: /// @@ -39,7 +39,7 @@ namespace { static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp( Value operand) { // Roll back through all the pack or copy ops immediately preceding `operand`. - while (isa_and_present<tensor::PackOp, linalg::CopyOp>( + while (isa_and_present<linalg::PackOp, linalg::CopyOp>( operand.getDefiningOp())) { operand = operand.getDefiningOp()->getOperand(0); } @@ -49,7 +49,7 @@ static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp( if (!sliceOp) return failure(); // Case 1 outlined above.
- if (isa_and_present<tensor::PackOp, linalg::CopyOp>( + if (isa_and_present<linalg::PackOp, linalg::CopyOp>( sliceOp.getSource().getDefiningOp())) { return sliceOp; } @@ -60,7 +60,7 @@ static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp( LoopLikeOpInterface loop = dyn_cast<LoopLikeOpInterface>(parent); if (!loop) return failure(); Operation *operandParent = loop.getTiedLoopInit(blkArg)->getOwner(); - if (isa_and_present<tensor::PackOp, linalg::CopyOp>(operandParent)) + if (isa_and_present<linalg::PackOp, linalg::CopyOp>(operandParent)) return sliceOp; } @@ -110,7 +110,7 @@ void AMDAIEFuseProducerIntoLoopPass::runOnOperation() { LoopLikeOpInterface loops = cast<LoopLikeOpInterface>(scfLoopOp); // Based on the `fuseDepth`, we would greedily fuse the producers of a linalg - // computation op. Currently, we are limiting the producers to tensor.pack or + // computation op. Currently, we are limiting the producers to linalg.pack or // linalg.copy ops. for (unsigned depth = 1; depth <= fuseDepth; depth++) { // Search the last compute op in the loop and its producer slices. @@ -153,7 +153,7 @@ void AMDAIEFuseProducerIntoLoopPass::runOnOperation() { // Case where operand of a generic op is a pack/copy op which is in a // different block than the generic's block. - else if (isa_and_present<tensor::PackOp, linalg::CopyOp>( + else if (isa_and_present<linalg::PackOp, linalg::CopyOp>( operand.getDefiningOp())) { Operation *parent = operand.getDefiningOp(); Block *genericBlock = genericOp->getBlock(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp index 62544391e..2ee70ee9e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackAndTranspose.cpp @@ -106,9 +106,9 @@ void AMDAIEPackAndTransposePass::runOnOperation() { } // Step 3. Pack Transpose - SmallVector<tensor::PackOp> packOps = packResult->packOps; + SmallVector<linalg::PackOp> packOps = packResult->packOps; linalg::LinalgOp packedOp = packResult->packedLinalgOp; - SmallVector<tensor::UnPackOp> unpackOps = packResult->unPackOps; + SmallVector<linalg::UnPackOp> unpackOps = packResult->unPackOps; if (packOps.size() != 3 || !packedOp || unpackOps.empty()) { funcOp->emitOpError("failed to get correct pack and unpack ops"); @@ -122,7 +122,7 @@ for (auto [index, unpackEmpty, innerPerm, outerPerm] : llvm::zip(packIndices, unpackArr, innerPermArr, outerPermArr)) { - tensor::UnPackOp unpackOp; + linalg::UnPackOp unpackOp; if (unpackEmpty) { unpackOp = unpackOps.back(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index bb05de0f1..c267a239f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -178,8 +178,8 @@ static bool isTilingReductionDimension(TilingInterface consumerOp, } static bool consumerToSkip(TilingInterface op) { - if (isa<linalg::CopyOp>(op) || isa<tensor::PackOp>(op) || - isa<tensor::UnPackOp>(op)) + if (isa<linalg::CopyOp>(op) || isa<linalg::PackOp>(op) || + isa<linalg::UnPackOp>(op)) return true; return false; } @@ -279,7 +279,7 @@ void AMDAIETileAndFusePass::runOnOperation() { TilingInterface consumerOp; funcOp->walk([&](TilingInterface op) { // Find the next consumer op if it does not have loops OR it is from - // the skip ops list which currently contains linalg.copy and tensor.unpack. + // the skip ops list which currently contains linalg.copy and linalg.unpack.
if (op.getLoopIteratorTypes().empty() || consumerToSkip(op)) return WalkResult::advance(); @@ -356,7 +356,7 @@ void AMDAIETileAndFusePass::runOnOperation() { bool fusableOp = TypeSwitch(originalProducer.getOwner()) // List ops that shouldnt be fused. - .Case([](Operation *) { return false; }) // Fuse all Linalg ops (can be generalized later) .Default([&](Operation *op) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index de7f4d103..caa707a6f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -794,7 +794,7 @@ void addMLIRAIELoweringPasses(OpPassManager &pm) { pm.addPass(createCanonicalizerPass()); pm.addPass(createConvertLinalgToLoopsPass()); pm.addPass(createLowerAffinePass()); - pm.addPass(createConvertSCFToCFPass()); + pm.addPass(createSCFToControlFlowPass()); { OpPassManager &devicePM = pm.nest(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 550920c0b..21c2624de 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -354,7 +354,7 @@ def AMDAIEFuseProducerIntoLoop : let description = [{ Greedily fuse the producers of a linalg computation op based on the `fuseDepth`. Currently, the two producer ops that are allowed in the defining op chain are - tensor.pack and linalg.copy ops. + linalg.pack and linalg.copy ops. }]; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFuseProducerIntoLoopPass()"; let options = [ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir index a4d359f29..c3e3d8b7e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation.mlir @@ -14,14 +14,14 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x %c0 = arith.constant 0 : index %5 = tensor.empty() : tensor<1024x512xi32> %6 = tensor.empty() : tensor<16x32x64x64xi32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> %7 = tensor.empty() : tensor<32x8x64x64xi32> - %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> + %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> %8 = tensor.empty() : tensor<16x8x64x64xi32> %9 = tensor.empty() : tensor<16x32x16x8x4x8xi32> - %pack_1 = tensor.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32> + %pack_1 = linalg.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32> %10 = tensor.empty() : tensor<32x8x8x8x8x8xi32> - %pack_2 = tensor.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : 
tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32> + %pack_2 = linalg.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32> %11 = tensor.empty() : tensor<16x8x16x8x4x8xi32> %12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<16x8x16x8x4x8xi32>) -> tensor<16x8x16x8x4x8xi32> %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_1, %pack_2 : tensor<16x32x16x8x4x8xi32>, tensor<32x8x8x8x8x8xi32>) outs(%12 : tensor<16x8x16x8x4x8xi32>) { @@ -30,48 +30,48 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x %15 = arith.addi %out, %14 : i32 linalg.yield %15 : i32 } -> tensor<16x8x16x8x4x8xi32> - %unpack = tensor.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32> - %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> + %unpack = linalg.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32> + %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> return %unpack_3 : tensor<1024x512xi32> } // LINALG-INPUT-OUTPUT-NOT: memref.alloc -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT-NOT: memref.alloc -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32> // LINALG-INPUT-OUTPUT: bufferization.to_tensor -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32> // LINALG-INPUT-OUTPUT: bufferization.to_tensor -// LINALG-INPUT-OUTPUT: tensor.pack +// LINALG-INPUT-OUTPUT: linalg.pack // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32> // LINALG-INPUT-OUTPUT: bufferization.to_tensor // LINALG-INPUT-OUTPUT: linalg.fill // LINALG-INPUT-OUTPUT: linalg.generic // LINALG-INPUT-NOT: memref.alloc -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT-NOT: memref.alloc -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32> // LINALG-INPUT: bufferization.to_tensor -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32> // LINALG-INPUT: bufferization.to_tensor -// LINALG-INPUT: tensor.pack +// LINALG-INPUT: linalg.pack // LINALG-INPUT-NOT: memref.alloc // LINALG-INPUT: linalg.fill // LINALG-INPUT: linalg.generic // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT-NOT: memref.alloc -// LINALG-OUTPUT: tensor.pack +// LINALG-OUTPUT: linalg.pack // LINALG-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32> // LINALG-OUTPUT: bufferization.to_tensor // LINALG-OUTPUT: linalg.fill @@ -79,14 +79,14 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x // PACK-INPUT: memref.alloc() : 
memref<16x32x64x64xi32, 1 : i32> // PACK-INPUT: bufferization.to_tensor -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT: memref.alloc() : memref<32x8x64x64xi32, 1 : i32> // PACK-INPUT: bufferization.to_tensor -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT-NOT: memref.alloc -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT-NOT: memref.alloc -// PACK-INPUT: tensor.pack +// PACK-INPUT: linalg.pack // PACK-INPUT-NOT: memref.alloc // PACK-INPUT: linalg.fill // PACK-INPUT: linalg.generic @@ -105,14 +105,14 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024 %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg4] [512, 64] [1, 1] : tensor<512x1024xi8> to tensor<512x64xi8> %extracted_slice_1 = tensor.extract_slice %0[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32> %2 = tensor.empty() : tensor<1x16x64x32xi8> - %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8> + %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8> %3 = tensor.empty() : tensor<16x1x32x64xi8> - %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8> + %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8> %4 = tensor.empty() : tensor<1x1x64x64xi32> %5 = tensor.empty() : tensor<1x16x4x16x4x8xi8> - %pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8> + %pack_3 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8> %6 = tensor.empty() : tensor<16x1x8x4x8x8xi8> - %pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8> + %pack_4 = linalg.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8> %7 = tensor.empty() : tensor<1x1x8x16x4x8xi32> %8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<1x1x8x16x4x8xi32>) -> tensor<1x1x8x16x4x8xi32> %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<1x16x4x16x4x8xi8>, tensor<16x1x8x4x8x8xi8>) outs(%8 : tensor<1x1x8x16x4x8xi32>) { @@ -125,17 +125,17 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024 } -> tensor<1x1x8x16x4x8xi32> %extracted_slice_5 = tensor.extract_slice %arg2[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32> %extracted_slice_6 = tensor.extract_slice %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32> - %pack_7 = tensor.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> - %pack_8 = tensor.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> - %pack_9 = tensor.pack %pack_7 
outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> - %pack_10 = tensor.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> + %pack_7 = linalg.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> + %pack_8 = linalg.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32> + %pack_9 = linalg.pack %pack_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> + %pack_10 = linalg.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32> %10 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%9, %pack_10 : tensor<1x1x8x16x4x8xi32>, tensor<1x1x8x16x4x8xi32>) outs(%pack_9 : tensor<1x1x8x16x4x8xi32>) { ^bb0(%in: i32, %in_12: i32, %out: i32): %11 = arith.addi %in, %in_12 : i32 linalg.yield %11 : i32 } -> tensor<1x1x8x16x4x8xi32> - %unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_11 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_11 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_11 into %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -143,31 +143,31 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024 return %1 : tensor<1024x1024xi32> } -// ELEMENTWISE-INPUT-COUNT-4: tensor.pack +// ELEMENTWISE-INPUT-COUNT-4: linalg.pack // ELEMENTWISE-INPUT: linalg.fill // ELEMENTWISE-INPUT: linalg.generic // ELEMENTWISE-INPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32> // ELEMENTWISE-INPUT: bufferization.to_tensor -// ELEMENTWISE-INPUT: tensor.pack +// ELEMENTWISE-INPUT: linalg.pack // ELEMENTWISE-INPUT: linalg.generic -// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: linalg.fill // ELEMENTWISE-INPUT-OUTPUT: linalg.generic // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc -// ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32> // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor -// 
ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32> // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor -// ELEMENTWISE-INPUT-OUTPUT: tensor.pack +// ELEMENTWISE-INPUT-OUTPUT: linalg.pack // ELEMENTWISE-INPUT-OUTPUT: linalg.generic diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir index a8d2b59a9..7d08dbe6f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/bufferize_to_allocation_pack_or_copy.mlir @@ -5,11 +5,11 @@ // CHECK: scf.forall // CHECK: %[[ALLOC_0:.+]] = memref.alloc() : memref<4x8x32x64xbf16, 1 : i32> // CHECK: %[[TO_TENSOR_0:.+]] = bufferization.to_tensor %[[ALLOC_0]] -// CHECK: %[[PACK_0:.+]] = tensor.pack +// CHECK: %[[PACK_0:.+]] = linalg.pack // CHECK-SAME: into %[[TO_TENSOR_0]] // CHECK: %[[ALLOC_1:.+]] = memref.alloc() : memref<4x8x64x32xbf16, 1 : i32> // CHECK: %[[TO_TENSOR_1:.+]] = bufferization.to_tensor %[[ALLOC_1]] -// CHECK: %[[PACK_1:.+]] = tensor.pack +// CHECK: %[[PACK_1:.+]] = linalg.pack // CHECK-SAME: into %[[TO_TENSOR_1]] // CHECK: scf.forall // CHECK: %[[SLICE_0:.+]] = tensor.extract_slice %[[PACK_0]] @@ -17,9 +17,9 @@ // CHECK: linalg.fill // CHECK: scf.for // CHECK: %[[SLICE_2:.+]] = tensor.extract_slice %[[SLICE_0]] -// CHECK: %[[PACK_2:.+]] = tensor.pack %[[SLICE_2]] +// CHECK: %[[PACK_2:.+]] = linalg.pack %[[SLICE_2]] // CHECK: %[[SLICE_3:.+]] = tensor.extract_slice %[[SLICE_1]] -// CHECK: %[[PACK_3:.+]] = tensor.pack %[[SLICE_3]] +// CHECK: %[[PACK_3:.+]] = linalg.pack %[[SLICE_3]] // CHECK: linalg.generic func.func @matmul_tensor_extract_slice() { %c1 = arith.constant 1 : index @@ -37,9 +37,9 @@ func.func @matmul_tensor_extract_slice() { %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 128] [1, 1] : tensor<512x4096xbf16> to tensor<512x128xbf16> %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<512x4096xf32> to tensor<128x128xf32> %7 = tensor.empty() : tensor<4x8x32x64xbf16> - %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<128x512xbf16> -> tensor<4x8x32x64xbf16> + %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<128x512xbf16> -> tensor<4x8x32x64xbf16> %8 = tensor.empty() : tensor<4x8x64x32xbf16> - %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x128xbf16> -> tensor<4x8x64x32xbf16> + %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x128xbf16> -> tensor<4x8x64x32xbf16> %alloc = memref.alloc() : memref<4x4x32x32xf32, 1 : i32> %9 = bufferization.to_tensor %alloc restrict writable : memref<4x4x32x32xf32, 1 : i32> to tensor<4x4x32x32xf32> %10 = tensor.empty() : tensor<4x4x8x8x4x4xf32> @@ -53,10 +53,10 @@ func.func @matmul_tensor_extract_slice() { %15 = scf.for %arg6 = %c0 to %c8 step %c1 iter_args(%arg7 = %14) -> (tensor<2x2x8x8x4x4xf32>) { %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg6, 0, 0] [2, 1, 32, 64] [1, 1, 1, 1] : tensor<2x8x32x64xbf16> to tensor<2x1x32x64xbf16> %extracted_slice_8 = tensor.extract_slice 
%12[0, %arg6, 0, 0, 0, 0] [2, 1, 8, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x8x8x4x8xbf16> to tensor<2x1x8x8x4x8xbf16> - %pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_8 : tensor<2x1x32x64xbf16> -> tensor<2x1x8x8x4x8xbf16> + %pack_9 = linalg.pack %extracted_slice_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_8 : tensor<2x1x32x64xbf16> -> tensor<2x1x8x8x4x8xbf16> %extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, %arg6, 0, 0] [2, 1, 64, 32] [1, 1, 1, 1] : tensor<2x8x64x32xbf16> to tensor<2x1x64x32xbf16> %extracted_slice_11 = tensor.extract_slice %13[0, %arg6, 0, 0, 0, 0] [2, 1, 8, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<2x8x8x8x8x4xbf16> to tensor<2x1x8x8x8x4xbf16> - %pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<2x1x64x32xbf16> -> tensor<2x1x8x8x8x4xbf16> + %pack_12 = linalg.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<2x1x64x32xbf16> -> tensor<2x1x8x8x8x4xbf16> %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_9, %pack_12 : tensor<2x1x8x8x4x8xbf16>, tensor<2x1x8x8x8x4xbf16>) outs(%arg7 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config, packing_config = #amdaie.packing_config} { ^bb0(%in: bf16, %in_13: bf16, %out: f32): %17 = arith.extf %in : bf16 to f32 @@ -71,8 +71,8 @@ func.func @matmul_tensor_extract_slice() { tensor.parallel_insert_slice %15 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [2, 2, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> into tensor<4x4x8x8x4x4xf32> } } {mapping = [#gpu.block, #gpu.block]} - %unpack = tensor.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<4x4x8x8x4x4xf32> -> tensor<4x4x32x32xf32> - %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<4x4x32x32xf32> -> tensor<128x128xf32> + %unpack = linalg.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<4x4x8x8x4x4xf32> -> tensor<4x4x32x32xf32> + %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<4x4x32x32xf32> -> tensor<128x128xf32> memref.dealloc %alloc : memref<4x4x32x32xf32, 1 : i32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xf32> into tensor<512x4096xf32> @@ -89,12 +89,12 @@ func.func @matmul_tensor_extract_slice() { // CHECK: bufferization.to_tensor // CHECK: linalg.copy // CHECK-NOT: memref.alloc -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: memref.alloc() : memref<4x1x32x32xi32, 1 : i32> // CHECK: bufferization.to_tensor // CHECK: linalg.copy // CHECK-NOT: memref.alloc -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: linalg.generic #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map1 = affine_map<(d0, d1, 
d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d5, d4, d7, d8)> @@ -109,16 +109,16 @@ func.func @copy_pack_matmul(%arg0: tensor<4x1x32x32xi32>, %arg1: tensor<4x1x32x3 %4 = tensor.empty() : tensor<4x1x4x8x4x8xi32> %5 = tensor.empty() : tensor<4x4x8x8x4x4xi32> %6 = linalg.copy ins(%arg0 : tensor<4x1x32x32xi32>) outs(%0 : tensor<4x1x32x32xi32>) -> tensor<4x1x32x32xi32> - %pack = tensor.pack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %3 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> + %pack = linalg.pack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %3 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> %7 = linalg.copy ins(%arg1 : tensor<4x1x32x32xi32>) outs(%1 : tensor<4x1x32x32xi32>) -> tensor<4x1x32x32xi32> - %pack_0 = tensor.pack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> + %pack_0 = linalg.pack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<4x1x32x32xi32> -> tensor<4x1x4x8x4x8xi32> %8 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<4x1x4x8x4x8xi32>, tensor<4x1x4x8x4x8xi32>) outs(%5 : tensor<4x4x8x8x4x4xi32>) { ^bb0(%in: i32, %in_1: i32, %out: i32): %9 = arith.muli %in, %in_1 : i32 %10 = arith.addi %out, %9 : i32 linalg.yield %10 : i32 } -> tensor<4x4x8x8x4x4xi32> - %unpack = tensor.unpack %8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> + %unpack = linalg.unpack %8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> return %unpack : tensor<4x4x32x32xi32> } @@ -133,9 +133,9 @@ func.func @pack_error(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512xi32 %c0 = arith.constant 0 : index %5 = tensor.empty() : tensor<1024x512xi32> %6 = tensor.empty() : tensor<16x32x64x64xi32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32> %7 = tensor.empty() : tensor<32x8x64x64xi32> - %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> + %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32> %8 = tensor.empty() : tensor<16x8x64x64xi32> %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<16x8x64x64xi32>) -> tensor<16x8x64x64xi32> // expected-error @+2 {{could not fetch operands to bufferize}} @@ -146,6 +146,6 @@ func.func @pack_error(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512xi32 %15 = arith.addi %out, %14 : i32 linalg.yield %15 : i32 } -> tensor<16x8x64x64xi32> - %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> + %unpack = linalg.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32> return %unpack : tensor<1024x512xi32> } diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir index 6d9307c83..a70432956 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_reference_to_allocation.mlir @@ -129,22 +129,22 @@ func.func @matmul_example(%arg0: tensor<128x256xi32>, %arg1: tensor<256x128xi32> %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %8] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32> %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %9 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32> to tensor<2x1x32x32xi32> - %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32> + %pack = linalg.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32> %extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%8, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32> %alloc_7 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> %10 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xi32, 1 : i32> to tensor<1x2x32x32xi32> - %pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32> + %pack_8 = linalg.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32> %11 = scf.forall (%arg7, %arg8) in (2, 2) shared_outs(%arg9 = %arg6) -> (tensor<2x2x8x8x4x4xi32>) { %extracted_slice_9 = tensor.extract_slice %pack[%arg7, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32> %extracted_slice_10 = tensor.extract_slice %5[%arg7, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32> %alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %12 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32> to tensor<1x1x4x8x4x8xi32> - %pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32> + %pack_12 = linalg.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32> %extracted_slice_13 = tensor.extract_slice %pack_8[0, %arg8, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32> %extracted_slice_14 = tensor.extract_slice %6[0, %arg8, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32> %alloc_15 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> %13 = bufferization.to_tensor %alloc_15 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32> to tensor<1x1x8x4x8x4xi32> - %pack_16 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32> + %pack_16 = linalg.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32> 
%extracted_slice_17 = tensor.extract_slice %arg9[%arg7, %arg8, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32> %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_16 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xi32>) { ^bb0(%in: i32, %in_18: i32, %out: i32): @@ -162,8 +162,8 @@ func.func @matmul_example(%arg0: tensor<128x256xi32>, %arg1: tensor<256x128xi32> memref.dealloc %alloc_7 : memref<1x2x32x32xi32, 1 : i32> scf.yield %11 : tensor<2x2x8x8x4x4xi32> } - %unpack = tensor.unpack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32> - %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32> + %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32> memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32> memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32> scf.forall.in_parallel { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir index b8d91fc27..837c2eaae 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop.mlir @@ -22,11 +22,11 @@ // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV0]]) // CHECK-DAG: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK-DAG: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK-DAG: %[[TILED_UNPACK:.*]] = linalg.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_UNPACK]] // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -72,8 +72,8 @@ module { } %3 = tensor.empty() : tensor<64x64xi32> %4 = tensor.empty() : tensor<1x1x64x64xi32> - %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> 
tensor<64x64xi32> + %unpack = linalg.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -115,11 +115,11 @@ module { // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV0]]) // CHECK-DAG: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK-DAG: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK-DAG: %[[TILED_UNPACK:.*]] = linalg.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK-DAG: %[[YIELD_UNPACK:.*]] = tensor.insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]] // CHECK: scf.yield %[[YIELD_MATMUL]], %[[YIELD_ELEM]], %[[YIELD_UNPACK]] // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -171,8 +171,8 @@ module { %7 = arith.addi %in, %in_1 : i32 linalg.yield %7 : i32 } -> tensor<1x1x8x16x4x8xi32> - %unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -222,13 +222,13 @@ func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> { // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK: %[[TILED_UNPACK:.*]] = linalg.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice 
%[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] // CHECK: } // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -272,8 +272,8 @@ module { } %3 = tensor.empty() : tensor<64x64xi32> %4 = tensor.empty() : tensor<1x1x64x64xi32> - %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %3 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -313,14 +313,14 @@ module { // CHECK-DAG: %[[iv2:.*]] = affine.min #[[EXTRACT_SLICE_MAP0]](%[[IV0]]) // CHECK-DAG: %[[iv3:.*]] = affine.min #[[EXTRACT_SLICE_MAP1]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] -// CHECK: %[[TILED_UNPACK:.*]] = tensor.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] +// CHECK: %[[TILED_UNPACK:.*]] = linalg.unpack %[[FUSED_CONSUMER]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[MATMUL]] into %[[ITER_ARG_1]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[FUSED_CONSUMER]] into %[[ITER_ARG_2]][0, 0, %[[IV1]], %[[IV0]], 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK]] into %[[ITER_ARG_3]][0, 0, %[[iv0]], %[[iv1]]] [1, 1, %[[iv2]], %[[iv3]]] [1, 1, 1, 1] // CHECK: } // CHECK: } -// CHECK: %[[SECOND_UNPACK:.*]] = tensor.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : +// CHECK: %[[SECOND_UNPACK:.*]] = linalg.unpack %[[SECOND_LOOP]]#2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %[[SECOND_UNPACK_OUT]] : // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %[[SECOND_UNPACK]] into %[[ITER_ARG_FINAL]] // CHECK: } @@ -370,8 +370,8 @@ module { %7 = arith.addi %in, %in_1 : i32 linalg.yield %7 : i32 } -> tensor<1x1x8x16x4x8xi32> - %unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> + %unpack = linalg.unpack %6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x1x8x16x4x8xi32> -> 
tensor<1x1x64x64xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<1x1x64x64xi32> -> tensor<64x64xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[%arg4, %arg5] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32> } @@ -407,14 +407,14 @@ func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> { // CHECK: %[[MATMUL:.*]] = linalg.generic // CHECK: scf.yield %[[MATMUL]] // CHECK: } -// CHECK: %[[FUSED_UNPACK:.*]] = tensor.unpack %[[FOR]] +// CHECK: %[[FUSED_UNPACK:.*]] = linalg.unpack %[[FOR]] // CHECK-SAME: tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32> // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[FOR]] // CHECK: tensor.parallel_insert_slice %[[FUSED_UNPACK]] // CHECK: } // CHECK: } -// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FORALL]]#1 +// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[FORALL]]#1 // CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[UNPACK]] // CHECK: } @@ -447,8 +447,8 @@ module { } %2 = tensor.empty() : tensor<4x4x32x32xi32> %3 = tensor.empty() : tensor<128x128xi32> - %unpack = tensor.unpack %1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> - %unpack_0 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %3 : tensor<4x4x32x32xi32> -> tensor<128x128xi32> + %unpack = linalg.unpack %1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %2 : tensor<4x4x8x8x4x4xi32> -> tensor<4x4x32x32xi32> + %unpack_0 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %3 : tensor<4x4x32x32xi32> -> tensor<128x128xi32> scf.forall.in_parallel { tensor.parallel_insert_slice %unpack_0 into %arg6[0, 0] [128, 128] [1, 1] : tensor<128x128xi32> into tensor<128x128xi32> } @@ -467,7 +467,7 @@ module { // CHECK-SAME: { // CHECK: %[[MATMUL:.+]] = linalg.generic // CHECK-DAG: %[[EXTRACT_SLICE_1:.+]] = tensor.extract_slice %[[UNPACK_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] -// CHECK-DAG: %[[UNPACK:.+]] = tensor.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]] +// CHECK-DAG: %[[UNPACK:.+]] = linalg.unpack %[[MATMUL]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]] // CHECK: scf.forall.in_parallel { // CHECK-DAG: tensor.parallel_insert_slice %[[MATMUL]] into %[[MATMUL_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: tensor.parallel_insert_slice %[[UNPACK]] into %[[UNPACK_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] @@ -477,7 +477,7 @@ module { // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#0 into %[[MATMUL_OUT]][%[[ARG0]], %[[ARG1]], 0, 0, 0, 0] [4, 4, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#1 into %[[UNPACK_OUT]][%[[ARG0]], %[[ARG1]], 0, 0] [4, 4, 32, 32] [1, 1, 1, 1] // CHECK: } -// CHECK: tensor.unpack %[[FORALL_0]]#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]] +// CHECK: linalg.unpack %[[FORALL_0]]#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]] #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)> #map2 = affine_map<(d0, d1, d2, d3, d4, 
d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> @@ -498,9 +498,9 @@ module { %extracted_slice_4 = tensor.extract_slice %4[0, %arg1] [512, 256] [1, 1] : tensor<512x4096xbf16> to tensor<512x256xbf16> %extracted_slice_5 = tensor.extract_slice %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<512x4096xf32> to tensor<256x256xf32> %7 = bufferization.to_tensor %alloc_3 restrict writable : memref<8x8x32x64xbf16, 1 : i32> to tensor<8x8x32x64xbf16> - %pack = tensor.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16> + %pack = linalg.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %7 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16> %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<8x8x64x32xbf16, 1 : i32> to tensor<8x8x64x32xbf16> - %pack_6 = tensor.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16> + %pack_6 = linalg.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %8 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16> %9 = bufferization.to_tensor %alloc_1 restrict writable : memref<8x8x32x32xf32, 1 : i32> to tensor<8x8x32x32xf32> %10 = tensor.empty() : tensor<8x8x8x8x4x4xf32> %11 = scf.forall (%arg3, %arg4) = (0, 0) to (8, 8) step (4, 4) shared_outs(%arg5 = %10) -> (tensor<8x8x8x8x4x4xf32>) { @@ -512,10 +512,10 @@ module { %13 = scf.forall (%arg6, %arg7) in (4, 4) shared_outs(%arg8 = %12) -> (tensor<4x4x8x8x4x4xf32>) { %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[%arg6, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : tensor<4x1x32x64xbf16> to tensor<1x1x32x64xbf16> %14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x8x4x8xbf16, 2 : i32> to tensor<1x1x8x8x4x8xbf16> - %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16> + %pack_13 = linalg.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16> %extracted_slice_14 = tensor.extract_slice %extracted_slice_11[%arg7, 0, 0, 0] [1, 1, 64, 32] [1, 1, 1, 1] : tensor<4x1x64x32xbf16> to tensor<1x1x64x32xbf16> %15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x8x8x4xbf16, 2 : i32> to tensor<1x1x8x8x8x4xbf16> - %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16> + %pack_15 = linalg.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16> %extracted_slice_16 = tensor.extract_slice %arg8[%arg6, %arg7, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> %16 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x8x4x8xbf16>, tensor<1x1x8x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) { ^bb0(%in: bf16, %in_17: bf16, %out: f32): @@ -533,8 +533,8 @@ module { tensor.parallel_insert_slice %13 into %arg5[%arg3, %arg4, 0, 0, 0, 
0] [4, 4, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> into tensor<8x8x8x8x4x4xf32>
 }
 } {mapping = [#gpu.block, #gpu.block]}
- %unpack = tensor.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<8x8x8x8x4x4xf32> -> tensor<8x8x32x32xf32>
- %unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xf32> -> tensor<256x256xf32>
+ %unpack = linalg.unpack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<8x8x8x8x4x4xf32> -> tensor<8x8x32x32xf32>
+ %unpack_7 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xf32> -> tensor<256x256xf32>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<256x256xf32> into tensor<512x4096xf32>
 }
@@ -562,7 +562,7 @@ module {
 // CHECK-SAME: ins(%[[MATMUL]] : tensor<1x1x8x8x4x4xf32>)
 // CHECK: arith.truncf
 // CHECK: %[[EXTRACT_SLICE_1:.+]] = tensor.extract_slice %[[UNPACK_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1]
-// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ELEMWISE]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]]
+// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ELEMWISE]] outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %[[EXTRACT_SLICE_1]]
 // CHECK: scf.forall.in_parallel {
 // CHECK-DAG: tensor.parallel_insert_slice %[[MATMUL]] into %[[MATMUL_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: tensor.parallel_insert_slice %[[ELEMWISE]] into %[[ELEMWISE_LOCAL_OUT]][%[[ARG2]], %[[ARG3]], 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
@@ -574,7 +574,7 @@ module {
 // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#1 into %[[ELEMWISE_OUT]][%[[ARG0]], %[[ARG1]], 0, 0, 0, 0] [4, 4, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: tensor.parallel_insert_slice %[[FORALL_1]]#2 into %[[UNPACK_OUT]][%[[ARG0]], %[[ARG1]], 0, 0] [4, 4, 32, 32] [1, 1, 1, 1]
 // CHECK: }
-// CHECK: tensor.unpack %[[FORALL_0]]#2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]]
+// CHECK: linalg.unpack %[[FORALL_0]]#2 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[EXTRACT_SLICE_0]]
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d2, d4, d5, d8, d7)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
@@ -595,9 +595,9 @@ module {
 %extracted_slice_4 = tensor.extract_slice %1[0, %arg1] [512, 256] [1, 1] : tensor<512x4096xbf16> to tensor<512x256xbf16>
 %extracted_slice_5 = tensor.extract_slice %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<512x4096xbf16> to tensor<256x256xbf16>
 %4 = bufferization.to_tensor %alloc_3 restrict writable : memref<8x8x32x64xbf16, 1 : i32> to tensor<8x8x32x64xbf16>
- %pack = tensor.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %4 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16>
+ %pack = linalg.pack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %4 : tensor<256x512xbf16> -> tensor<8x8x32x64xbf16>
 %5 = bufferization.to_tensor %alloc_2 restrict writable : memref<8x8x64x32xbf16, 1 : i32> to tensor<8x8x64x32xbf16>
- %pack_6 = tensor.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %5 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16>
+ %pack_6 = linalg.pack %extracted_slice_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %5 : tensor<512x256xbf16> -> tensor<8x8x64x32xbf16>
 %6 = bufferization.to_tensor %alloc_1 restrict writable : memref<8x8x32x32xbf16, 1 : i32> to tensor<8x8x32x32xbf16>
 %7 = tensor.empty() : tensor<8x8x8x8x4x4xbf16>
 %8 = tensor.empty() : tensor<8x8x8x8x4x4xf32>
@@ -610,10 +610,10 @@ module {
 %12 = scf.forall (%arg6, %arg7) in (4, 4) shared_outs(%arg8 = %11) -> (tensor<4x4x8x8x4x4xf32>) {
 %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[%arg6, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : tensor<4x1x32x64xbf16> to tensor<1x1x32x64xbf16>
 %13 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x8x4x8xbf16, 2 : i32> to tensor<1x1x8x8x4x8xbf16>
- %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16>
+ %pack_13 = linalg.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x64xbf16> -> tensor<1x1x8x8x4x8xbf16>
 %extracted_slice_14 = tensor.extract_slice %extracted_slice_11[%arg7, 0, 0, 0] [1, 1, 64, 32] [1, 1, 1, 1] : tensor<4x1x64x32xbf16> to tensor<1x1x64x32xbf16>
 %14 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x8x8x4xbf16, 2 : i32> to tensor<1x1x8x8x8x4xbf16>
- %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %14 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16>
+ %pack_15 = linalg.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %14 : tensor<1x1x64x32xbf16> -> tensor<1x1x8x8x8x4xbf16>
 %extracted_slice_16 = tensor.extract_slice %arg8[%arg6, %arg7, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<4x4x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32>
 %15 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x8x4x8xbf16>, tensor<1x1x8x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) {
 ^bb0(%in: bf16, %in_17: bf16, %out: f32):
@@ -636,8 +636,8 @@ module {
 %11 = arith.truncf %in : f32 to bf16
 linalg.yield %11 : bf16
 } -> tensor<8x8x8x8x4x4xbf16>
- %unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %6 : tensor<8x8x8x8x4x4xbf16> -> tensor<8x8x32x32xbf16>
- %unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xbf16> -> tensor<256x256xbf16>
+ %unpack = linalg.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %6 : tensor<8x8x8x8x4x4xbf16> -> tensor<8x8x32x32xbf16>
+ %unpack_7 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<8x8x32x32xbf16> -> tensor<256x256xbf16>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [256, 256] [1, 1] : tensor<256x256xbf16> into tensor<512x4096xbf16>
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir
index 925fbd2ae..9ed41e3a4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_producer_into_loop.mlir
@@ -14,9 +14,9 @@ func.func @fuse_pack_into_for(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x5
 %c0_i32 = arith.constant 0 : i32
 %c0 = arith.constant 0 : index
 %15 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
- %pack_8 = tensor.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
+ %pack_8 = linalg.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
 %16 = tensor.empty() : tensor<1x1x4x64x8x8xi32>
- %pack_9 = tensor.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x512x32xi32> -> tensor<1x1x4x64x8x8xi32>
+ %pack_9 = linalg.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x512x32xi32> -> tensor<1x1x4x64x8x8xi32>
 %17 = tensor.empty() : tensor<1x1x4x8x4x8xi32>
 %18 = linalg.fill ins(%c0_i32 : i32) outs(%17 : tensor<1x1x4x8x4x8xi32>) -> tensor<1x1x4x8x4x8xi32>
 %19 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %18) -> (tensor<1x1x4x8x4x8xi32>) {
@@ -38,10 +38,10 @@ func.func @fuse_pack_into_for(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x5
 // DEPTH-1: {
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
-// DEPTH-1: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
+// DEPTH-1: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
 // DEPTH-1: tensor.extract_slice %{{.*}} : tensor<1x1x4x64x8x8xi32> to tensor<1x1x4x4x8x8xi32>
-// DEPTH-1: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
+// DEPTH-1: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
 // DEPTH-1: linalg.generic
 // DEPTH-1: }
@@ -63,15 +63,15 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3] [2048, 64] [1, 1] : tensor<2048x2048xi32> to tensor<2048x64xi32>
 %extracted_slice_1 = tensor.extract_slice %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
 %2 = tensor.empty() : tensor<1x64x64x32xi32>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x2048xi32> -> tensor<1x64x64x32xi32>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x2048xi32> -> tensor<1x64x64x32xi32>
 %3 = tensor.empty() : tensor<64x1x32x64xi32>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<2048x64xi32> -> tensor<64x1x32x64xi32>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<2048x64xi32> -> tensor<64x1x32x64xi32>
 %alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
 %4 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x64xi32, 1 : i32> to tensor<1x1x64x64xi32>
 %5 = tensor.empty() : tensor<1x64x4x16x4x8xi32>
- %pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x64x64x32xi32> -> tensor<1x64x4x16x4x8xi32>
+ %pack_3 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x64x64x32xi32> -> tensor<1x64x4x16x4x8xi32>
 %6 = tensor.empty() : tensor<64x1x16x4x8x4xi32>
- %pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %6 : tensor<64x1x32x64xi32> -> tensor<64x1x16x4x8x4xi32>
+ %pack_4 = linalg.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %6 : tensor<64x1x32x64xi32> -> tensor<64x1x16x4x8x4xi32>
 %alloc_5 = memref.alloc() : memref<1x1x16x16x4x4xi32, 2 : i32>
 %7 = bufferization.to_tensor %alloc_5 restrict writable : memref<1x1x16x16x4x4xi32, 2 : i32> to tensor<1x1x16x16x4x4xi32>
 %8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<1x1x16x16x4x4xi32>) -> tensor<1x1x16x16x4x4xi32>
@@ -86,8 +86,8 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 } -> tensor<1x1x16x16x4x4xi32>
 scf.yield %10 : tensor<1x1x16x16x4x4xi32>
 }
- %unpack = tensor.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
- %unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
+ %unpack_6 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 memref.dealloc %alloc : memref<1x1x64x64xi32, 1 : i32>
 memref.dealloc %alloc_5 : memref<1x1x16x16x4x4xi32, 2 : i32>
 scf.forall.in_parallel {
@@ -102,10 +102,10 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 // DEPTH-1: {
 // DEPTH-1: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x64x32xi32> to tensor<1x1x64x32xi32>
 // DEPTH-1: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// DEPTH-1: %[[PACK_1:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// DEPTH-1: %[[PACK_1:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // DEPTH-1: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x32x64xi32> to tensor<1x1x32x64xi32>
 // DEPTH-1: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// DEPTH-1: %[[PACK_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// DEPTH-1: %[[PACK_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // DEPTH-1: linalg.generic {{.*}} ins(%[[PACK_1]], %[[PACK_2]] :
 // DEPTH-1: }
@@ -114,14 +114,14 @@ func.func @fuse_multilevel_pack_into_for(%arg0: tensor<2048x2048xi32>, %arg1: te
 // DEPTH-2: {
 // DEPTH-2: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<64x2048xi32> to tensor<64x32xi32>
 // DEPTH-2: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x64x32xi32> to tensor<1x1x64x32xi32>
-// DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
+// DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
 // DEPTH-2: %[[PACK_1_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<1x64x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = tensor.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = linalg.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // DEPTH-2: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<2048x64xi32> to tensor<32x64xi32>
 // DEPTH-2: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x32x64xi32> to tensor<1x1x32x64xi32>
-// DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
+// DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
 // DEPTH-2: %[[PACK_2_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<64x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = tensor.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = linalg.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // DEPTH-2: linalg.generic {{.*}} ins(%[[PACK_1_DEPTH_1]], %[[PACK_2_DEPTH_1]] :
 // DEPTH-2: }
@@ -156,14 +156,14 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 %10 = affine.apply #map(%arg5)
 %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %10] [64, 32] [1, 1] : tensor<64x2048xi32> to tensor<64x32xi32>
 %extracted_slice_5 = tensor.extract_slice %2[0, %arg5, 0, 0] [1, 1, 64, 32] [1, 1, 1, 1] : tensor<1x64x64x32xi32> to tensor<1x1x64x32xi32>
- %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %extracted_slice_5 : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
+ %pack = linalg.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %extracted_slice_5 : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
 %extracted_slice_6 = tensor.extract_slice %5[0, %arg5, 0, 0, 0, 0] [1, 1, 4, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x64x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
- %pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+ %pack_7 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 %extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%10, 0] [32, 64] [1, 1] : tensor<2048x64xi32> to tensor<32x64xi32>
 %extracted_slice_9 = tensor.extract_slice %3[%arg5, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : tensor<64x1x32x64xi32> to tensor<1x1x32x64xi32>
- %pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %extracted_slice_9 : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
+ %pack_10 = linalg.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %extracted_slice_9 : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
 %extracted_slice_11 = tensor.extract_slice %6[%arg5, 0, 0, 0, 0, 0] [1, 1, 16, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<64x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
- %pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+ %pack_12 = linalg.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 %11 = scf.forall (%arg7, %arg8) in (1, 1) shared_outs(%arg9 = %arg6) -> (tensor<1x1x16x16x4x4xi32>) {
 %extracted_slice_13 = tensor.extract_slice %pack_7[%arg7, 0, 0, 0, 0, 0] [1, 1, 4, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
 %extracted_slice_14 = tensor.extract_slice %pack_12[0, %arg8, 0, 0, 0, 0] [1, 1, 16, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
@@ -180,8 +180,8 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 } {mapping = [#gpu.block, #gpu.block]}
 scf.yield %11 : tensor<1x1x16x16x4x4xi32>
 }
- %unpack = tensor.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
- %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %4 : tensor<1x1x16x16x4x4xi32> -> tensor<1x1x64x64xi32>
+ %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 memref.dealloc %alloc : memref<1x1x64x64xi32, 1 : i32>
 memref.dealloc %alloc_2 : memref<1x1x16x16x4x4xi32, 2 : i32>
 scf.forall.in_parallel {
@@ -198,10 +198,10 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 // FORALL-DEPTH-1: {
 // FORALL-DEPTH-1: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x64x32xi32> to tensor<1x1x64x32xi32>
 // FORALL-DEPTH-1: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// FORALL-DEPTH-1: %[[PACK_1:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// FORALL-DEPTH-1: %[[PACK_1:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // FORALL-DEPTH-1: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x32x64xi32> to tensor<1x1x32x64xi32>
 // FORALL-DEPTH-1: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// FORALL-DEPTH-1: %[[PACK_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// FORALL-DEPTH-1: %[[PACK_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // FORALL-DEPTH-1: linalg.generic {{.*}} ins(%[[PACK_1]], %[[PACK_2]] :
 // FORALL-DEPTH-1: }
 // FORALL-DEPTH-1: }
@@ -213,14 +213,14 @@ func.func @fuse_multilevel_pack_into_forall(%arg0: tensor<2048x2048xi32>, %arg1:
 // FORALL-DEPTH-2: {
 // FORALL-DEPTH-2: %[[PACK_1_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<64x32xi32> to tensor<64x32xi32>
 // FORALL-DEPTH-2: %[[PACK_1_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x64x32xi32> to tensor<1x1x64x32xi32>
-// FORALL-DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = tensor.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
+// FORALL-DEPTH-2: %[[PACK_1_DEPTH_2:.*]] = linalg.pack %[[PACK_1_SOURCE]] {{.*}} into %[[PACK_1_DEST]] : tensor<64x32xi32> -> tensor<1x1x64x32xi32>
 // FORALL-DEPTH-2: %[[PACK_1_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x4x16x4x8xi32> to tensor<1x1x4x16x4x8xi32>
-// FORALL-DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = tensor.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
+// FORALL-DEPTH-2: %[[PACK_1_DEPTH_1:.*]] = linalg.pack %[[PACK_1_DEPTH_2]] {{.*}} into %[[PACK_1_DEST_2]] : tensor<1x1x64x32xi32> -> tensor<1x1x4x16x4x8xi32>
 // FORALL-DEPTH-2: %[[PACK_2_SOURCE:.*]] = tensor.extract_slice %{{.*}} : tensor<32x64xi32> to tensor<32x64xi32>
 // FORALL-DEPTH-2: %[[PACK_2_DEST:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x32x64xi32> to tensor<1x1x32x64xi32>
-// FORALL-DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = tensor.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
+// FORALL-DEPTH-2: %[[PACK_2_DEPTH_2:.*]] = linalg.pack %[[PACK_2_SOURCE]] {{.*}} into %[[PACK_2_DEST]] : tensor<32x64xi32> -> tensor<1x1x32x64xi32>
 // FORALL-DEPTH-2: %[[PACK_2_DEST_2:.*]] = tensor.extract_slice %{{.*}} : tensor<1x1x16x4x8x4xi32> to tensor<1x1x16x4x8x4xi32>
-// FORALL-DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = tensor.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
+// FORALL-DEPTH-2: %[[PACK_2_DEPTH_1:.*]] = linalg.pack %[[PACK_2_DEPTH_2]] {{.*}} into %[[PACK_2_DEST_2]] : tensor<1x1x32x64xi32> -> tensor<1x1x16x4x8x4xi32>
 // FORALL-DEPTH-2: linalg.generic {{.*}} ins(%[[PACK_1_DEPTH_1]], %[[PACK_2_DEPTH_1]] :
 // FORALL-DEPTH-2: }
 // FORALL-DEPTH-2: }
@@ -382,9 +382,9 @@ func.func @pack_without_slice(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x3
 %c0_i32 = arith.constant 0 : i32
 %c0 = arith.constant 0 : index
 %15 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
- %pack_8 = tensor.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
+ %pack_8 = linalg.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
 %16 = tensor.empty() : tensor<1x1x4x4x8x8xi32>
- %pack_10 = tensor.pack %arg1 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
+ %pack_10 = linalg.pack %arg1 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
 %17 = tensor.empty() : tensor<1x1x4x8x4x8xi32>
 %18 = linalg.fill ins(%c0_i32 : i32) outs(%17 : tensor<1x1x4x8x4x8xi32>) -> tensor<1x1x4x8x4x8xi32>
@@ -403,7 +403,7 @@ func.func @pack_without_slice(%arg0: tensor<1x1x32x512xi32>, %arg1: tensor<1x1x3
 // DEPTH-1-LABEL: pack_without_slice
 // DEPTH-1: scf.for
-// DEPTH-1-DAG: %[[PACK_1:.*]] = tensor.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
-// DEPTH-1-DAG: %[[PACK_2:.*]] = tensor.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
+// DEPTH-1-DAG: %[[PACK_1:.*]] = linalg.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x4x8x8xi32>
+// DEPTH-1-DAG: %[[PACK_2:.*]] = linalg.pack %{{.*}} into %{{.*}} : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
 // DEPTH-1: linalg.generic
 // DEPTH-1-SAME: ins(%[[PACK_2]], %[[PACK_1]]
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
index 053f9f7ed..5173b4c6d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
@@ -9,9 +9,9 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0 : tensor<
 %c0_i32 = arith.constant 0 : i32
 %0 = tensor.empty() : tensor<16x256xi32>
 %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<16x256xi32>) -> tensor<16x256xi32>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %{{.*}} : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %{{.*}} : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %{{.*}} : tensor<16x256xi32> -> tensor<1x1x16x256xi32>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %{{.*}} : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %{{.*}} : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %{{.*}} : tensor<16x256xi32> -> tensor<1x1x16x256xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %2 = linalg.matmul {lowering_config = #config, packing_config = #packingConfig} ins(%arg0, %arg1 : tensor<16x256xi8>, tensor<256x256xi8>) outs(%1 : tensor<16x256xi32>) -> tensor<16x256xi32>
@@ -29,9 +29,9 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
 %c0_i32 = arith.constant 0 : i32
 %0 = tensor.empty() : tensor<256x1024xi32>
 %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<256x512xi32> -> tensor<4x16x64x32xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %{{.*}} : tensor<1024x512xi32> -> tensor<16x16x64x32xi32>
- // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %{{.*}} : tensor<256x1024xi32> -> tensor<4x16x64x64xi32>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<256x512xi32> -> tensor<4x16x64x32xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %{{.*}} : tensor<1024x512xi32> -> tensor<16x16x64x32xi32>
+ // CHECK: linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %{{.*}} : tensor<256x1024xi32> -> tensor<4x16x64x64xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %2 = linalg.matmul_transpose_b {lowering_config = #config, packing_config = #packingConfig} ins(%arg0, %arg1 : tensor<256x512xi32>, tensor<1024x512xi32>) outs(%1 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir
index 23f0f9acc..162075365 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level2.mlir
@@ -15,14 +15,14 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0: tensor<1
 %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3] [256, 256] [1, 1] : tensor<256x256xi8> to tensor<256x256xi8>
 %extracted_slice_1 = tensor.extract_slice %arg4[%arg2, %arg3] [16, 256] [1, 1] : tensor<16x256xi32> to tensor<16x256xi32>
 %2 = tensor.empty() : tensor<1x2x16x128xi8>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %2 : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [16, 128] into %2 : tensor<16x256xi8> -> tensor<1x2x16x128xi8>
 %3 = tensor.empty() : tensor<2x1x128x256xi8>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %3 : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %3 : tensor<256x256xi8> -> tensor<2x1x128x256xi8>
 %4 = tensor.empty() : tensor<1x1x16x256xi32>
 %5 = linalg.fill ins(%c0_i32 : i32) outs(%4 : tensor<1x1x16x256xi32>) -> tensor<1x1x16x256xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x2x16x128xi8> -> tensor<1x2x16x4x4x8xi8>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<2x1x128x256xi8> -> tensor<2x1x32x16x8x8xi8>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x16x256xi32> -> tensor<1x1x32x4x4x8xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x2x16x128xi8> -> tensor<1x2x16x4x4x8xi8>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %{{.*}} : tensor<2x1x128x256xi8> -> tensor<2x1x32x16x8x8xi8>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x1x16x256xi32> -> tensor<1x1x32x4x4x8xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %6 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x2x16x128xi8>, tensor<2x1x128x256xi8>) outs(%5 : tensor<1x1x16x256xi32>) attrs = {lowering_config = #config, packing_config = #packingConfig} {
@@ -33,7 +33,7 @@ func.func @matmul_example_dispatch_0_matmul_16x256x256_i8xi8xi32(%arg0: tensor<1
 %10 = arith.addi %out, %9 : i32
 linalg.yield %10 : i32
 } -> tensor<1x1x16x256xi32>
- %unpack = tensor.unpack %6 inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %extracted_slice_1 : tensor<1x1x16x256xi32> -> tensor<16x256xi32>
+ %unpack = linalg.unpack %6 inner_dims_pos = [0, 1] inner_tiles = [16, 256] into %extracted_slice_1 : tensor<1x1x16x256xi32> -> tensor<16x256xi32>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack into %arg4[%arg2, %arg3] [16, 256] [1, 1] : tensor<16x256xi32> into tensor<16x256xi32>
 }
@@ -59,13 +59,13 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
 %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, 0] [64, 512] [1, 1] : tensor<1024x512xi32> to tensor<64x512xi32>
 %extracted_slice_1 = tensor.extract_slice %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<256x1024xi32> to tensor<64x64xi32>
 %2 = tensor.empty() : tensor<1x16x64x32xi32>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi32> -> tensor<1x16x64x32xi32>
 %3 = tensor.empty() : tensor<1x1x64x64xi32>
 %4 = linalg.fill ins(%c0_i32 : i32) outs(%3 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
- // CHECK: tensor.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %{{.*}} : tensor<1x1x64x64xi32> -> tensor<1x1x16x16x4x4xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %{{.*}} : tensor<1x16x64x32xi32> -> tensor<1x16x4x16x4x8xi32>
+ // CHECK: linalg.pack %{{.*}} outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %{{.*}} : tensor<1x1x64x64xi32> -> tensor<1x1x16x16x4x4xi32>
 // CHECK: linalg.generic
 // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
 %5 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x16x64x32xi32>, tensor<1x16x64x32xi32>) outs(%4 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #config, packing_config = #packingConfig} {
@@ -74,7 +74,7 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
 %7 = arith.addi %out, %6 : i32
 linalg.yield %7 : i32
 } -> tensor<1x1x64x64xi32>
- %unpack = tensor.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 scf.forall.in_parallel {
 tensor.parallel_insert_slice %unpack into %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<256x1024xi32>
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir
index cbf64e2b4..2c62c8871 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/propagate_data_layout.mlir
@@ -8,9 +8,9 @@ func.func @matmul_static(%arg0: tensor<1x4x16x64xi32>, %arg1: tensor<4x1x64x64xi32>) -> tensor<1x1x16x64xi32> {
 %c0_i32 = arith.constant 0 : i32
 %0 = tensor.empty() : tensor<1x4x8x4x4x8xi32>
- %pack = tensor.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
+ %pack = linalg.pack %arg0 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
 %1 = tensor.empty() : tensor<4x1x8x8x8x8xi32>
- %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %1 : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
+ %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %1 : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
 %2 = tensor.empty() : tensor<1x1x8x4x4x8xi32>
 %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1x1x8x4x4x8xi32>) -> tensor<1x1x8x4x4x8xi32>
 %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_0 : tensor<1x4x8x4x4x8xi32>, tensor<4x1x8x8x8x8xi32>) outs(%3 : tensor<1x1x8x4x4x8xi32>) {
@@ -20,7 +20,7 @@ func.func @matmul_static(%arg0: tensor<1x4x16x64xi32>, %arg1: tensor<4x1x64x64xi
 linalg.yield %7 : i32
 } -> tensor<1x1x8x4x4x8xi32>
 %empty = tensor.empty() : tensor<1x1x16x64xi32>
- %unpack = tensor.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %empty : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
+ %unpack = linalg.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %empty : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
 %empty2 = tensor.empty() : tensor<1x1x16x64xi32>
 %fill = linalg.fill ins(%c0_i32 : i32) outs(%empty2 : tensor<1x1x16x64xi32>) -> tensor<1x1x16x64xi32>
 %5 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%unpack: tensor<1x1x16x64xi32>) outs(%fill : tensor<1x1x16x64xi32>) {
@@ -33,14 +33,14 @@ func.func @matmul_static(%arg0: tensor<1x4x16x64xi32>, %arg1: tensor<4x1x64x64xi
 }
 // CHECK-LABEL: matmul_static
-// CHECK: %[[PACK_0:.*]] = tensor.pack {{.*}} : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
-// CHECK: %[[PACK_1:.*]] = tensor.pack {{.*}} : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
+// CHECK: %[[PACK_0:.*]] = linalg.pack {{.*}} : tensor<1x4x16x64xi32> -> tensor<1x4x8x4x4x8xi32>
+// CHECK: %[[PACK_1:.*]] = linalg.pack {{.*}} : tensor<4x1x64x64xi32> -> tensor<4x1x8x8x8x8xi32>
 // CHECK: %[[FILL:.*]] = linalg.fill {{.*}} -> tensor<1x1x8x4x4x8xi32>
 // CHECK: %[[MATMUL_0:.*]] = linalg.generic {{.*}} ins(%[[PACK_0]], %[[PACK_1]] : tensor<1x4x8x4x4x8xi32>, tensor<4x1x8x8x8x8xi32>) outs(%[[FILL]] : tensor<1x1x8x4x4x8xi32>)
-// CHECK-NOT: tensor.unpack
-// CHECK-NOT: tensor.pack
+// CHECK-NOT: linalg.unpack
+// CHECK-NOT: linalg.pack
 // CHECK: %[[MATMUL_1:.*]] = linalg.generic {{.*}} ins(%[[MATMUL_0]] : tensor<1x1x8x4x4x8xi32>) outs(%[[FILL]] : tensor<1x1x8x4x4x8xi32>)
-// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[MATMUL_1:.*]] {{.*}} : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
+// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[MATMUL_1:.*]] {{.*}} : tensor<1x1x8x4x4x8xi32> -> tensor<1x1x16x64xi32>
 // -----
@@ -54,12 +54,12 @@ func.func @matmul_elementwise_1024x1024x512_i8xi8xi32(%arg0: tensor<1024x512xi8>
 %extracted_slice_1 = tensor.extract_slice %0[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
 %2 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
 %3 = tensor.empty() : tensor<1x16x64x32xi8>
- %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %3 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
+ %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %3 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
 %4 = tensor.empty() : tensor<16x1x64x32xi8>
 %5 = tensor.empty() : tensor<16x1x32x64xi8>
- %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %5 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
+ %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %5 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
 %6 = tensor.empty() : tensor<1x1x64x64xi32>
- %pack_3 = tensor.pack %2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+ %pack_3 = linalg.pack %2 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
 %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x16x64x32xi8>, tensor<16x1x32x64xi8>) outs(%pack_3 : tensor<1x1x64x64xi32>) {
 ^bb0(%in: i8, %in_6: i8, %out: i32):
 %9 = arith.extsi %in : i8 to i32
@@ -68,7 +68,7 @@ func.func @matmul_elementwise_1024x1024x512_i8xi8xi32(%arg0: tensor<1024x512xi8>
 %12 = arith.addi %out, %11 : i32
 linalg.yield %12 : i32
 } -> tensor<1x1x64x64xi32>
- %unpack = tensor.unpack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %2 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+ %unpack = linalg.unpack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %2 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
 %extracted_slice_4 = tensor.extract_slice %arg2[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
 %extracted_slice_5 = tensor.extract_slice %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
 %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack, %extracted_slice_4 : tensor<64x64xi32>, tensor<64x64xi32>) outs(%extracted_slice_5 : tensor<64x64xi32>) {
@@ -84,12 +84,12 @@ func.func @matmul_elementwise_1024x1024x512_i8xi8xi32(%arg0: tensor<1024x512xi8>
 }
 // CHECK-LABEL: matmul_elementwise_1024x1024x512_i8xi8xi32
-// CHECK: %[[PACK_0:.*]] = tensor.pack {{.*}} : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
-// CHECK: %[[PACK_1:.*]] = tensor.pack {{.*}} : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
+// CHECK: %[[PACK_0:.*]] = linalg.pack {{.*}} : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
+// CHECK: %[[PACK_1:.*]] = linalg.pack {{.*}} : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
 // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<1x1x64x64xi32>
 // CHECK: %[[FILL:.*]] = linalg.fill {{.*}} -> tensor<1x1x64x64xi32>
 // CHECK: %[[MATMUL:.*]] = linalg.generic {{.*}} ins(%[[PACK_0]], %[[PACK_1]] : tensor<1x16x64x32xi8>, tensor<16x1x32x64xi8>) outs(%[[FILL]] : tensor<1x1x64x64xi32>)
-// CHECK-NOT: tensor.unpack
-// CHECK: %[[PACK_2:.*]] = tensor.pack {{.*}} : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+// CHECK-NOT: linalg.unpack
+// CHECK: %[[PACK_2:.*]] = linalg.pack {{.*}} : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
 // CHECK: %[[ELEMENT:.*]] = linalg.generic {{.*}} ins(%[[MATMUL]], %[[PACK_2]] : tensor<1x1x64x64xi32>, tensor<1x1x64x64xi32>) outs(%[[EMPTY]] : tensor<1x1x64x64xi32>)
-// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[ELEMENT:.*]] {{.*}} : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[ELEMENT:.*]] {{.*}} : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
diff --git a/third_party/iree b/third_party/iree
index 756e9e661..055ce1f80 160000
--- a/third_party/iree
+++ b/third_party/iree
@@ -1 +1 @@
-Subproject commit 756e9e66138129df1bb28a2f2fac06058f976bcf
+Subproject commit 055ce1f80c87f9087035db5e300668553d2871e2