Bump IREE to 055ce1f (#1124)
The main change is to update `tensor.pack`/`tensor.unpack` to `linalg.pack`/`linalg.unpack`, following the upstream change llvm/llvm-project#123902.
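The rename is mechanical: the ops keep their syntax and semantics and only move from the tensor dialect to the linalg dialect. A minimal illustrative sketch (`%src` and `%dest` are placeholder names, not taken from this diff):

    // Before: the pack op lives in the tensor dialect.
    %packed = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [64, 64]
        into %dest : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32>

    // After: same op, same syntax, now in the linalg dialect.
    %packed = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [64, 64]
        into %dest : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32>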
yzhang93 authored Feb 21, 2025 · 1 parent 5cfa07a · commit b005ec2
Showing 15 changed files with 186 additions and 186 deletions.
@@ -76,7 +76,7 @@ static FailureOr<SmallVector<Value>> getPackOrCopyOperands(
   uint32_t currentLevel{0};
   Operation *currentOp = input.value().getDefiningOp();
   while (currentLevel < depthLevel && currentOp != nullptr) {
-    if (dyn_cast<tensor::PackOp>(currentOp)) {
+    if (dyn_cast<linalg::PackOp>(currentOp)) {
       currentLevel++;
       if (currentLevel == depthLevel) break;
     } else if (dyn_cast<linalg::CopyOp>(currentOp)) {

@@ -21,7 +21,7 @@ namespace {

 /// A utility function specific to this pass which, given a value `operand`,
 /// traverses the def-chain till it finds a tensor.extract_slice. Currently,
-/// the two producer ops that are allowed in the def-chain are tensor.pack and
+/// the two producer ops that are allowed in the def-chain are linalg.pack and
 /// linalg.copy ops. The 2 cases where it successfully finds and returns an
 /// extract_slice (SLICE) are:
 ///
@@ -39,7 +39,7 @@ namespace {
 static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp(
     Value operand) {
   // Roll back through all the pack or copy ops immediately preceding `operand`.
-  while (isa_and_present<tensor::PackOp, linalg::CopyOp>(
+  while (isa_and_present<linalg::PackOp, linalg::CopyOp>(
       operand.getDefiningOp())) {
     operand = operand.getDefiningOp()->getOperand(0);
   }
@@ -49,7 +49,7 @@ static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp(
   if (!sliceOp) return failure();

   // Case 1 outlined above.
-  if (isa_and_present<tensor::PackOp, linalg::CopyOp>(
+  if (isa_and_present<linalg::PackOp, linalg::CopyOp>(
          sliceOp.getSource().getDefiningOp())) {
     return sliceOp;
   }
@@ -60,7 +60,7 @@ static FailureOr<tensor::ExtractSliceOp> getTensorExtractSliceDefiningOp(
   LoopLikeOpInterface loop = dyn_cast<LoopLikeOpInterface>(parent);
   if (!loop) return failure();
   Operation *operandParent = loop.getTiedLoopInit(blkArg)->getOwner();
-  if (isa_and_present<tensor::PackOp, linalg::CopyOp>(operandParent))
+  if (isa_and_present<linalg::PackOp, linalg::CopyOp>(operandParent))
     return sliceOp;
 }

@@ -110,7 +110,7 @@ void AMDAIEFuseProducerIntoLoopPass::runOnOperation() {
   LoopLikeOpInterface loops = cast<LoopLikeOpInterface>(scfLoopOp);

   // Based on the `fuseDepth`, we would greedily fuse the producers of a linalg
-  // computation op. Currently, we are limiting the producers to tensor.pack or
+  // computation op. Currently, we are limiting the producers to linalg.pack or
   // linalg.copy ops.
   for (unsigned depth = 1; depth <= fuseDepth; depth++) {
     // Search the last compute op in the loop and its producer slices.
@@ -153,7 +153,7 @@ void AMDAIEFuseProducerIntoLoopPass::runOnOperation() {

     // Case where operand of a generic op is a pack/copy op which is in a
     // different block than the generic's block.
-    else if (isa_and_present<tensor::PackOp, linalg::CopyOp>(
+    else if (isa_and_present<linalg::PackOp, linalg::CopyOp>(
                 operand.getDefiningOp())) {
       Operation *parent = operand.getDefiningOp();
       Block *genericBlock = genericOp->getBlock();

@@ -106,9 +106,9 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
   }

   // Step 3. Pack Transpose
-  SmallVector<tensor::PackOp> packOps = packResult->packOps;
+  SmallVector<linalg::PackOp> packOps = packResult->packOps;
   linalg::LinalgOp packedOp = packResult->packedLinalgOp;
-  SmallVector<tensor::UnPackOp> unpackOps = packResult->unPackOps;
+  SmallVector<linalg::UnPackOp> unpackOps = packResult->unPackOps;

   if (packOps.size() != 3 || !packedOp || unpackOps.empty()) {
     funcOp->emitOpError("failed to get correct pack and unpack ops");
@@ -122,7 +122,7 @@ void AMDAIEPackAndTransposePass::runOnOperation() {

   for (auto [index, unpackEmpty, innerPerm, outerPerm] :
        llvm::zip(packIndices, unpackArr, innerPermArr, outerPermArr)) {
-    tensor::UnPackOp unpackOp;
+    linalg::UnPackOp unpackOp;
     if (unpackEmpty) {
       unpackOp = unpackOps.back();
     }

@@ -178,8 +178,8 @@ static bool isTilingReductionDimension(TilingInterface consumerOp,
 }

 static bool consumerToSkip(TilingInterface op) {
-  if (isa<linalg::CopyOp>(op) || isa<tensor::PackOp>(op) ||
-      isa<tensor::UnPackOp>(op))
+  if (isa<linalg::CopyOp>(op) || isa<linalg::PackOp>(op) ||
+      isa<linalg::UnPackOp>(op))
     return true;
   return false;
 }
@@ -279,7 +279,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
   TilingInterface consumerOp;
   funcOp->walk<WalkOrder::PostOrder, ReverseIterator>([&](TilingInterface op) {
     // Find the next consumer op if it does not have loops OR it is from
-    // the skip ops list which currently contains linalg.copy and tensor.unpack.
+    // the skip ops list which currently contains linalg.copy and linalg.unpack.
     if (op.getLoopIteratorTypes().empty() || consumerToSkip(op))
       return WalkResult::advance();

@@ -356,7 +356,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
   bool fusableOp =
       TypeSwitch<Operation *, bool>(originalProducer.getOwner())
           // List ops that shouldnt be fused.
-          .Case<tensor::PackOp, tensor::PadOp, linalg::CopyOp,
+          .Case<linalg::PackOp, tensor::PadOp, linalg::CopyOp,
                 memref::CopyOp>([](Operation *) { return false; })
           // Fuse all Linalg ops (can be generalized later)
           .Default([&](Operation *op) {

@@ -794,7 +794,7 @@ void addMLIRAIELoweringPasses(OpPassManager &pm) {
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createConvertLinalgToLoopsPass());
   pm.addPass(createLowerAffinePass());
-  pm.addPass(createConvertSCFToCFPass());
+  pm.addPass(createSCFToControlFlowPass());

   {
     OpPassManager &devicePM = pm.nest<xilinx::AIE::DeviceOp>();

@@ -354,7 +354,7 @@ def AMDAIEFuseProducerIntoLoop :
   let description = [{
     Greedily fuse the producers of a linalg computation op based on the `fuseDepth`.
     Currently, the two producer ops that are allowed in the defining op chain are
-    tensor.pack and linalg.copy ops.
+    linalg.pack and linalg.copy ops.
   }];
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEFuseProducerIntoLoopPass()";
   let options = [
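For orientation, a minimal before/after sketch of the fusion this pass description refers to (hypothetical shapes and names, not taken from this commit): with `fuseDepth = 1`, a `linalg.pack` producer whose result is sliced inside an `scf.forall` is pulled into the loop body, so each iteration packs only the tile it consumes.

    // Before fusion: the whole input is packed once, outside the loop.
    %packed = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [64, 64]
        into %init : tensor<1024x512xi32> -> tensor<16x8x64x64xi32>
    %r = scf.forall (%i, %j) in (16, 8)
        shared_outs(%out = %acc) -> (tensor<16x8x64x64xi32>) {
      %t = tensor.extract_slice %packed[%i, %j, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1]
          : tensor<16x8x64x64xi32> to tensor<1x1x64x64xi32>
      // ... compute consuming %t, then parallel_insert_slice into %out ...
    }

    // After fusion: each iteration extracts its own source tile and packs it.
    %r = scf.forall (%i, %j) in (16, 8)
        shared_outs(%out = %acc) -> (tensor<16x8x64x64xi32>) {
      %oi = affine.apply affine_map<(d0) -> (d0 * 64)>(%i)
      %oj = affine.apply affine_map<(d0) -> (d0 * 64)>(%j)
      %s = tensor.extract_slice %src[%oi, %oj] [64, 64] [1, 1]
          : tensor<1024x512xi32> to tensor<64x64xi32>
      %t = linalg.pack %s inner_dims_pos = [0, 1] inner_tiles = [64, 64]
          into %tile : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
      // ... same compute consuming %t ...
    }

The def-chain walk in the C++ hunks above (pack/copy ops traced back to a tensor.extract_slice) is what establishes that this rewrite is valid before the producer is moved.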

@@ -14,14 +14,14 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x
   %c0 = arith.constant 0 : index
   %5 = tensor.empty() : tensor<1024x512xi32>
   %6 = tensor.empty() : tensor<16x32x64x64xi32>
-  %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32>
+  %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %6 : tensor<1024x2048xi32> -> tensor<16x32x64x64xi32>
   %7 = tensor.empty() : tensor<32x8x64x64xi32>
-  %pack_0 = tensor.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32>
+  %pack_0 = linalg.pack %arg1 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2048x512xi32> -> tensor<32x8x64x64xi32>
   %8 = tensor.empty() : tensor<16x8x64x64xi32>
   %9 = tensor.empty() : tensor<16x32x16x8x4x8xi32>
-  %pack_1 = tensor.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32>
+  %pack_1 = linalg.pack %pack inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %9 : tensor<16x32x64x64xi32> -> tensor<16x32x16x8x4x8xi32>
   %10 = tensor.empty() : tensor<32x8x8x8x8x8xi32>
-  %pack_2 = tensor.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32>
+  %pack_2 = linalg.pack %pack_0 inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %10 : tensor<32x8x64x64xi32> -> tensor<32x8x8x8x8x8xi32>
   %11 = tensor.empty() : tensor<16x8x16x8x4x8xi32>
   %12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<16x8x16x8x4x8xi32>) -> tensor<16x8x16x8x4x8xi32>
   %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_1, %pack_2 : tensor<16x32x16x8x4x8xi32>, tensor<32x8x8x8x8x8xi32>) outs(%12 : tensor<16x8x16x8x4x8xi32>) {
@@ -30,63 +30,63 @@ func.func @matmul_static(%arg0 : tensor<1024x2048xi32>, %arg1 : tensor<2048x512x
     %15 = arith.addi %out, %14 : i32
     linalg.yield %15 : i32
   } -> tensor<16x8x16x8x4x8xi32>
-  %unpack = tensor.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32>
-  %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32>
+  %unpack = linalg.unpack %13 inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %8 : tensor<16x8x16x8x4x8xi32> -> tensor<16x8x64x64xi32>
+  %unpack_3 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %5 : tensor<16x8x64x64xi32> -> tensor<1024x512xi32>
   return %unpack_3 : tensor<1024x512xi32>
 }

 // LINALG-INPUT-OUTPUT-NOT: memref.alloc
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT-NOT: memref.alloc
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
 // LINALG-INPUT-OUTPUT: bufferization.to_tensor
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
 // LINALG-INPUT-OUTPUT: bufferization.to_tensor
-// LINALG-INPUT-OUTPUT: tensor.pack
+// LINALG-INPUT-OUTPUT: linalg.pack
 // LINALG-INPUT-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
 // LINALG-INPUT-OUTPUT: bufferization.to_tensor
 // LINALG-INPUT-OUTPUT: linalg.fill
 // LINALG-INPUT-OUTPUT: linalg.generic

 // LINALG-INPUT-NOT: memref.alloc
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT-NOT: memref.alloc
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT: memref.alloc() : memref<16x32x16x8x4x8xi32, 2 : i32>
 // LINALG-INPUT: bufferization.to_tensor
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT: memref.alloc() : memref<32x8x8x8x8x8xi32, 2 : i32>
 // LINALG-INPUT: bufferization.to_tensor
-// LINALG-INPUT: tensor.pack
+// LINALG-INPUT: linalg.pack
 // LINALG-INPUT-NOT: memref.alloc
 // LINALG-INPUT: linalg.fill
 // LINALG-INPUT: linalg.generic

 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT-NOT: memref.alloc
-// LINALG-OUTPUT: tensor.pack
+// LINALG-OUTPUT: linalg.pack
 // LINALG-OUTPUT: memref.alloc() : memref<16x8x16x8x4x8xi32, 2 : i32>
 // LINALG-OUTPUT: bufferization.to_tensor
 // LINALG-OUTPUT: linalg.fill
 // LINALG-OUTPUT: linalg.generic

 // PACK-INPUT: memref.alloc() : memref<16x32x64x64xi32, 1 : i32>
 // PACK-INPUT: bufferization.to_tensor
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT: memref.alloc() : memref<32x8x64x64xi32, 1 : i32>
 // PACK-INPUT: bufferization.to_tensor
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT-NOT: memref.alloc
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT-NOT: memref.alloc
-// PACK-INPUT: tensor.pack
+// PACK-INPUT: linalg.pack
 // PACK-INPUT-NOT: memref.alloc
 // PACK-INPUT: linalg.fill
 // PACK-INPUT: linalg.generic
@@ -105,14 +105,14 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024
     %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg4] [512, 64] [1, 1] : tensor<512x1024xi8> to tensor<512x64xi8>
     %extracted_slice_1 = tensor.extract_slice %0[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
     %2 = tensor.empty() : tensor<1x16x64x32xi8>
-    %pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
+    %pack = linalg.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %2 : tensor<64x512xi8> -> tensor<1x16x64x32xi8>
     %3 = tensor.empty() : tensor<16x1x32x64xi8>
-    %pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
+    %pack_2 = linalg.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %3 : tensor<512x64xi8> -> tensor<16x1x32x64xi8>
     %4 = tensor.empty() : tensor<1x1x64x64xi32>
     %5 = tensor.empty() : tensor<1x16x4x16x4x8xi8>
-    %pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8>
+    %pack_3 = linalg.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %5 : tensor<1x16x64x32xi8> -> tensor<1x16x4x16x4x8xi8>
     %6 = tensor.empty() : tensor<16x1x8x4x8x8xi8>
-    %pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8>
+    %pack_4 = linalg.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %6 : tensor<16x1x32x64xi8> -> tensor<16x1x8x4x8x8xi8>
     %7 = tensor.empty() : tensor<1x1x8x16x4x8xi32>
     %8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<1x1x8x16x4x8xi32>) -> tensor<1x1x8x16x4x8xi32>
     %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<1x16x4x16x4x8xi8>, tensor<16x1x8x4x8x8xi8>) outs(%8 : tensor<1x1x8x16x4x8xi32>) {
@@ -125,49 +125,49 @@ func.func @matmul_elementwise(%arg0: tensor<1024x512xi8>, %arg1: tensor<512x1024
     } -> tensor<1x1x8x16x4x8xi32>
     %extracted_slice_5 = tensor.extract_slice %arg2[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
     %extracted_slice_6 = tensor.extract_slice %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<1024x1024xi32> to tensor<64x64xi32>
-    %pack_7 = tensor.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
-    %pack_8 = tensor.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
-    %pack_9 = tensor.pack %pack_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
-    %pack_10 = tensor.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
+    %pack_7 = linalg.pack %extracted_slice_6 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+    %pack_8 = linalg.pack %extracted_slice_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %4 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
+    %pack_9 = linalg.pack %pack_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
+    %pack_10 = linalg.pack %pack_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %7 : tensor<1x1x64x64xi32> -> tensor<1x1x8x16x4x8xi32>
     %10 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%9, %pack_10 : tensor<1x1x8x16x4x8xi32>, tensor<1x1x8x16x4x8xi32>) outs(%pack_9 : tensor<1x1x8x16x4x8xi32>) {
     ^bb0(%in: i32, %in_12: i32, %out: i32):
       %11 = arith.addi %in, %in_12 : i32
       linalg.yield %11 : i32
     } -> tensor<1x1x8x16x4x8xi32>
-    %unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32>
-    %unpack_11 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
+    %unpack = linalg.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %4 : tensor<1x1x8x16x4x8xi32> -> tensor<1x1x64x64xi32>
+    %unpack_11 = linalg.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
     scf.forall.in_parallel {
       tensor.parallel_insert_slice %unpack_11 into %arg5[%arg3, %arg4] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<1024x1024xi32>
     }
   } {mapping = [#gpu.block<y>, #gpu.block<x>]}
   return %1 : tensor<1024x1024xi32>
 }

-// ELEMENTWISE-INPUT-COUNT-4: tensor.pack
+// ELEMENTWISE-INPUT-COUNT-4: linalg.pack
 // ELEMENTWISE-INPUT: linalg.fill
 // ELEMENTWISE-INPUT: linalg.generic
 // ELEMENTWISE-INPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
 // ELEMENTWISE-INPUT: bufferization.to_tensor
-// ELEMENTWISE-INPUT: tensor.pack
+// ELEMENTWISE-INPUT: linalg.pack
 // ELEMENTWISE-INPUT: linalg.generic

-// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT-COUNT-4: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: linalg.fill
 // ELEMENTWISE-INPUT-OUTPUT: linalg.generic
 // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT-NOT: memref.alloc
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
 // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: memref.alloc() : memref<1x1x8x16x4x8xi32, 2 : i32>
 // ELEMENTWISE-INPUT-OUTPUT: bufferization.to_tensor
-// ELEMENTWISE-INPUT-OUTPUT: tensor.pack
+// ELEMENTWISE-INPUT-OUTPUT: linalg.pack
 // ELEMENTWISE-INPUT-OUTPUT: linalg.generic