Skip to content

Commit a320a57

Browse files
committed
[ET-VK] Adding all tensor packing support to cat op.
Pull Request resolved: #9331. This diff updates the ExecuTorch Vulkan backend's cat operation to support width-, height- and channel-packed tensors. It also updates the op_registry.py file to indicate that the cat operation supports all packed dimensions, and adds new test cases to the cases.py file to exercise the operation. ghstack-source-id: 272554191 @exported-using-ghexport Differential Revision: [D71230768](https://our.internmc.facebook.com/intern/diff/D71230768/)
1 parent 7b185eb commit a320a57

File tree

10 files changed

+311
-78
lines changed

10 files changed

+311
-78
lines changed

backends/vulkan/op_registry.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,6 @@ def register_view_op(features: OpFeatures):
528528
exir_ops.edge.aten.index_select.default,
529529
exir_ops.edge.aten.select_copy.int,
530530
# Tensor combination
531-
exir_ops.edge.aten.cat.default,
532531
exir_ops.edge.aten.split_with_sizes_copy.default,
533532
exir_ops.edge.aten.split.Tensor,
534533
exir_ops.edge.aten.repeat.default,
@@ -562,6 +561,8 @@ def register_ported_op(features: OpFeatures):
562561
exir_ops.edge.aten.squeeze_copy.dims,
563562
exir_ops.edge.aten.unsqueeze_copy.default,
564563
exir_ops.edge.aten.permute_copy.default,
564+
# Tensor combination
565+
exir_ops.edge.aten.cat.default,
565566
]
566567
)
567568
def register_ported_op_all_packed_dims(features: OpFeatures):

backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl

+14-5
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
1919

2020
layout(push_constant) uniform restrict Block {
2121
ivec3 range;
22-
ivec3 src_offset;
23-
ivec3 dst_offset;
22+
// xyz is source offset w is channel size
23+
ivec4 src_offset;
24+
// xyz is destination offset w is channel size
25+
ivec4 dst_offset;
2426
};
2527

2628
#include "indexing_utils.h"
@@ -36,13 +38,20 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
3638
void main() {
3739
const ivec3 pos = ivec3(gl_GlobalInvocationID);
3840

39-
const ivec3 out_pos = pos + dst_offset;
40-
const ivec3 in_pos = pos + src_offset;
41-
4241
if (any(greaterThanEqual(pos, range))) {
4342
return;
4443
}
4544

45+
const ivec3 in_pos = pos + src_offset.xyz;
46+
ivec3 out_pos = pos + dst_offset.xyz;
47+
48+
// If a source channel size is specified, compose output z from the channel and batch indices
49+
if (src_offset.w > 0) {
50+
const int channel_index = in_pos.z % src_offset.w;
51+
const int batch_index = in_pos.z / src_offset.w;
52+
out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w;
53+
}
54+
4655
write_texel_lpos(
4756
t_out,
4857
out_pos,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

layout(push_constant) uniform restrict Block {
  ivec4 range;
  // xyz: source offset; w: source channel size
  ivec4 src_offset;
  // xyz: destination offset; w: destination channel size
  ivec4 dst_offset;
};

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
const lowp int packed_dim = unhash_packed_dim(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

// Copies a 3D range of texels from t_in into t_out at dst_offset, handling a
// destination offset along the packed dimension that is not texel (4-element)
// aligned by blending lanes from adjacent input texels and existing output.
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (any(greaterThanEqual(pos, range.xyz))) {
    return;
  }

  // Lane (0..3) within the destination texel where writing starts.
  const int lane_shift = dst_offset[packed_dim] & 0x3;

  // Texel being copied from the input tensor.
  const ivec3 in_pos = pos + src_offset.xyz;
  const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);

  // Destination texel position. Along the packed dim the offset is expressed
  // in elements, so convert it to texel units (divide by 4).
  ivec3 out_pos = pos + dst_offset.xyz;
  out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);

  VEC4_T out_value;

  if (lane_shift == 0) {
    // Texel-aligned copy: the input texel maps 1:1 onto the output texel.
    out_value = in_value;
  } else {
    // Unaligned copy: each output texel mixes lanes from two source texels.
    if (pos[packed_dim] > 0) {
      // Leading lanes come from the tail of the previous input texel along
      // the packed dimension.
      ivec3 prev_in_pos = in_pos;
      prev_in_pos[packed_dim] = in_pos[packed_dim] - 1;
      const VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map);

      // shift 1 -> take the last lane of the previous texel,
      // shift 2 -> the last two lanes, shift 3 -> the last three.
      if (lane_shift == 1) {
        out_value.x = prev_value.w;
      } else if (lane_shift == 2) {
        out_value.xy = prev_value.zw;
      } else {
        out_value.xyz = prev_value.yzw;
      }
    } else {
      // First texel of the range: preserve the lanes already present in the
      // existing output texel.
      out_value = load_texel_lpos(existing_out, out_pos, out_axis_map);
    }

    // Remaining lanes are filled from the start of the current input texel:
    // shift 1 -> first three lanes, shift 2 -> first two, shift 3 -> first one.
    if (lane_shift == 1) {
      out_value.yzw = in_value.xyz;
    } else if (lane_shift == 2) {
      out_value.zw = in_value.xy;
    } else {
      out_value.w = in_value.x;
    }
  }

  write_texel_lpos(t_out, out_pos, out_value, out_axis_map);
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
# Shader codegen spec for the copy_packed_dim_offset compute shader.
copy_packed_dim_offset:
  parameter_names_with_default_values:
    DTYPE: float
    NDIM: 3
    STORAGE: texture3d
  # Generate one variant per supported tensor dtype.
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int
  shader_variants:
    - NAME: copy_packed_dim_offset

backends/vulkan/runtime/graph/ops/impl/Cat.cpp

+50-47
Original file line numberDiff line numberDiff line change
@@ -22,65 +22,68 @@ void add_cat_default_node(
2222
ValueRef dim_ref,
2323
ValueRef out) {
2424
ValueListPtr input_list = graph.get_value_list(in_list_ref);
25-
26-
for (ValueRef input_ref : *input_list) {
27-
vTensorPtr t_in = graph.get_tensor(input_ref);
28-
VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
29-
}
30-
3125
int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
3226
vTensorPtr t_out = graph.get_tensor(out);
3327

28+
const auto packed_dim = t_out->packed_dim();
29+
const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);
30+
3431
DimIndex dim_index = normalize_to_dim_index(*t_out, dim);
32+
// Index of dimension to be concatenated in (w, h, c * b) coordinate system
33+
const auto dim_xyz_index = std::min(2, -dim_index - 1);
3534

36-
// TODO: Find ways to factor out the similar code for width, height, and batch
37-
if (dim_index == kWidth4D) {
38-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
39-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
35+
if (dim_index > kWidth4D || dim_index < kBatch4D) {
36+
VK_THROW("Unexpected value of dim_index=", dim_index);
37+
}
4038

41-
for (ValueRef input_ref : *input_list) {
42-
vTensorPtr t_in = graph.get_tensor(input_ref);
43-
utils::ivec3 range = t_in->logical_limits();
44-
add_copy_offset_node(
45-
graph, input_ref, range, src_offset, dst_offset, out);
46-
dst_offset[0] += range[0];
47-
}
39+
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
40+
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
4841

49-
} else if (dim_index == kHeight4D) {
50-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
51-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
42+
const bool is_concat_channel = (dim_index == kChannel4D);
5243

53-
for (ValueRef input_ref : *input_list) {
54-
vTensorPtr t_in = graph.get_tensor(input_ref);
55-
utils::ivec3 range = t_in->logical_limits();
56-
add_copy_offset_node(
57-
graph, input_ref, range, src_offset, dst_offset, out);
58-
dst_offset[1] += range[1];
59-
}
60-
} else if (dim_index == kBatch4D) {
61-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
62-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
44+
// if concatenating channels
45+
if (is_concat_channel) {
46+
// set destination offset w as channel size of the output tensor
47+
dst_offset[3] = dim_at(t_out->sizes(), kChannel4D);
48+
}
6349

64-
for (ValueRef input_ref : *input_list) {
65-
vTensorPtr t_in = graph.get_tensor(input_ref);
66-
utils::ivec3 range = t_in->logical_limits();
50+
for (ValueRef input_ref : *input_list) {
51+
const vTensorPtr t_in = graph.get_tensor(input_ref);
52+
const utils::ivec3 range = t_in->logical_limits();
53+
const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D);
54+
// if concatenating same dimension as the packed dimension
55+
if (dim_index == packed_dim_index) {
56+
// if concatenating channels, use add_copy_channel_offset_node function as
57+
// add_copy_packed_dim_offset_node does not support channel packing
58+
if (is_concat_channel) {
59+
add_copy_channel_offset_node(
60+
graph,
61+
input_ref,
62+
in_channel_size,
63+
src_offset[2],
64+
dst_offset[2],
65+
out);
66+
dst_offset[dim_xyz_index] += in_channel_size;
67+
} else {
68+
// src_offset[3] is not used now but will be used in the future when
69+
// add_copy_packed_dim_offset_node will support channel packing
70+
//
71+
// set source offset w as channel size of the output tensor if
72+
// concatenating channels
73+
src_offset[3] = is_concat_channel ? in_channel_size : 0;
74+
add_copy_packed_dim_offset_node(
75+
graph, input_ref, range, src_offset, dst_offset, out);
76+
dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index);
77+
}
78+
} else {
79+
// set source offset w as channel size of the output tensor if
80+
// concatenating channels
81+
src_offset[3] = is_concat_channel ? in_channel_size : 0;
6782
add_copy_offset_node(
6883
graph, input_ref, range, src_offset, dst_offset, out);
69-
dst_offset[2] += range[2];
84+
dst_offset[dim_xyz_index] +=
85+
is_concat_channel ? in_channel_size : range[dim_xyz_index];
7086
}
71-
} else if (dim_index == kChannel4D) {
72-
int32_t src_offset = 0;
73-
int32_t dst_offset = 0;
74-
75-
for (ValueRef input_ref : *input_list) {
76-
vTensorPtr t_in = graph.get_tensor(input_ref);
77-
int32_t range = dim_at(t_in->sizes(), kChannel4D);
78-
add_copy_channel_offset_node(
79-
graph, input_ref, range, src_offset, dst_offset, out);
80-
dst_offset += range;
81-
}
82-
} else {
83-
VK_THROW("Unexpected value of dim_index=", dim_index);
8487
}
8588
}
8689

0 commit comments

Comments
 (0)