pytorch · facebook-github-bot · Mar 20, 2025 · Mar 17, 2025 · Mar 19, 2025
@@ -44,23 +44,57 @@ void main() {
     return;
   }
 
-  // Starting offset to write at within a texel
-  const int out_lane_offset = dst_offset[packed_dim] & 0x3;
-  const bool has_lane_offset = out_lane_offset != 0;
-
   // Position in input tensor
-  const ivec3 in_pos = pos + src_offset.xyz;
+  ivec3 in_pos = pos + src_offset.xyz;
+  in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
 
   // Read input value mapping to this output texel
-  const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);
+  VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);
+
+  // Starting offset to read from a texel
+  const int src_lane_offset = src_offset[packed_dim] & 0x3;
+  const bool has_src_lane_offset = src_lane_offset != 0;
+
+  // If the input lane offset is non-zero, i.e. the packed texel is composed from multiple sources
+  if (has_src_lane_offset) {
+    // Boundary values will come from next input texel in the packed dim.
+    ivec3 next_in_pos = in_pos;
+    next_in_pos[packed_dim] = in_pos[packed_dim] + 1;
+    VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map);
+
+    // Keep input values from the end of the current input texel, based on src_lane_offset:
+    // offset 1 means the first lane of the current input texel is not part of the output texel,
+    // offset 2 means the first 2 lanes are not, and so on.
+    if (src_lane_offset == 1) {
+      in_value.xyz = in_value.yzw;
+    } else if (src_lane_offset == 2) {
+      in_value.xy = in_value.zw;
+    } else {
+      in_value.x = in_value.w;
+    }
+    // Copy the next texel's values towards the end of the input texel, based on the lane offset:
+    // offset 1 means the first lane from the next texel is part of the input texel,
+    // offset 2 means the first 2 lanes from the next texel are part of the input texel, and so on.
+    if (src_lane_offset == 1) {
+      in_value.w = next_value.x;
+    } else if (src_lane_offset == 2) {
+      in_value.zw = next_value.xy;
+    } else {
+      in_value.yzw = next_value.xyz;
+    }
+  }
+
+  // Starting offset to write at within a texel
+  const int out_lane_offset = dst_offset[packed_dim] & 0x3;
+  const bool has_dst_lane_offset = out_lane_offset != 0;
 
   ivec3 out_pos = pos + dst_offset.xyz;
   out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);
 
   VEC4_T out_value;
 
   // If lane offset is non zero i.e packed texel is composed from multiple sources
-  if (has_lane_offset) {
+  if (has_dst_lane_offset) {
     // When position in packed dim is > 0
     if (pos[packed_dim] > 0) {
       // Boundary values will come from previous input texel in the packed dim.

@@ -92,19 +92,37 @@ void add_copy_packed_dim_offset_node(
   ivec4 final_range = {
       range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
   ivec3 global_wg_size = t_out->logical_limits();
+  // The starting offset in a texel where this tensor will start copying from
+  const auto src_lane_offset = src_offset[packed_dim] & 0x3;
   // The starting offset in a texel where this tensor will start copying to
   const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
+
+  // The total packed texels this tensor will be copied from.
+  // The first texel of tensor data in the packed dimension will be copied
+  // from the remaining lanes of the current source. Hence
+  // (4 - src_lane_offset) is added to the tensor size in the packed dimension.
+  const auto src_packed_size = utils::div_up_4(
+      (4 - src_lane_offset) +
+      dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
+
   // The total packed texels this tensor will be copied to
-  // The first texel of tensor data in packed dimension will be copied to remain
-  // lanes from previous write Hence (4 - dst_lane_offset) is added to tensor
-  // size in packed dimension
+  // The first texel of tensor data in the packed dimension will be copied to
+  // the remaining lanes from the previous write. Hence (4 - dst_lane_offset)
+  // is added to the tensor size in the packed dimension.
   const auto dst_packed_size = utils::div_up_4(
       (4 - dst_lane_offset) +
       dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
 
-  // If the starting offset is not 0, and the total packed texels is greater
+  // If the starting src offset is not 0, and the total packed texels is greater
   // than the source texel range
-  if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) {
+  const bool has_additional_src_work =
+      src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
+  // If the starting dst offset is not 0, and the total packed texels is greater
+  // than the source texel range
+  const bool has_additional_dst_work =
+      dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
+
+  if (has_additional_src_work || has_additional_dst_work) {
     global_wg_size[packed_dim]++; // Increase the global work group size in
                                   // packed dimension
     final_range[packed_dim]++; // Increase the range in packed dimension