diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl index 02ea6405b4a..e0f09f0be43 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl @@ -44,15 +44,49 @@ void main() { return; } - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_lane_offset = out_lane_offset != 0; - // Position in input tensor - const ivec3 in_pos = pos + src_offset.xyz; + ivec3 in_pos = pos + src_offset.xyz; + in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); // Read input value mapping to this output texel - const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + + // Starting offset to read from a texel + const int src_lane_offset = src_offset[packed_dim] & 0x3; + const bool has_src_lane_offset = src_lane_offset != 0; + + // If input lane offset is non zero i.e packed texel is composed from multiple sources + if (has_src_lane_offset) { + // Boundary values will come from next input texel in the packed dim. + ivec3 next_in_pos = in_pos; + next_in_pos[packed_dim] = in_pos[packed_dim] + 1; + VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); + + // Keep input values from the end of current input pixel based on src_lane_offset + // offset 1 means the first lane of current input texel is not a part of the output texel + // offset 2 means first 2 lanes are not and so on + if (src_lane_offset == 1) { + in_value.xyz = in_value.yzw; + } else if (src_lane_offset == 2) { + in_value.xy = in_value.zw; + } else { + in_value.x = in_value.w; + } + // Copy next texel's values towards the end of input texel, based on lane offset + // offset 1 means the first lane from next texel is part of the input texel + // offset 2 means first 2 lanes from next texel is part of the input texel and so on + if (src_lane_offset == 1) { + in_value.w = next_value.x; + } else if (src_lane_offset == 2) { + in_value.zw = next_value.xy; + } else { + in_value.yzw = next_value.xyz; + } + } + + // Starting offset to write at within a texel + const int out_lane_offset = dst_offset[packed_dim] & 0x3; + const bool has_dst_lane_offset = out_lane_offset != 0; ivec3 out_pos = pos + dst_offset.xyz; out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); @@ -60,7 +94,7 @@ void main() { VEC4_T out_value; // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_lane_offset) { + if (has_dst_lane_offset) { // When position in packed dim is > 0 if (pos[packed_dim] > 0) { // Boundary values will come from previous input texel in the packed dim. diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 2ecc7400d3e..5756d3a9052 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -92,19 +92,37 @@ void add_copy_packed_dim_offset_node( ivec4 final_range = { range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)}; ivec3 global_wg_size = t_out->logical_limits(); + // The starting offset in a texel where this tensor will start copying from + const auto src_lane_offset = src_offset[packed_dim] & 0x3; // The starting offset in a texel where this tensor will start copying to const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; + + // The total packed texels this tensor will be copied from + // The first texel of tensor data in packed dimension will be copied from + // remaining lanes from current source Hence (4 - src_lane_offset) is added + // to tensor size in packed dimension + const auto src_packed_size = utils::div_up_4( + (4 - src_lane_offset) + + dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim))); + // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to remain - // lanes from previous write Hence (4 - dst_lane_offset) is added to tensor - // size in packed dimension + // The first texel of tensor data in packed dimension will be copied to + // remaining lanes from previous write Hence (4 - dst_lane_offset) is added to + // tensor size in packed dimension const auto dst_packed_size = utils::div_up_4( (4 - dst_lane_offset) + dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim))); - // If the starting offset is not 0, and the total packed texels is greater + // If the starting src offset is not 0, and the total packed texels is greater // than the source texel range - if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) { + const bool has_additional_src_work = + src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; + // If the starting dst offset is not 0, and the total packed texels is greater + // than the source texel range + const bool has_additional_dst_work = + dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; + + if (has_additional_src_work || has_additional_dst_work) { global_wg_size[packed_dim]++; // Increase the global work group size in // packed dimension final_range[packed_dim]++; // Increase the range in packed dimension