Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ET-VK] Adding source_offset processing to copy_packed_dim_offset function. #9344

Merged
merged 2 commits into from
Mar 20, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -44,23 +44,57 @@ void main() {
return;
}

// Starting offset to write at within a texel
const int out_lane_offset = dst_offset[packed_dim] & 0x3;
const bool has_lane_offset = out_lane_offset != 0;

// Position in input tensor
const ivec3 in_pos = pos + src_offset.xyz;
ivec3 in_pos = pos + src_offset.xyz;
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);

// Read input value mapping to this output texel
const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);
VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);

// Starting offset to read from a texel
const int src_lane_offset = src_offset[packed_dim] & 0x3;
const bool has_src_lane_offset = src_lane_offset != 0;

// If the input lane offset is non-zero, i.e. the packed texel is composed from multiple sources
if (has_src_lane_offset) {
// Boundary values will come from next input texel in the packed dim.
ivec3 next_in_pos = in_pos;
next_in_pos[packed_dim] = in_pos[packed_dim] + 1;
VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map);

// Keep input values from the end of the current input texel, based on src_lane_offset:
// offset 1 means the first lane of the current input texel is not a part of the output texel,
// offset 2 means the first 2 lanes are not, and so on.
if (src_lane_offset == 1) {
in_value.xyz = in_value.yzw;
} else if (src_lane_offset == 2) {
in_value.xy = in_value.zw;
} else {
in_value.x = in_value.w;
}
// Copy the next texel's values towards the end of the input texel, based on the lane offset:
// offset 1 means the first lane from the next texel is part of the input texel,
// offset 2 means the first 2 lanes from the next texel are part of the input texel, and so on.
if (src_lane_offset == 1) {
in_value.w = next_value.x;
} else if (src_lane_offset == 2) {
in_value.zw = next_value.xy;
} else {
in_value.yzw = next_value.xyz;
}
}

// Starting offset to write at within a texel
const int out_lane_offset = dst_offset[packed_dim] & 0x3;
const bool has_dst_lane_offset = out_lane_offset != 0;

ivec3 out_pos = pos + dst_offset.xyz;
out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);

VEC4_T out_value;

// If lane offset is non zero i.e packed texel is composed from multiple sources
if (has_lane_offset) {
if (has_dst_lane_offset) {
// When position in packed dim is > 0
if (pos[packed_dim] > 0) {
// Boundary values will come from previous input texel in the packed dim.
28 changes: 23 additions & 5 deletions backends/vulkan/runtime/graph/ops/impl/Copy.cpp
Original file line number Diff line number Diff line change
@@ -92,19 +92,37 @@ void add_copy_packed_dim_offset_node(
ivec4 final_range = {
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
ivec3 global_wg_size = t_out->logical_limits();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total number of packed texels this tensor will be copied from.
// The first texel of tensor data in the packed dimension will be copied from
// the remaining lanes of the current source. Hence (4 - src_lane_offset) is
// added to the tensor size in the packed dimension.
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to remain
// lanes from previous write Hence (4 - dst_lane_offset) is added to tensor
// size in packed dimension
// The first texel of tensor data in the packed dimension will be copied to the
// remaining lanes from the previous write. Hence (4 - dst_lane_offset) is added
// to the tensor size in the packed dimension.
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting offset is not 0, and the total packed texels is greater
// If the starting src offset is not 0, and the total packed texels is greater
// than the source texel range
if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) {
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is greater
// than the source texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension