Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ET-VK] Adding all tensor packing support to cat op. #9331

Merged
merged 3 commits into from
Mar 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,6 @@ def register_view_op(features: OpFeatures):
exir_ops.edge.aten.index_select.default,
exir_ops.edge.aten.select_copy.int,
# Tensor combination
exir_ops.edge.aten.cat.default,
exir_ops.edge.aten.split_with_sizes_copy.default,
exir_ops.edge.aten.split.Tensor,
exir_ops.edge.aten.repeat.default,
Expand Down Expand Up @@ -562,6 +561,8 @@ def register_ported_op(features: OpFeatures):
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.permute_copy.default,
# Tensor combination
exir_ops.edge.aten.cat.default,
]
)
def register_ported_op_all_packed_dims(features: OpFeatures):
Expand Down
19 changes: 14 additions & 5 deletions backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

layout(push_constant) uniform restrict Block {
ivec3 range;
ivec3 src_offset;
ivec3 dst_offset;
// xyz is source offset w is channel size
ivec4 src_offset;
// xyz is destination offset w is channel size
ivec4 dst_offset;
};

#include "indexing_utils.h"
Expand All @@ -36,13 +38,20 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

const ivec3 out_pos = pos + dst_offset;
const ivec3 in_pos = pos + src_offset;

if (any(greaterThanEqual(pos, range))) {
return;
}

const ivec3 in_pos = pos + src_offset.xyz;
ivec3 out_pos = pos + dst_offset.xyz;

// If source channel size is specified compose output z based on channel and batch index
if (src_offset.w > 0) {
const int channel_index = in_pos.z % src_offset.w;
const int batch_index = in_pos.z / src_offset.w;
out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w;
}

write_texel_lpos(
t_out,
out_pos,
Expand Down
106 changes: 106 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

// Push constants describing the region to copy and where it is read from /
// written to.
layout(push_constant) uniform restrict Block {
// Extent of the copy; only range.xyz is read by this shader (bounds check in
// main()).
ivec4 range;
// xyz is source offset, w is channel size (only src_offset.xyz is read by
// this shader)
ivec4 src_offset;
// xyz is destination offset, w is channel size. The component selected by
// packed_dim is split in main() into a texel index (>> 2) and a lane index
// (& 0x3).
ivec4 dst_offset;
};

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
const lowp int packed_dim = unhash_packed_dim(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

/*
 * One invocation produces one output texel. It reads the texel of t_in at
 * pos + src_offset.xyz and writes a texel of t_out at the destination
 * position derived from dst_offset.
 *
 * The destination offset along the packed dim is split into a texel index
 * (dst_offset[packed_dim] >> 2) and a lane index (dst_offset[packed_dim] & 0x3).
 * When the lane index is non zero the output texel is assembled from two
 * sources: its leading lanes come from the previous input texel (or from
 * existing_out for the first texel along the packed dim), and its trailing
 * lanes come from the leading lanes of the current input texel.
 *
 * NOTE(review): the trailing lanes of the last input texel are only emitted
 * by the invocation whose pos[packed_dim] is one past that texel, so `range`
 * presumably has to cover one extra texel along the packed dim when the lane
 * offset is non zero - confirm against the host code that sets `range`.
 */
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

// Invocations outside the requested extent do nothing.
if (any(greaterThanEqual(pos, range.xyz))) {
return;
}

// Starting lane to write at within a texel (low two bits of the destination
// offset along the packed dim).
const int out_lane_offset = dst_offset[packed_dim] & 0x3;
const bool has_lane_offset = out_lane_offset != 0;

// Position in input tensor
const ivec3 in_pos = pos + src_offset.xyz;

// Read input value mapping to this output texel
const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);

// Position in output tensor. Along the packed dim only the texel part of the
// destination offset (offset >> 2) is added; the lane part is handled below.
ivec3 out_pos = pos + dst_offset.xyz;
out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);

VEC4_T out_value;

// If lane offset is non zero, i.e. the packed texel is composed from multiple
// sources
if (has_lane_offset) {
// When position in packed dim is > 0
if (pos[packed_dim] > 0) {
// The leading lanes come from the previous input texel in the packed dim.
ivec3 prev_in_pos = in_pos;
prev_in_pos[packed_dim] = in_pos[packed_dim] - 1;
VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map);

// Shift values toward the beginning based on out_lane_offset:
// offset 1 means the last lane of the previous texel becomes lane 0,
// offset 2 means its last 2 lanes become lanes 0-1,
// offset 3 means its last 3 lanes become lanes 0-2.
if (out_lane_offset == 1) {
out_value.x = prev_value.w;
} else if (out_lane_offset == 2) {
out_value.xy = prev_value.zw;
} else {
out_value.xyz = prev_value.yzw;
}
} else {
// When position in packed dim is == 0
// Start from the texel read through existing_out at the same output
// position, so the lanes before out_lane_offset keep that value; the
// trailing lanes are overwritten below.
out_value = load_texel_lpos(existing_out, out_pos, out_axis_map);
}

// Copy the leading lanes of the current input texel into the trailing lanes
// of the output texel, starting at out_lane_offset:
// offset 1 means the first 3 input lanes fill lanes 1-3,
// offset 2 means the first 2 input lanes fill lanes 2-3,
// offset 3 means the first input lane fills lane 3.
if (out_lane_offset == 1) {
out_value.yzw = in_value.xyz;
} else if (out_lane_offset == 2) {
out_value.zw = in_value.xy;
} else {
out_value.w = in_value.x;
}
} else {
// Texel aligned destination: the input texel is copied unchanged.
out_value = in_value;
}

write_texel_lpos(
t_out,
out_pos,
out_value,
out_axis_map);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
copy_packed_dim_offset:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
STORAGE: texture3d
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
- VALUE: int
shader_variants:
- NAME: copy_packed_dim_offset
97 changes: 50 additions & 47 deletions backends/vulkan/runtime/graph/ops/impl/Cat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,65 +22,68 @@ void add_cat_default_node(
ValueRef dim_ref,
ValueRef out) {
ValueListPtr input_list = graph.get_value_list(in_list_ref);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
}

int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
vTensorPtr t_out = graph.get_tensor(out);

const auto packed_dim = t_out->packed_dim();
const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);

DimIndex dim_index = normalize_to_dim_index(*t_out, dim);
// Index of dimension to be concatenated in (w, h, c * b) coordinate system
const auto dim_xyz_index = std::min(2, -dim_index - 1);

// TODO: Find ways to factor out the similar code for width, height, and batch
if (dim_index == kWidth4D) {
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
if (dim_index > kWidth4D || dim_index < kBatch4D) {
VK_THROW("Unexpected value of dim_index=", dim_index);
}

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
utils::ivec3 range = t_in->logical_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset[0] += range[0];
}
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);

} else if (dim_index == kHeight4D) {
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
const bool is_concat_channel = (dim_index == kChannel4D);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
utils::ivec3 range = t_in->logical_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset[1] += range[1];
}
} else if (dim_index == kBatch4D) {
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
// if concatenating channels
if (is_concat_channel) {
// set destination offset w as channel size of the output tensor
dst_offset[3] = dim_at(t_out->sizes(), kChannel4D);
}

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
utils::ivec3 range = t_in->logical_limits();
for (ValueRef input_ref : *input_list) {
const vTensorPtr t_in = graph.get_tensor(input_ref);
const utils::ivec3 range = t_in->logical_limits();
const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D);
// if concatenating same dimension as the packed dimension
if (dim_index == packed_dim_index) {
// if concatenating channels, use add_copy_channel_offset_node function as
// add_copy_packed_dim_offset_node does not support channel packing
if (is_concat_channel) {
add_copy_channel_offset_node(
graph,
input_ref,
in_channel_size,
src_offset[2],
dst_offset[2],
out);
dst_offset[dim_xyz_index] += in_channel_size;
} else {
// src_offset[3] is not used now but will be used in the future when
// add_copy_packed_dim_offset_node will support channel packing
//
// set source offset w as channel size of the input tensor if
// concatenating channels
src_offset[3] = is_concat_channel ? in_channel_size : 0;
add_copy_packed_dim_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index);
}
} else {
// set source offset w as channel size of the input tensor if
// concatenating channels
src_offset[3] = is_concat_channel ? in_channel_size : 0;
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset[2] += range[2];
dst_offset[dim_xyz_index] +=
is_concat_channel ? in_channel_size : range[dim_xyz_index];
}
} else if (dim_index == kChannel4D) {
int32_t src_offset = 0;
int32_t dst_offset = 0;

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
int32_t range = dim_at(t_in->sizes(), kChannel4D);
add_copy_channel_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset += range;
}
} else {
VK_THROW("Unexpected value of dim_index=", dim_index);
}
}

Expand Down
Loading
Loading