Skip to content

Commit a320a57

Browse files
committed
[ET-VK] Adding all tensor packing support to cat op.
Pull Request resolved: #9331. This diff updates the ExecuTorch Vulkan backend's cat operation to support width-, height- and channel-packed tensors. It also updates the op_registry.py file to indicate that the cat operation supports all packed dimensions, and adds new test cases to the cases.py file to exercise the operation. ghstack-source-id: 272554191 @exported-using-ghexport Differential Revision: [D71230768](https://our.internmc.facebook.com/intern/diff/D71230768/)
1 parent 7b185eb commit a320a57

File tree

10 files changed

+311
-78
lines changed

10 files changed

+311
-78
lines changed

backends/vulkan/op_registry.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,6 @@ def register_view_op(features: OpFeatures):
528528
exir_ops.edge.aten.index_select.default,
529529
exir_ops.edge.aten.select_copy.int,
530530
# Tensor combination
531-
exir_ops.edge.aten.cat.default,
532531
exir_ops.edge.aten.split_with_sizes_copy.default,
533532
exir_ops.edge.aten.split.Tensor,
534533
exir_ops.edge.aten.repeat.default,
@@ -562,6 +561,8 @@ def register_ported_op(features: OpFeatures):
562561
exir_ops.edge.aten.squeeze_copy.dims,
563562
exir_ops.edge.aten.unsqueeze_copy.default,
564563
exir_ops.edge.aten.permute_copy.default,
564+
# Tensor combination
565+
exir_ops.edge.aten.cat.default,
565566
]
566567
)
567568
def register_ported_op_all_packed_dims(features: OpFeatures):

backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl

+14-5
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
1919

2020
layout(push_constant) uniform restrict Block {
2121
ivec3 range;
22-
ivec3 src_offset;
23-
ivec3 dst_offset;
22+
// xyz is source offset w is channel size
23+
ivec4 src_offset;
24+
// xyz is destination offset w is channel size
25+
ivec4 dst_offset;
2426
};
2527

2628
#include "indexing_utils.h"
@@ -36,13 +38,20 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
3638
void main() {
3739
const ivec3 pos = ivec3(gl_GlobalInvocationID);
3840

39-
const ivec3 out_pos = pos + dst_offset;
40-
const ivec3 in_pos = pos + src_offset;
41-
4241
if (any(greaterThanEqual(pos, range))) {
4342
return;
4443
}
4544

45+
const ivec3 in_pos = pos + src_offset.xyz;
46+
ivec3 out_pos = pos + dst_offset.xyz;
47+
48+
// If a source channel size is specified, compose output z from the channel and batch indices
49+
if (src_offset.w > 0) {
50+
const int channel_index = in_pos.z % src_offset.w;
51+
const int batch_index = in_pos.z / src_offset.w;
52+
out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w;
53+
}
54+
4655
write_texel_lpos(
4756
t_out,
4857
out_pos,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

layout(push_constant) uniform restrict Block {
  ivec4 range;
  // xyz: source offset; w: source channel size
  ivec4 src_offset;
  // xyz: destination offset; w: destination channel size
  ivec4 dst_offset;
};

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
const lowp int packed_dim = unhash_packed_dim(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

// Copies a 3D range of texels from t_in into t_out at dst_offset, handling a
// destination offset along the packed dimension that is not texel (4-element)
// aligned by blending lanes from adjacent input texels and existing output.
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (any(greaterThanEqual(pos, range.xyz))) {
    return;
  }

  // Lane (0..3) within the destination texel where writing starts.
  const int lane_shift = dst_offset[packed_dim] & 0x3;

  // Texel being copied from the input tensor.
  const ivec3 in_pos = pos + src_offset.xyz;
  const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);

  // Destination texel position. Along the packed dim the offset is expressed
  // in elements, so convert it to texel units (divide by 4).
  ivec3 out_pos = pos + dst_offset.xyz;
  out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);

  VEC4_T out_value;

  if (lane_shift == 0) {
    // Texel-aligned copy: the input texel maps 1:1 onto the output texel.
    out_value = in_value;
  } else {
    // Unaligned copy: each output texel mixes lanes from two source texels.
    if (pos[packed_dim] > 0) {
      // Leading lanes come from the tail of the previous input texel along
      // the packed dimension.
      ivec3 prev_in_pos = in_pos;
      prev_in_pos[packed_dim] = in_pos[packed_dim] - 1;
      const VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map);

      // shift 1 -> take the last lane of the previous texel,
      // shift 2 -> the last two lanes, shift 3 -> the last three.
      if (lane_shift == 1) {
        out_value.x = prev_value.w;
      } else if (lane_shift == 2) {
        out_value.xy = prev_value.zw;
      } else {
        out_value.xyz = prev_value.yzw;
      }
    } else {
      // First texel of the range: preserve the lanes already present in the
      // existing output texel.
      out_value = load_texel_lpos(existing_out, out_pos, out_axis_map);
    }

    // Remaining lanes are filled from the start of the current input texel:
    // shift 1 -> first three lanes, shift 2 -> first two, shift 3 -> first one.
    if (lane_shift == 1) {
      out_value.yzw = in_value.xyz;
    } else if (lane_shift == 2) {
      out_value.zw = in_value.xy;
    } else {
      out_value.w = in_value.x;
    }
  }

  write_texel_lpos(t_out, out_pos, out_value, out_axis_map);
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
# Shader codegen spec for the copy_packed_dim_offset compute shader.
copy_packed_dim_offset:
  parameter_names_with_default_values:
    DTYPE: float
    NDIM: 3
    STORAGE: texture3d
  # Generate one variant per supported tensor dtype.
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int
  shader_variants:
    - NAME: copy_packed_dim_offset

backends/vulkan/runtime/graph/ops/impl/Cat.cpp

+50-47
Original file line numberDiff line numberDiff line change
@@ -22,65 +22,68 @@ void add_cat_default_node(
2222
ValueRef dim_ref,
2323
ValueRef out) {
2424
ValueListPtr input_list = graph.get_value_list(in_list_ref);
25-
26-
for (ValueRef input_ref : *input_list) {
27-
vTensorPtr t_in = graph.get_tensor(input_ref);
28-
VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
29-
}
30-
3125
int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
3226
vTensorPtr t_out = graph.get_tensor(out);
3327

28+
const auto packed_dim = t_out->packed_dim();
29+
const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);
30+
3431
DimIndex dim_index = normalize_to_dim_index(*t_out, dim);
32+
// Index of dimension to be concatenated in (w, h, c * b) coordinate system
33+
const auto dim_xyz_index = std::min(2, -dim_index - 1);
3534

36-
// TODO: Find ways to factor out the similar code for width, height, and batch
37-
if (dim_index == kWidth4D) {
38-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
39-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
35+
if (dim_index > kWidth4D || dim_index < kBatch4D) {
36+
VK_THROW("Unexpected value of dim_index=", dim_index);
37+
}
4038

41-
for (ValueRef input_ref : *input_list) {
42-
vTensorPtr t_in = graph.get_tensor(input_ref);
43-
utils::ivec3 range = t_in->logical_limits();
44-
add_copy_offset_node(
45-
graph, input_ref, range, src_offset, dst_offset, out);
46-
dst_offset[0] += range[0];
47-
}
39+
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
40+
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
4841

49-
} else if (dim_index == kHeight4D) {
50-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
51-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
42+
const bool is_concat_channel = (dim_index == kChannel4D);
5243

53-
for (ValueRef input_ref : *input_list) {
54-
vTensorPtr t_in = graph.get_tensor(input_ref);
55-
utils::ivec3 range = t_in->logical_limits();
56-
add_copy_offset_node(
57-
graph, input_ref, range, src_offset, dst_offset, out);
58-
dst_offset[1] += range[1];
59-
}
60-
} else if (dim_index == kBatch4D) {
61-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
62-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
44+
// if concatenating channels
45+
if (is_concat_channel) {
46+
// set destination offset w as channel size of the output tensor
47+
dst_offset[3] = dim_at(t_out->sizes(), kChannel4D);
48+
}
6349

64-
for (ValueRef input_ref : *input_list) {
65-
vTensorPtr t_in = graph.get_tensor(input_ref);
66-
utils::ivec3 range = t_in->logical_limits();
50+
for (ValueRef input_ref : *input_list) {
51+
const vTensorPtr t_in = graph.get_tensor(input_ref);
52+
const utils::ivec3 range = t_in->logical_limits();
53+
const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D);
54+
// if concatenating same dimension as the packed dimension
55+
if (dim_index == packed_dim_index) {
56+
// if concatenating channels, use add_copy_channel_offset_node function as
57+
// add_copy_packed_dim_offset_node does not support channel packing
58+
if (is_concat_channel) {
59+
add_copy_channel_offset_node(
60+
graph,
61+
input_ref,
62+
in_channel_size,
63+
src_offset[2],
64+
dst_offset[2],
65+
out);
66+
dst_offset[dim_xyz_index] += in_channel_size;
67+
} else {
68+
// src_offset[3] is not used now but will be used in the future when
69+
// add_copy_packed_dim_offset_node will support channel packing
70+
//
71+
// set source offset w as channel size of the output tensor if
72+
// concatenating channels
73+
src_offset[3] = is_concat_channel ? in_channel_size : 0;
74+
add_copy_packed_dim_offset_node(
75+
graph, input_ref, range, src_offset, dst_offset, out);
76+
dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index);
77+
}
78+
} else {
79+
// set source offset w as channel size of the output tensor if
80+
// concatenating channels
81+
src_offset[3] = is_concat_channel ? in_channel_size : 0;
6782
add_copy_offset_node(
6883
graph, input_ref, range, src_offset, dst_offset, out);
69-
dst_offset[2] += range[2];
84+
dst_offset[dim_xyz_index] +=
85+
is_concat_channel ? in_channel_size : range[dim_xyz_index];
7086
}
71-
} else if (dim_index == kChannel4D) {
72-
int32_t src_offset = 0;
73-
int32_t dst_offset = 0;
74-
75-
for (ValueRef input_ref : *input_list) {
76-
vTensorPtr t_in = graph.get_tensor(input_ref);
77-
int32_t range = dim_at(t_in->sizes(), kChannel4D);
78-
add_copy_channel_offset_node(
79-
graph, input_ref, range, src_offset, dst_offset, out);
80-
dst_offset += range;
81-
}
82-
} else {
83-
VK_THROW("Unexpected value of dim_index=", dim_index);
8487
}
8588
}
8689

0 commit comments

Comments
 (0)