Skip to content

Commit d116122

Browse files
authored
[GraphBolt][CUDA] Cooperative Minibatching [1] - UniqueAndCompact. (#7765)
1 parent 9a86a66 commit d116122

File tree

11 files changed

+207
-110
lines changed

11 files changed

+207
-110
lines changed

graphbolt/include/graphbolt/cuda_ops.h

+13-2
Original file line numberDiff line numberDiff line change
@@ -274,10 +274,19 @@ torch::Tensor IndptrEdgeIdsImpl(
274274
* 2. Compact Operation: Utilizes the reverse mapping derived from the unique
275275
* operation to transform 'src_ids' and 'dst_ids' into compacted IDs.
276276
*
277+
* When world_size is greater than 1, then the given ids are partitioned between
278+
* the available ranks. The ids corresponding to the given rank are guaranteed
279+
* to come before the ids of other ranks. To do this, the partition ids are
280+
* rotated backwards by the given rank so that the ids are ordered as:
281+
* [rank, rank + 1, ..., world_size - 1, 0, ..., rank - 1]. Supported only on
282+
* Volta and later generation NVIDIA GPUs.
283+
*
277284
* @param src_ids A tensor containing source IDs.
278285
* @param dst_ids A tensor containing destination IDs.
279286
* @param unique_dst_ids A tensor containing unique destination IDs, which is
280287
* exactly all the unique elements in 'dst_ids'.
288+
* @param rank The rank of the current GPU.
289+
* @param world_size The total number of GPUs (the world size).
281290
*
282291
* @return
283292
* - A tensor representing all unique elements in 'src_ids' and 'dst_ids' after
@@ -299,7 +308,8 @@ torch::Tensor IndptrEdgeIdsImpl(
299308
*/
300309
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
301310
const torch::Tensor src_ids, const torch::Tensor dst_ids,
302-
const torch::Tensor unique_dst_ids);
311+
const torch::Tensor unique_dst_ids, const int64_t rank,
312+
const int64_t world_size);
303313

304314
/**
305315
* @brief Batched version of UniqueAndCompact. The ith element of the return
@@ -310,7 +320,8 @@ std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
310320
UniqueAndCompactBatched(
311321
const std::vector<torch::Tensor>& src_ids,
312322
const std::vector<torch::Tensor>& dst_ids,
313-
const std::vector<torch::Tensor>& unique_dst_ids);
323+
const std::vector<torch::Tensor>& unique_dst_ids, const int64_t rank,
324+
const int64_t world_size);
314325

315326
} // namespace ops
316327
} // namespace graphbolt

graphbolt/include/graphbolt/unique_and_compact.h

+15-3
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,19 @@ namespace sampling {
2424
* 2. Compact Operation: Utilizes the reverse mapping derived from the unique
2525
* operation to transform 'src_ids' and 'dst_ids' into compacted IDs.
2626
*
27+
* When world_size is greater than 1, then the given ids are partitioned between
28+
* the available ranks. The ids corresponding to the given rank are guaranteed
29+
* to come before the ids of other ranks. To do this, the partition ids are
30+
* rotated backwards by the given rank so that the ids are ordered as:
31+
* [rank, rank + 1, ..., world_size - 1, 0, ..., rank - 1]. Supported only on
32+
* Volta and later generation NVIDIA GPUs.
33+
*
2734
* @param src_ids A tensor containing source IDs.
2835
* @param dst_ids A tensor containing destination IDs.
2936
* @param unique_dst_ids A tensor containing unique destination IDs, which is
3037
* exactly all the unique elements in 'dst_ids'.
38+
* @param rank The rank of the current GPU.
39+
* @param world_size The total number of GPUs (the world size).
3140
*
3241
* @return
3342
* - A tensor representing all unique elements in 'src_ids' and 'dst_ids' after
@@ -49,20 +58,23 @@ namespace sampling {
4958
*/
5059
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
5160
const torch::Tensor& src_ids, const torch::Tensor& dst_ids,
52-
const torch::Tensor unique_dst_ids);
61+
const torch::Tensor unique_dst_ids, const int64_t rank,
62+
const int64_t world_size);
5363

5464
std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
5565
UniqueAndCompactBatched(
5666
const std::vector<torch::Tensor>& src_ids,
5767
const std::vector<torch::Tensor>& dst_ids,
58-
const std::vector<torch::Tensor> unique_dst_ids);
68+
const std::vector<torch::Tensor> unique_dst_ids, const int64_t rank,
69+
const int64_t world_size);
5970

6071
c10::intrusive_ptr<Future<
6172
std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>>>
6273
UniqueAndCompactBatchedAsync(
6374
const std::vector<torch::Tensor>& src_ids,
6475
const std::vector<torch::Tensor>& dst_ids,
65-
const std::vector<torch::Tensor> unique_dst_ids);
76+
const std::vector<torch::Tensor> unique_dst_ids, const int64_t rank,
77+
const int64_t world_size);
6678

6779
} // namespace sampling
6880
} // namespace graphbolt

graphbolt/src/cuda/extension/gpu_graph_cache.cu

+3-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <cub/cub.cuh>
2626
#include <cuco/static_map.cuh>
2727
#include <cuda/std/atomic>
28+
#include <cuda/stream_ref>
2829
#include <limits>
2930
#include <numeric>
3031
#include <type_traits>
@@ -138,7 +139,7 @@ GpuGraphCache::GpuGraphCache(
138139
{},
139140
{},
140141
allocator_t<index_t>{},
141-
cuco::cuda_stream_ref{cuda::GetCurrentStream()}};
142+
::cuda::stream_ref{cuda::GetCurrentStream()}};
142143
map_ = new map_t<index_t>{std::move(map_temp)};
143144
}));
144145
C10_CUDA_KERNEL_LAUNCH_CHECK(); // Check the map constructor's success.
@@ -185,7 +186,7 @@ std::tuple<torch::Tensor, torch::Tensor, int64_t, int64_t> GpuGraphCache::Query(
185186
map_size_ + seeds.size(0) >= map->capacity() * kDoubleLoadFactor)) {
186187
map->rehash_async(
187188
map->capacity() * kIntGrowthFactor,
188-
cuco::cuda_stream_ref{cuda::GetCurrentStream()});
189+
::cuda::stream_ref{cuda::GetCurrentStream()});
189190
}
190191
auto positions = torch::empty_like(seeds);
191192
CUDA_KERNEL_CALL(

graphbolt/src/cuda/extension/unique_and_compact.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> >
3232
UniqueAndCompactBatchedHashMapBased(
3333
const std::vector<torch::Tensor>& src_ids,
3434
const std::vector<torch::Tensor>& dst_ids,
35-
const std::vector<torch::Tensor>& unique_dst_ids);
35+
const std::vector<torch::Tensor>& unique_dst_ids, const int64_t rank,
36+
const int64_t world_size);
3637

3738
} // namespace ops
3839
} // namespace graphbolt

0 commit comments

Comments
 (0)