Skip to content

Commit 32b11c9

Browse files
authored
[GraphBolt][CUDA] Expose RankSort to python, reorganize and test. (#7776)
1 parent f71427f commit 32b11c9

6 files changed

+125
-28
lines changed

graphbolt/src/cuda/cooperative_minibatching_utils.cu

+5-2
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
* @brief Cooperative Minibatching (arXiv:2310.12403) utility function
1919
* implementations in CUDA.
2020
*/
21+
#include <graphbolt/cuda_ops.h>
2122
#include <thrust/transform.h>
2223

2324
#include <cub/cub.cuh>
2425
#include <cuda/functional>
2526

2627
#include "./common.h"
28+
#include "./cooperative_minibatching_utils.cuh"
2729
#include "./cooperative_minibatching_utils.h"
2830
#include "./utils.h"
2931

@@ -60,7 +62,8 @@ RankSortImpl(
6062
auto part_ids2 = part_ids.clone();
6163
auto part_ids2_sorted = torch::empty_like(part_ids2);
6264
auto nodes_sorted = torch::empty_like(nodes);
63-
auto index = torch::arange(nodes.numel(), nodes.options());
65+
auto index = ops::IndptrEdgeIdsImpl(
66+
offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel());
6467
auto index_sorted = torch::empty_like(index);
6568
return AT_DISPATCH_INDEX_TYPES(
6669
nodes.scalar_type(), "RankSortImpl", ([&] {
@@ -103,7 +106,7 @@ RankSortImpl(
103106
}
104107

105108
std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> RankSort(
106-
std::vector<torch::Tensor>& nodes_list, const int64_t rank,
109+
const std::vector<torch::Tensor>& nodes_list, const int64_t rank,
107110
const int64_t world_size) {
108111
const auto num_batches = nodes_list.size();
109112
auto nodes = torch::cat(nodes_list, 0);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/**
2+
* Copyright (c) 2024, mfbalin (Muhammed Fatih Balin)
3+
* All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
* @file cuda/cooperative_minibatching_utils.cuh
18+
* @brief Cooperative Minibatching (arXiv:2310.12403) utility device functions
19+
* in CUDA.
20+
*/
21+
#ifndef GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_CUH_
22+
#define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_CUH_
23+
24+
#include <curand_kernel.h>
25+
26+
namespace graphbolt {
27+
namespace cuda {
28+
29+
using part_t = uint8_t;
30+
constexpr auto kPartDType = torch::kUInt8;
31+
32+
/**
33+
* @brief Given a vertex id, the rank of current GPU and the world size, returns
34+
* the rank that this id belongs in a deterministic manner.
35+
*
36+
* @param id The node id that will mapped to a rank in [0, world_size).
37+
* @param rank The rank of the current GPU.
38+
* @param world_size The world size, the total number of cooperating GPUs.
39+
*
40+
* @return The rank of the GPU the given id is mapped to.
41+
*/
42+
template <typename index_t>
43+
__device__ inline auto rank_assignment(
44+
index_t id, uint32_t rank, uint32_t world_size) {
45+
// Consider using a faster implementation in the future.
46+
constexpr uint64_t kCurandSeed = 999961; // Any random number.
47+
curandStatePhilox4_32_10_t rng;
48+
curand_init(kCurandSeed, 0, id, &rng);
49+
return (curand(&rng) - rank) % world_size;
50+
}
51+
52+
} // namespace cuda
53+
} // namespace graphbolt
54+
55+
#endif // GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_CUH_

graphbolt/src/cuda/cooperative_minibatching_utils.h

+3-25
Original file line numberDiff line numberDiff line change
@@ -21,35 +21,12 @@
2121
#ifndef GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_
2222
#define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_
2323

24-
#include <curand_kernel.h>
24+
#include <ATen/cuda/CUDAEvent.h>
2525
#include <torch/script.h>
2626

2727
namespace graphbolt {
2828
namespace cuda {
2929

30-
using part_t = uint8_t;
31-
constexpr auto kPartDType = torch::kUInt8;
32-
33-
/**
34-
* @brief Given a vertex id, the rank of current GPU and the world size, returns
35-
* the rank that this id belongs in a deterministic manner.
36-
*
37-
* @param id The node id that will mapped to a rank in [0, world_size).
38-
* @param rank The rank of the current GPU.
39-
* @param world_size The world size, the total number of cooperating GPUs.
40-
*
41-
* @return The rank of the GPU the given id is mapped to.
42-
*/
43-
template <typename index_t>
44-
__device__ inline auto rank_assignment(
45-
index_t id, uint32_t rank, uint32_t world_size) {
46-
// Consider using a faster implementation in the future.
47-
constexpr uint64_t kCurandSeed = 999961; // Any random number.
48-
curandStatePhilox4_32_10_t rng;
49-
curand_init(kCurandSeed, 0, id, &rng);
50-
return (curand(&rng) - rank) % world_size;
51-
}
52-
5330
/**
5431
* @brief Given node ids, the rank of current GPU and the world size, returns
5532
* the ranks that the given ids belong in a deterministic manner.
@@ -102,7 +79,8 @@ RankSortImpl(
10279
* that belongs to the `i`th rank.
10380
*/
10481
std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> RankSort(
105-
std::vector<torch::Tensor>& nodes_list, int64_t rank, int64_t world_size);
82+
const std::vector<torch::Tensor>& nodes_list, int64_t rank,
83+
int64_t world_size);
10684

10785
} // namespace cuda
10886
} // namespace graphbolt

graphbolt/src/cuda/extension/unique_and_compact_map.cu

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <numeric>
3434

3535
#include "../common.h"
36+
#include "../cooperative_minibatching_utils.cuh"
3637
#include "../cooperative_minibatching_utils.h"
3738
#include "../utils.h"
3839
#include "./unique_and_compact.h"

graphbolt/src/python_binding.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
#include <graphbolt/unique_and_compact.h>
1111

1212
#ifdef GRAPHBOLT_USE_CUDA
13+
#include "./cuda/cooperative_minibatching_utils.h"
1314
#include "./cuda/max_uva_threads.h"
1415
#endif
1516
#include "./cnumpy.h"
16-
#include "./expand_indptr.h"
1717
#include "./feature_cache.h"
1818
#include "./index_select.h"
1919
#include "./io_uring.h"
@@ -196,6 +196,7 @@ TORCH_LIBRARY(graphbolt, m) {
196196
m.def("set_seed", &RandomEngine::SetManualSeed);
197197
#ifdef GRAPHBOLT_USE_CUDA
198198
m.def("set_max_uva_threads", &cuda::set_max_uva_threads);
199+
m.def("rank_sort", &cuda::RankSort);
199200
#endif
200201
#ifdef HAS_IMPL_ABSTRACT_PYSTUB
201202
m.impl_abstract_pystub("dgl.graphbolt.base", "//dgl.graphbolt.base");
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import unittest
2+
3+
from functools import partial
4+
5+
import backend as F
6+
import dgl.graphbolt as gb
7+
import pytest
8+
import torch
9+
10+
WORLD_SIZE = 7
11+
12+
assert_equal = partial(torch.testing.assert_close, rtol=0, atol=0)
13+
14+
15+
@unittest.skipIf(
16+
F._default_context_str != "gpu",
17+
reason="This test requires an NVIDIA GPU.",
18+
)
19+
@pytest.mark.parametrize("dtype", [torch.int32, torch.int64])
20+
@pytest.mark.parametrize("rank", list(range(WORLD_SIZE)))
21+
def test_gpu_cached_feature_read_async(dtype, rank):
22+
nodes_list1 = [
23+
torch.randint(0, 11111111, [777], dtype=dtype, device=F.ctx())
24+
for i in range(10)
25+
]
26+
nodes_list2 = [nodes.sort()[0] for nodes in nodes_list1]
27+
28+
res1 = torch.ops.graphbolt.rank_sort(nodes_list1, rank, WORLD_SIZE)
29+
res2 = torch.ops.graphbolt.rank_sort(nodes_list2, rank, WORLD_SIZE)
30+
31+
for i, ((nodes1, idx1, offsets1), (nodes2, idx2, offsets2)) in enumerate(
32+
zip(res1, res2)
33+
):
34+
assert_equal(nodes_list1[i], nodes1[idx1.sort()[1]])
35+
assert_equal(nodes_list2[i], nodes2[idx2.sort()[1]])
36+
assert_equal(offsets1, offsets2)
37+
assert offsets1.is_pinned() and offsets2.is_pinned()
38+
39+
res3 = torch.ops.graphbolt.rank_sort(nodes_list1, rank, WORLD_SIZE)
40+
41+
# This function is deterministic. Call with identical arguments and check.
42+
for (nodes1, idx1, offsets1), (nodes3, idx3, offsets3) in zip(res1, res3):
43+
assert_equal(nodes1, nodes3)
44+
assert_equal(idx1, idx3)
45+
assert_equal(offsets1, offsets3)
46+
47+
# The dependency on the rank argument is simply a permutation.
48+
res4 = torch.ops.graphbolt.rank_sort(nodes_list1, 0, WORLD_SIZE)
49+
for (nodes1, idx1, offsets1), (nodes4, idx4, offsets4) in zip(res1, res4):
50+
off1 = offsets1.tolist()
51+
off4 = offsets4.tolist()
52+
for i in range(WORLD_SIZE):
53+
j = (i - rank + WORLD_SIZE) % WORLD_SIZE
54+
assert_equal(
55+
nodes1[off1[j] : off1[j + 1]], nodes4[off4[i] : off4[i + 1]]
56+
)
57+
assert_equal(
58+
idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]]
59+
)

0 commit comments

Comments
 (0)