[GraphBolt][CUDA] Fix Cooperative Minibatching bugs. (#7804)

mfbalin · web-flow · commit 31ad9b5a8d88 · 2024-09-19T14:57:42.000-04:00
diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu
@@ -19,6 +19,7 @@
  * implementations in CUDA.
  */
 #include <graphbolt/cuda_ops.h>
+#include <thrust/scatter.h>
 #include <thrust/transform.h>
 
 #include <cub/cub.cuh>
@@ -62,8 +63,7 @@ RankSortImpl(
   auto part_ids2 = part_ids.clone();
   auto part_ids2_sorted = torch::empty_like(part_ids2);
   auto nodes_sorted = torch::empty_like(nodes);
-  auto index = ops::IndptrEdgeIdsImpl(
-      offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel());
+  auto index = torch::arange(nodes.numel(), nodes.options());
   auto index_sorted = torch::empty_like(index);
   return AT_DISPATCH_INDEX_TYPES(
       nodes.scalar_type(), "RankSortImpl", ([&] {
@@ -100,8 +100,14 @@ RankSortImpl(
             index.data_ptr<index_t>(), index_sorted.data_ptr<index_t>(),
             nodes.numel(), num_batches, offsets_dev_ptr, offsets_dev_ptr + 1, 0,
             num_bits);
+        auto values = ops::IndptrEdgeIdsImpl(
+            offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel());
+        THRUST_CALL(
+            scatter, values.data_ptr<index_t>(),
+            values.data_ptr<index_t>() + values.numel(),
+            index_sorted.data_ptr<index_t>(), index.data_ptr<index_t>());
         return std::make_tuple(
-            nodes_sorted, index_sorted, offsets, std::move(offsets_event));
+            nodes_sorted, index, offsets, std::move(offsets_event));
       }));
 }
 
diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h
@@ -42,21 +42,21 @@ torch::Tensor RankAssignment(
 
 /**
  * @brief Given node ids, the ranks they belong, the offsets to separate
- * different node types and num_bits indicating the world size is <= 2^num_bits,
- * returns node ids sorted w.r.t. the ranks that the given ids belong along with
- * the original positions.
+ * different node types and world size, returns node ids sorted w.r.t. the ranks
+ * that the given ids belong to, along with their new positions.
  *
  * @param nodes        Node id tensor to be mapped to a rank in [0, world_size).
  * @param part_ids     Rank tensor the nodes belong to.
  * @param offsets_dev  Offsets to separate different node types.
  * @param world_size   World size, the total number of cooperating GPUs.
  *
- * @return (sorted_nodes, original_positions, rank_offsets, rank_offsets_event),
- * where the first one includes sorted nodes, the second contains original
- * positions of the sorted nodes and the third contains the offsets of the
- * sorted_nodes indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]]
- * contains nodes that belongs to the `i`th rank. Before accessing rank_offsets
- * on the CPU, `rank_offsets_event.synchronize()` is required.
+ * @return (sorted_nodes, new_positions, rank_offsets, rank_offsets_event),
+ * where the first one includes sorted nodes, the second contains new positions
+ * of the given nodes, so that sorted_nodes[new_positions] == nodes, and the
+ * third contains the offsets of the sorted_nodes indicating
+ * sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes that
+ * belong to the `i`th rank. Before accessing rank_offsets on the CPU,
+ * `rank_offsets_event.synchronize()` is required.
  */
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, at::cuda::CUDAEvent>
 RankSortImpl(
@@ -72,11 +72,12 @@ RankSortImpl(
  * @param rank         Rank of the current GPU.
  * @param world_size   World size, the total number of cooperating GPUs.
  *
- * @return vector of (sorted_nodes, original_positions, rank_offsets), where the
- * first one includes sorted nodes, the second contains original positions of
- * the sorted nodes and the third contains the offsets of the sorted_nodes
- * indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes
- * that belongs to the `i`th rank.
+ * @return vector of (sorted_nodes, new_positions, rank_offsets), where the
+ * first one includes sorted nodes, the second contains new positions of the
+ * given nodes, so that sorted_nodes[new_positions] == nodes, and the third
+ * contains the offsets of the sorted_nodes indicating
+ * sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes that
+ * belong to the `i`th rank.
  */
 std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> RankSort(
     const std::vector<torch::Tensor>& nodes_list, int64_t rank,
diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu
@@ -99,8 +99,12 @@ __global__ void _MapIdsBatched(
 
       auto slot = map.find(key);
       auto new_id = slot->second;
-      if (index) new_id = index[new_id];
-      mapped_ids[i] = new_id - unique_ids_offsets[batch_index];
+      if (index) {
+        new_id = index[new_id];
+      } else {
+        new_id -= unique_ids_offsets[batch_index];
+      }
+      mapped_ids[i] = new_id;
     }
 
     i += stride;
diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py
@@ -78,10 +78,10 @@ def backward(
                 torch.split(typed_grad_output, counts_sent[ntype]),
             )
             i = out.new_empty(2, out.shape[0], dtype=torch.int64)
-            i[0] = torch.arange(
+            i[0] = seed_inverse_ids[ntype]  # src
+            i[1] = torch.arange(
                 out.shape[0], device=typed_grad_output.device
-            )  # src
-            i[1] = seed_inverse_ids[ntype]  # dst
+            )  # dst
             coo = torch.sparse_coo_tensor(
                 i,
                 torch.ones(
diff --git a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py
@@ -18,7 +18,7 @@
 )
 @pytest.mark.parametrize("dtype", [torch.int32, torch.int64])
 @pytest.mark.parametrize("rank", list(range(WORLD_SIZE)))
-def test_gpu_cached_feature_read_async(dtype, rank):
+def test_rank_sort_and_unique_and_compact(dtype, rank):
     torch.manual_seed(7)
     nodes_list1 = [
         torch.randint(0, 2111111111, [777], dtype=dtype, device=F.ctx())
@@ -32,8 +32,8 @@ def test_gpu_cached_feature_read_async(dtype, rank):
     for i, ((nodes1, idx1, offsets1), (nodes2, idx2, offsets2)) in enumerate(
         zip(res1, res2)
     ):
-        assert_equal(nodes_list1[i], nodes1[idx1.sort()[1]])
-        assert_equal(nodes_list2[i], nodes2[idx2.sort()[1]])
+        assert_equal(nodes_list1[i], nodes1[idx1])
+        assert_equal(nodes_list2[i], nodes2[idx2])
         assert_equal(offsets1, offsets2)
         assert offsets1.is_pinned() and offsets2.is_pinned()
 
@@ -50,14 +50,12 @@ def test_gpu_cached_feature_read_async(dtype, rank):
     for (nodes1, idx1, offsets1), (nodes4, idx4, offsets4) in zip(res1, res4):
         off1 = offsets1.tolist()
         off4 = offsets4.tolist()
+        assert_equal(nodes1[idx1], nodes4[idx4])
         for i in range(WORLD_SIZE):
             j = (i - rank + WORLD_SIZE) % WORLD_SIZE
             assert_equal(
                 nodes1[off1[j] : off1[j + 1]], nodes4[off4[i] : off4[i + 1]]
             )
-            assert_equal(
-                idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]]
-            )
 
     unique, compacted, offsets = gb.unique_and_compact(
         nodes_list1[:1], rank, WORLD_SIZE