dmlc
diff --git a/‎graphbolt/include/graphbolt/cuda_ops.h
+13-2 b/‎graphbolt/include/graphbolt/cuda_ops.h
+13-2
diff --git a/‎graphbolt/include/graphbolt/unique_and_compact.h
+15-3 b/‎graphbolt/include/graphbolt/unique_and_compact.h
+15-3
diff --git a/‎graphbolt/src/cuda/extension/gpu_graph_cache.cu
+3-2 b/‎graphbolt/src/cuda/extension/gpu_graph_cache.cu
+3-2
diff --git a/‎graphbolt/src/cuda/extension/unique_and_compact.h
+2-1 b/‎graphbolt/src/cuda/extension/unique_and_compact.h
+2-1
@@ -274,10 +274,19 @@ torch::Tensor IndptrEdgeIdsImpl(
  *   2. Compact Operation: Utilizes the reverse mapping derived from the unique
  * operation to transform 'src_ids' and 'dst_ids' into compacted IDs.
  *
+ * When world_size is greater than 1, the given ids are partitioned between
+ * the available ranks. The ids corresponding to the given rank are guaranteed
+ * to come before the ids of other ranks. To do this, the partition ids are
+ * rotated backwards by the given rank so that the ids are ordered as:
+ * [rank, rank + 1, ..., world_size - 1, 0, ..., rank - 1]. This is supported
+ * only for Volta and later generation NVIDIA GPUs.
+ *
  * @param src_ids         A tensor containing source IDs.
  * @param dst_ids         A tensor containing destination IDs.
  * @param unique_dst_ids  A tensor containing unique destination IDs, which is
  *                        exactly all the unique elements in 'dst_ids'.
+ * @param rank            The rank of the current GPU.
+ * @param world_size      The total number of GPUs (the world size).
  *
  * @return
  * - A tensor representing all unique elements in 'src_ids' and 'dst_ids' after
@@ -299,7 +308,8 @@ torch::Tensor IndptrEdgeIdsImpl(
  */
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
     const torch::Tensor src_ids, const torch::Tensor dst_ids,
-    const torch::Tensor unique_dst_ids);
+    const torch::Tensor unique_dst_ids, const int64_t rank,
+    const int64_t world_size);
 
 /**
  * @brief Batched version of UniqueAndCompact. The ith element of the return
@@ -310,7 +320,8 @@ std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
 UniqueAndCompactBatched(
     const std::vector<torch::Tensor>& src_ids,
     const std::vector<torch::Tensor>& dst_ids,
-    const std::vector<torch::Tensor>& unique_dst_ids);
+    const std::vector<torch::Tensor>& unique_dst_ids, const int64_t rank,
+    const int64_t world_size);
 
 }  //  namespace ops
 }  //  namespace graphbolt
 
@@ -24,10 +24,19 @@ namespace sampling {
  *   2. Compact Operation: Utilizes the reverse mapping derived from the unique
  * operation to transform 'src_ids' and 'dst_ids' into compacted IDs.
  *
+ * When world_size is greater than 1, the given ids are partitioned between
+ * the available ranks. The ids corresponding to the given rank are guaranteed
+ * to come before the ids of other ranks. To do this, the partition ids are
+ * rotated backwards by the given rank so that the ids are ordered as:
+ * [rank, rank + 1, ..., world_size - 1, 0, ..., rank - 1]. This is supported
+ * only for Volta and later generation NVIDIA GPUs.
+ *
  * @param src_ids         A tensor containing source IDs.
  * @param dst_ids         A tensor containing destination IDs.
  * @param unique_dst_ids  A tensor containing unique destination IDs, which is
  *                        exactly all the unique elements in 'dst_ids'.
+ * @param rank            The rank of the current GPU.
+ * @param world_size      The total number of GPUs (the world size).
  *
  * @return
  * - A tensor representing all unique elements in 'src_ids' and 'dst_ids' after
@@ -49,20 +58,23 @@ namespace sampling {
  */
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
     const torch::Tensor& src_ids, const torch::Tensor& dst_ids,
-    const torch::Tensor unique_dst_ids);
+    const torch::Tensor unique_dst_ids, const int64_t rank,
+    const int64_t world_size);
 
 std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
 UniqueAndCompactBatched(
     const std::vector<torch::Tensor>& src_ids,
     const std::vector<torch::Tensor>& dst_ids,
-    const std::vector<torch::Tensor> unique_dst_ids);
+    const std::vector<torch::Tensor> unique_dst_ids, const int64_t rank,
+    const int64_t world_size);
 
 c10::intrusive_ptr<Future<
     std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>>>
 UniqueAndCompactBatchedAsync(
     const std::vector<torch::Tensor>& src_ids,
     const std::vector<torch::Tensor>& dst_ids,
-    const std::vector<torch::Tensor> unique_dst_ids);
+    const std::vector<torch::Tensor> unique_dst_ids, const int64_t rank,
+    const int64_t world_size);
 
 }  // namespace sampling
 }  // namespace graphbolt
 
@@ -25,6 +25,7 @@
 #include <cub/cub.cuh>
 #include <cuco/static_map.cuh>
 #include <cuda/std/atomic>
+#include <cuda/stream_ref>
 #include <limits>
 #include <numeric>
 #include <type_traits>
@@ -138,7 +139,7 @@ GpuGraphCache::GpuGraphCache(
             {},
             {},
             allocator_t<index_t>{},
-            cuco::cuda_stream_ref{cuda::GetCurrentStream()}};
+            ::cuda::stream_ref{cuda::GetCurrentStream()}};
         map_ = new map_t<index_t>{std::move(map_temp)};
       }));
   C10_CUDA_KERNEL_LAUNCH_CHECK();  // Check the map constructor's success.
@@ -185,7 +186,7 @@ std::tuple<torch::Tensor, torch::Tensor, int64_t, int64_t> GpuGraphCache::Query(
             map_size_ + seeds.size(0) >= map->capacity() * kDoubleLoadFactor)) {
           map->rehash_async(
               map->capacity() * kIntGrowthFactor,
-              cuco::cuda_stream_ref{cuda::GetCurrentStream()});
+              ::cuda::stream_ref{cuda::GetCurrentStream()});
         }
         auto positions = torch::empty_like(seeds);
         CUDA_KERNEL_CALL(
 
@@ -32,7 +32,8 @@ std::vector<std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> >
 UniqueAndCompactBatchedHashMapBased(
     const std::vector<torch::Tensor>& src_ids,
     const std::vector<torch::Tensor>& dst_ids,
-    const std::vector<torch::Tensor>& unique_dst_ids);
+    const std::vector<torch::Tensor>& unique_dst_ids, const int64_t rank,
+    const int64_t world_size);
 
 }  // namespace ops
 }  // namespace graphbolt