[GraphBolt][CUDA] Enable overlap_graph_fetch in examples. (#7669)

mfbalin · web-flow · commit c490fd08adb1 · 2024-08-07T18:06:45.000-04:00
diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py
@@ -404,11 +404,7 @@ def main():
     print(f"Training in {args.mode} mode.")
     args.graph_device, args.feature_device, args.device = args.mode.split("-")
     args.overlap_feature_fetch = args.feature_device == "pinned"
-    # For now, only sample_layer_neighbor is faster with this option
-    args.overlap_graph_fetch = (
-        args.sample_mode == "sample_layer_neighbor"
-        and args.graph_device == "pinned"
-    )
+    args.overlap_graph_fetch = args.graph_device == "pinned"
 
     """
     Load and preprocess on-disk dataset.
diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py
@@ -159,7 +159,7 @@ def create_dataloader(
     dataloader = gb.DataLoader(
         datapipe,
         num_workers=num_workers,
-        overlap_graph_fetch=args.overlap_graph_fetch,
+        overlap_graph_fetch=args.storage_device == "pinned",
     )
 
     # Return the fully-initialized DataLoader object.
@@ -381,14 +381,6 @@ def parse_args():
         choices=["sample_neighbor", "sample_layer_neighbor"],
         help="The sampling function when doing layerwise sampling.",
     )
-    parser.add_argument(
-        "--overlap-graph-fetch",
-        action="store_true",
-        help="An option for enabling overlap_graph_fetch in graphbolt dataloader."
-        "If True, the data loader will overlap the UVA graph fetching operations"
-        "with the rest of operations by using an alternative CUDA stream. Disabled"
-        "by default.",
-    )
     return parser.parse_args()
 
 
diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py
@@ -454,11 +454,7 @@ def main():
     print(f"Training in {args.mode} mode.")
     args.graph_device, args.feature_device, args.device = args.mode.split("-")
     args.overlap_feature_fetch = args.feature_device == "pinned"
-    # For now, only sample_layer_neighbor is faster with this option
-    args.overlap_graph_fetch = (
-        args.sample_mode == "sample_layer_neighbor"
-        and args.graph_device == "pinned"
-    )
+    args.overlap_graph_fetch = args.graph_device == "pinned"
 
     # Load and preprocess dataset.
     print("Loading data...")
diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py
@@ -415,11 +415,7 @@ def main():
     print(f"Training in {args.mode} mode.")
     args.graph_device, args.feature_device, args.device = args.mode.split("-")
     args.overlap_feature_fetch = args.feature_device == "pinned"
-    # For now, only sample_layer_neighbor is faster with this option
-    args.overlap_graph_fetch = (
-        args.sample_mode == "sample_layer_neighbor"
-        and args.graph_device == "pinned"
-    )
+    args.overlap_graph_fetch = args.graph_device == "pinned"
 
     # Load and preprocess dataset.
     print("Loading data...")
diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py
@@ -141,7 +141,11 @@ def create_dataloader(
     # Create a DataLoader from the datapipe.
     # `num_workers`:
     #   The number of worker processes to use for data loading.
-    return gb.DataLoader(datapipe, num_workers=num_workers)
+    return gb.DataLoader(
+        datapipe,
+        num_workers=num_workers,
+        overlap_graph_fetch=args.overlap_graph_fetch,
+    )
 
 
 def extract_embed(node_embed, input_nodes):
@@ -568,9 +572,11 @@ def main(args):
     ) = load_dataset(args.dataset)
 
     # Move the dataset to the pinned memory to enable GPU access.
+    args.overlap_graph_fetch = False
     if device == torch.device("cuda"):
         g.pin_memory_()
         features.pin_memory_()
+        args.overlap_graph_fetch = True
 
     feat_size = features.size("node", "paper", "feat")[0]
 
diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
@@ -139,7 +139,11 @@ def create_dataloader(
     if args.storage_device == "cpu":
         datapipe = datapipe.copy_to(device)
 
-    dataloader = gb.DataLoader(datapipe, args.num_workers)
+    dataloader = gb.DataLoader(
+        datapipe,
+        args.num_workers,
+        overlap_graph_fetch=args.storage_device == "pinned",
+    )
 
     # Return the fully-initialized DataLoader object.
     return dataloader
diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py
@@ -127,8 +127,9 @@ class DataLoader(torch_data.DataLoader):
         instances alive.
     overlap_graph_fetch : bool, optional
         If True, the data loader will overlap the UVA graph fetching operations
-        with the rest of operations by using an alternative CUDA stream. Default
-        is False.
+        with the rest of operations by using an alternative CUDA stream. For
+        optimal performance, enable this option if you have moved your graph to
+        pinned memory. Default is False.
     num_gpu_cached_edges : int, optional
         If positive and overlap_graph_fetch is True, then the GPU will cache
         frequently accessed vertex neighborhoods to reduce the PCI-e bandwidth