Skip to content

Commit c490fd0

Browse files
authored
[GraphBolt][CUDA] Enable overlap_graph_fetch in examples. (#7669)
1 parent ce29f58 commit c490fd0

File tree

7 files changed

+19
-28
lines changed

7 files changed

+19
-28
lines changed

examples/graphbolt/disk_based_feature/node_classification.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -404,11 +404,7 @@ def main():
404404
print(f"Training in {args.mode} mode.")
405405
args.graph_device, args.feature_device, args.device = args.mode.split("-")
406406
args.overlap_feature_fetch = args.feature_device == "pinned"
407-
# For now, only sample_layer_neighbor is faster with this option
408-
args.overlap_graph_fetch = (
409-
args.sample_mode == "sample_layer_neighbor"
410-
and args.graph_device == "pinned"
411-
)
407+
args.overlap_graph_fetch = args.graph_device == "pinned"
412408

413409
"""
414410
Load and preprocess on-disk dataset.

examples/graphbolt/node_classification.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def create_dataloader(
159159
dataloader = gb.DataLoader(
160160
datapipe,
161161
num_workers=num_workers,
162-
overlap_graph_fetch=args.overlap_graph_fetch,
162+
overlap_graph_fetch=args.storage_device == "pinned",
163163
)
164164

165165
# Return the fully-initialized DataLoader object.
@@ -381,14 +381,6 @@ def parse_args():
381381
choices=["sample_neighbor", "sample_layer_neighbor"],
382382
help="The sampling function when doing layerwise sampling.",
383383
)
384-
parser.add_argument(
385-
"--overlap-graph-fetch",
386-
action="store_true",
387-
help="An option for enabling overlap_graph_fetch in graphbolt dataloader."
388-
"If True, the data loader will overlap the UVA graph fetching operations"
389-
"with the rest of operations by using an alternative CUDA stream. Disabled"
390-
"by default.",
391-
)
392384
return parser.parse_args()
393385

394386

examples/graphbolt/pyg/labor/node_classification.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -454,11 +454,7 @@ def main():
454454
print(f"Training in {args.mode} mode.")
455455
args.graph_device, args.feature_device, args.device = args.mode.split("-")
456456
args.overlap_feature_fetch = args.feature_device == "pinned"
457-
# For now, only sample_layer_neighbor is faster with this option
458-
args.overlap_graph_fetch = (
459-
args.sample_mode == "sample_layer_neighbor"
460-
and args.graph_device == "pinned"
461-
)
457+
args.overlap_graph_fetch = args.graph_device == "pinned"
462458

463459
# Load and preprocess dataset.
464460
print("Loading data...")

examples/graphbolt/pyg/node_classification_advanced.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -415,11 +415,7 @@ def main():
415415
print(f"Training in {args.mode} mode.")
416416
args.graph_device, args.feature_device, args.device = args.mode.split("-")
417417
args.overlap_feature_fetch = args.feature_device == "pinned"
418-
# For now, only sample_layer_neighbor is faster with this option
419-
args.overlap_graph_fetch = (
420-
args.sample_mode == "sample_layer_neighbor"
421-
and args.graph_device == "pinned"
422-
)
418+
args.overlap_graph_fetch = args.graph_device == "pinned"
423419

424420
# Load and preprocess dataset.
425421
print("Loading data...")

examples/graphbolt/rgcn/hetero_rgcn.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,11 @@ def create_dataloader(
141141
# Create a DataLoader from the datapipe.
142142
# `num_workers`:
143143
# The number of worker processes to use for data loading.
144-
return gb.DataLoader(datapipe, num_workers=num_workers)
144+
return gb.DataLoader(
145+
datapipe,
146+
num_workers=num_workers,
147+
overlap_graph_fetch=args.overlap_graph_fetch,
148+
)
145149

146150

147151
def extract_embed(node_embed, input_nodes):
@@ -568,9 +572,11 @@ def main(args):
568572
) = load_dataset(args.dataset)
569573

570574
# Move the dataset to the pinned memory to enable GPU access.
575+
args.overlap_graph_fetch = False
571576
if device == torch.device("cuda"):
572577
g.pin_memory_()
573578
features.pin_memory_()
579+
args.overlap_graph_fetch = True
574580

575581
feat_size = features.size("node", "paper", "feat")[0]
576582

examples/multigpu/graphbolt/node_classification.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,11 @@ def create_dataloader(
139139
if args.storage_device == "cpu":
140140
datapipe = datapipe.copy_to(device)
141141

142-
dataloader = gb.DataLoader(datapipe, args.num_workers)
142+
dataloader = gb.DataLoader(
143+
datapipe,
144+
args.num_workers,
145+
overlap_graph_fetch=args.storage_device == "pinned",
146+
)
143147

144148
# Return the fully-initialized DataLoader object.
145149
return dataloader

python/dgl/graphbolt/dataloader.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,9 @@ class DataLoader(torch_data.DataLoader):
127127
instances alive.
128128
overlap_graph_fetch : bool, optional
129129
If True, the data loader will overlap the UVA graph fetching operations
130-
with the rest of operations by using an alternative CUDA stream. Default
131-
is False.
130+
with the rest of operations by using an alternative CUDA stream. This
131+
option should be enabled for optimal performance if you have moved your
132+
graph to pinned memory. Default is False.
132133
num_gpu_cached_edges : int, optional
133134
If positive and overlap_graph_fetch is True, then the GPU will cache
134135
frequently accessed vertex neighborhoods to reduce the PCI-e bandwidth

0 commit comments

Comments
 (0)