@@ -3,6 +3,7 @@
 import tempfile
 
 import dgl
+import dgl.backend as F
 
 import numpy as np
 import pyarrow.parquet as pq
@@ -19,7 +20,8 @@
 
 from distpartitioning import array_readwriter
 from distpartitioning.utils import generate_read_list
-from pytest_utils import create_chunked_dataset
+from pytest_utils import chunk_graph, create_chunked_dataset
+from scipy import sparse as spsp
 
 from tools.verification_utils import (
     verify_graph_feats,
@@ -202,6 +204,103 @@ def test_chunk_graph_arbitrary_chunks(
     )
 
 
+def create_mini_chunked_dataset(
+    root_dir,
+    num_chunks,
+    data_fmt,
+    edges_fmt,
+    vector_rows,
+    few_entity="node",
+    **kwargs,
+):
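+    # Build a small synthetic heterograph: three regular node types with
+    # ~1000 nodes each, plus optional "few-entity" node/edge types that
+    # exercise the corner case of types with very few members.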
+    num_nodes = {"n1": 1000, "n2": 1010, "n3": 1020}
+    etypes = [
+        ("n1", "r1", "n2"),
+        ("n2", "r1", "n1"),
+        ("n1", "r2", "n3"),
+        ("n2", "r3", "n3"),
+    ]
+    node_items = ["n1", "n2", "n3"]
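+    # Sample random edges for each base relation as a sparse COO matrix
+    # (density 0.001, fixed seed for reproducibility).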
+    edges_coo = {}
+    for etype in etypes:
+        src_ntype, _, dst_ntype = etype
+        arr = spsp.random(
+            num_nodes[src_ntype],
+            num_nodes[dst_ntype],
+            density=0.001,
+            format="coo",
+            random_state=100,
+        )
+        edges_coo[etype] = (arr.row, arr.col)
+    edge_items = []
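+    # Inject entity types that have only two members so the partitioning
+    # pipeline is tested on near-empty node/edge types.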
+    if few_entity == "edge":
+        edges_coo[("n1", "a0", "n2")] = (
+            torch.tensor([0, 1]),
+            torch.tensor([1, 0]),
+        )
+        edges_coo[("n1", "a1", "n3")] = (
+            torch.tensor([0, 1]),
+            torch.tensor([1, 0]),
+        )
+        edge_items.append(("n1", "a0", "n2"))
+        edge_items.append(("n1", "a1", "n3"))
+    elif few_entity == "node":
+        edges_coo[("n1", "r_few", "n_few")] = (
+            torch.tensor([0, 1]),
+            torch.tensor([1, 0]),
+        )
+        edges_coo[("a0", "a01", "n_1")] = (
+            torch.tensor([0, 1]),
+            torch.tensor([1, 0]),
+        )
+        edge_items.append(("n1", "r_few", "n_few"))
+        edge_items.append(("a0", "a01", "n_1"))
+        node_items.append("n_few")
+        node_items.append("n_1")
+        num_nodes["n_few"] = 2
+        num_nodes["n_1"] = 2
+    g = dgl.heterograph(edges_coo)
+
+    node_data = {}
+    edge_data = {}
+    # Save features to disk and attach them to the in-memory graph.
+    input_dir = os.path.join(root_dir, "data_test")
+
+    for ntype in node_items:
+        os.makedirs(os.path.join(input_dir, ntype))
+        feat = np.random.randn(num_nodes[ntype], 3)
+        feat_path = os.path.join(input_dir, f"{ntype}/feat.npy")
+        with open(feat_path, "wb") as f:
+            np.save(f, feat)
+        g.nodes[ntype].data["feat"] = torch.from_numpy(feat)
+        node_data[ntype] = {"feat": feat_path}
+
+    for etype in set(edge_items):
+        os.makedirs(os.path.join(input_dir, etype[1]))
+        num_edge = len(edges_coo[etype][0])
+        feat = np.random.randn(num_edge, 4)
+        feat_path = os.path.join(input_dir, f"{etype[1]}/feat.npy")
+        with open(feat_path, "wb") as f:
+            np.save(f, feat)
+        g.edges[etype].data["feat"] = torch.from_numpy(feat)
+        edge_data[etype] = {"feat": feat_path}
+
+    output_dir = os.path.join(root_dir, "chunked-data")
+    chunk_graph(
+        g,
+        "mag240m",
+        node_data,
+        edge_data,
+        num_chunks=num_chunks,
+        output_path=output_dir,
+        data_fmt=data_fmt,
+        edges_fmt=edges_fmt,
+        vector_rows=vector_rows,
+        **kwargs,
+    )
+    return g
+
+
 def _test_pipeline(
     num_chunks,
     num_parts,
@@ -373,6 +472,98 @@ def test_pipeline_feature_format(data_fmt):
     _test_pipeline(4, 4, 4, data_fmt=data_fmt)
 
 
+@pytest.mark.parametrize(
+    "num_chunks, num_parts, world_size",
+    [[4, 4, 4], [8, 4, 2], [8, 4, 4], [9, 6, 3], [11, 11, 1], [11, 4, 1]],
+)
+@pytest.mark.parametrize("few_entity", ["node", "edge"])
+def test_partition_hetero_few_entity(
+    num_chunks,
+    num_parts,
+    world_size,
+    few_entity,
+    graph_formats=None,
+    data_fmt="numpy",
+    edges_fmt="csv",
+    vector_rows=False,
+    num_chunks_nodes=None,
+    num_chunks_edges=None,
+    num_chunks_node_data=None,
+    num_chunks_edge_data=None,
+):
+    with tempfile.TemporaryDirectory() as root_dir:
+        g = create_mini_chunked_dataset(
+            root_dir,
+            num_chunks,
+            few_entity=few_entity,
+            data_fmt=data_fmt,
+            edges_fmt=edges_fmt,
+            vector_rows=vector_rows,
+            num_chunks_nodes=num_chunks_nodes,
+            num_chunks_edges=num_chunks_edges,
+            num_chunks_node_data=num_chunks_node_data,
+            num_chunks_edge_data=num_chunks_edge_data,
+        )
+
+        # Step1: graph partition
+        in_dir = os.path.join(root_dir, "chunked-data")
+        output_dir = os.path.join(root_dir, "parted_data")
+        os.system(
+            "python3 tools/partition_algo/random_partition.py "
+            "--in_dir {} --out_dir {} --num_partitions {}".format(
+                in_dir, output_dir, num_parts
+            )
+        )
+
+        # Step2: data dispatch
+        partition_dir = os.path.join(root_dir, "parted_data")
+        out_dir = os.path.join(root_dir, "partitioned")
+        ip_config = os.path.join(root_dir, "ip_config.txt")
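+        # Fake an IP config: every "machine" is the local host, one line
+        # per worker in the simulated cluster.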
+        with open(ip_config, "w") as f:
+            for i in range(world_size):
+                f.write(f"127.0.0.{i + 1}\n")
+
+        cmd = "python3 tools/dispatch_data.py"
+        cmd += f" --in-dir {in_dir}"
+        cmd += f" --partitions-dir {partition_dir}"
+        cmd += f" --out-dir {out_dir}"
+        cmd += f" --ip-config {ip_config}"
+        cmd += " --ssh-port 22"
+        cmd += " --process-group-timeout 60"
+        cmd += " --save-orig-nids"
+        cmd += " --save-orig-eids"
+        cmd += f" --graph-formats {graph_formats}" if graph_formats else ""
+        os.system(cmd)
+
+        # read original node/edge IDs
+        def read_orig_ids(fname):
+            orig_ids = {}
+            for i in range(num_parts):
+                ids_path = os.path.join(out_dir, f"part{i}", fname)
+                part_ids = load_tensors(ids_path)
+                for type, data in part_ids.items():
+                    if type not in orig_ids:
+                        orig_ids[type] = data
+                    else:
+                        orig_ids[type] = torch.cat((orig_ids[type], data))
+            return orig_ids
+
+        orig_nids = read_orig_ids("orig_nids.dgl")
+        orig_eids = read_orig_ids("orig_eids.dgl")
+
+        # load partitions and verify
+        part_config = os.path.join(out_dir, "metadata.json")
+        for i in range(num_parts):
+            part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition(
+                part_config, i
+            )
+            verify_partition_data_types(part_g)
+            verify_partition_formats(part_g, graph_formats)
+            verify_graph_feats(
+                g, gpb, part_g, node_feats, edge_feats, orig_nids, orig_eids
+            )
+
+
 def test_utils_generate_read_list():
     read_list = generate_read_list(10, 4)
     assert np.array_equal(read_list[0], np.array([0, 1, 2]))