diff --git a/Makefile b/Makefile index adaf151..ef222e6 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ PYTHONPATH := `pwd` #* Variables IMAGE := dynnode2vec VERSION := latest +SRC_FOLDERS := ./dynnode2vec ./tests ./benchmarks #* Poetry .PHONY: poetry-download @@ -30,8 +31,8 @@ pre-commit-install: #* Formatters .PHONY: codestyle codestyle: - poetry run isort --settings-path pyproject.toml ./dynnode2vec ./tests - poetry run black --config pyproject.toml ./dynnode2vec ./tests + poetry run isort --settings-path pyproject.toml $(SRC_FOLDERS) + poetry run black --config pyproject.toml $(SRC_FOLDERS) #* Linting .PHONY: test @@ -41,18 +42,18 @@ test: .PHONY: check-codestyle check-codestyle: - poetry run isort --diff --check-only --settings-path pyproject.toml ./dynnode2vec ./tests - poetry run black --diff --check --config pyproject.toml ./dynnode2vec ./tests - poetry run darglint --verbosity 2 ./dynnode2vec ./tests - poetry run pylint ./dynnode2vec/ ./tests + poetry run isort --diff --check-only --settings-path pyproject.toml $(SRC_FOLDERS) + poetry run black --diff --check --config pyproject.toml $(SRC_FOLDERS) + poetry run darglint --verbosity 2 $(SRC_FOLDERS) + poetry run pylint $(SRC_FOLDERS) .PHONY: pylint pylint: - poetry run pylint ./dynnode2vec/ ./tests + poetry run pylint $(SRC_FOLDERS) .PHONY: mypy mypy: - poetry run mypy --config-file pyproject.toml ./dynnode2vec ./tests + poetry run mypy --config-file pyproject.toml $(SRC_FOLDERS) .PHONY: lint lint: check-codestyle mypy diff --git a/benchmarks/build_graphs.py b/benchmarks/build_graphs.py new file mode 100644 index 0000000..6389489 --- /dev/null +++ b/benchmarks/build_graphs.py @@ -0,0 +1,30 @@ +""" +Build graphs from datasets. +""" +from __future__ import annotations + +import gzip + +import networkx as nx + + +def build_as_733_graphs() -> list[nx.Graph]: + """ + Build the Autonomous systems AS-733 graphs. 
+ link: https://snap.stanford.edu/data/as-733.html + """ + graphs = [] + graph = nx.Graph() + with gzip.open("benchmarks/data/as-733.tar.gz", "rt") as stream: + for line in stream: + if "Autonomous systems" in line: + if graph.nodes: + graphs.append(graph) + graph = nx.Graph() # new object per snapshot: clear() would also empty the graph just appended + continue + if line[0].isdigit(): + edge = map(int, line.strip().split("\t")) + graph.add_edge(*edge) + graphs.append(graph) + graphs.reverse() # Input is in reverse chronological order + return graphs diff --git a/benchmarks/build_sets.py b/benchmarks/build_sets.py new file mode 100644 index 0000000..ee8c2c5 --- /dev/null +++ b/benchmarks/build_sets.py @@ -0,0 +1,22 @@ +""" +Build training and test sets from dynamic graphs. +""" +from __future__ import annotations + +import networkx as nx + +from dynnode2vec.dynnode2vec import DynNode2Vec, Embedding + + +def get_node2vec_embeddings( + graphs: list[nx.Graph], parameters: dict +) -> list[Embedding]: + """ + Build plain node2vec embeddings at each time step. + """ + dynnode2vec_obj = DynNode2Vec(**parameters) + embeddings = [] + for graph in graphs: + _, embedding = dynnode2vec_obj.get_node2vec_embeddings(graph) + embeddings.extend(embedding) + return embeddings diff --git a/benchmarks/data/as-733.tar.gz b/benchmarks/data/as-733.tar.gz new file mode 100644 index 0000000..ff3293f Binary files /dev/null and b/benchmarks/data/as-733.tar.gz differ diff --git a/dynnode2vec/dynnode2vec.py b/dynnode2vec/dynnode2vec.py index f11c54e..44c633e 100644 --- a/dynnode2vec/dynnode2vec.py +++ b/dynnode2vec/dynnode2vec.py @@ -91,16 +91,14 @@ def __init__( # see https://stackoverflow.com/questions/53417258/what-is-workers-parameter-in-word2vec-in-nlp # pylint: disable=line-too-long self.gensim_workers = max(self.parallel_processes - 1, 12) - def _initialize_embeddings( - self, graphs: list[nx.Graph] + def get_node2vec_embeddings( + self, graph: nx.Graph ) -> tuple[Word2Vec, list[Embedding]]: """ - Compute normal node2vec embedding at timestep 0. 
+ Compute normal node2vec embedding. """ - first_graph = graphs[0] - - first_walks = BiasedRandomWalk(first_graph).run( - nodes=first_graph.nodes(), + walks = BiasedRandomWalk(graph).run( + nodes=graph.nodes, walk_length=self.walk_length, n_walks=self.n_walks_per_node, p=self.p, @@ -108,7 +106,7 @@ def _initialize_embeddings( ) model = Word2Vec( - sentences=first_walks, + sentences=walks, vector_size=self.embedding_size, window=self.window, min_count=0, @@ -159,7 +157,7 @@ def generate_updated_walks( if self.plain_node2vec: # if we stick to node2vec implementation, we sample walks # for all nodes at each time step - delta_nodes = current_graph.nodes() + delta_nodes = current_graph.nodes else: # if we use dynnode2vec, we sample walks only for nodes # that changed compared to the previous time step @@ -233,8 +231,13 @@ def compute_embeddings(self, graphs: list[nx.Graph]) -> list[Embedding]: Compute dynamic embeddings on a list of graphs. """ # TO DO : check graph weights valid - model, embeddings = self._initialize_embeddings(graphs) + # Compute normal node2vec embedding at timestep 0. + model, embeddings = self.get_node2vec_embeddings(graphs[0]) + + # Simulate walks for all time steps. time_walks = self._simulate_walks(graphs) + + # Compute embeddings for all time steps. 
self._update_embeddings(embeddings, time_walks, model) return embeddings diff --git a/dynnode2vec/utils.py b/dynnode2vec/utils.py index f245494..4739d6b 100644 --- a/dynnode2vec/utils.py +++ b/dynnode2vec/utils.py @@ -26,7 +26,7 @@ def create_dynamic_graph( graph = nx.fast_gnp_random_graph(n=n_base_nodes, p=base_density) # add one to each node to avoid the perfect case where true_ids match int_ids - graph = nx.relabel_nodes(graph, mapping={n: str(n) for n in graph.nodes()}) + graph = nx.relabel_nodes(graph, mapping={n: str(n) for n in graph.nodes}) # initialize graphs list with first graph graphs = [graph.copy()] diff --git a/tests/test_biased_random_walk.py b/tests/test_biased_random_walk.py index 8249797..d1e2ef1 100644 --- a/tests/test_biased_random_walk.py +++ b/tests/test_biased_random_walk.py @@ -26,7 +26,7 @@ def test_init(graphs): brw = dynnode2vec.biased_random_walk.BiasedRandomWalk(graphs[0]) # make sure nodes ids were converted to integers - assert list(brw.graph.nodes()) == list(range(brw.graph.number_of_nodes())) + assert list(brw.graph.nodes) == list(range(brw.graph.number_of_nodes())) def test_weighted_choice(graphs): @@ -66,7 +66,7 @@ def test_generate_walk(graphs, ip, iq, weighted): ) assert isinstance(walk, list) - assert all(n in brw.graph.nodes() for n in walk) + assert all(n in brw.graph.nodes for n in walk) @pytest.mark.parametrize("p", [0.5, 1.0]) @@ -84,7 +84,7 @@ def test_run(graphs, p, q, weighted, n_processes): brw = dynnode2vec.biased_random_walk.BiasedRandomWalk(graph) random_walks = brw.run( - graph.nodes(), p=p, q=q, weighted=weighted, n_processes=n_processes + graph.nodes, p=p, q=q, weighted=weighted, n_processes=n_processes ) assert all(isinstance(walk, list) for walk in random_walks) - assert all(n in graph.nodes() for walk in random_walks for n in walk) + assert all(n in graph.nodes for walk in random_walks for n in walk) diff --git a/tests/test_dynnode2vec.py b/tests/test_dynnode2vec.py index 70dc250..e7c5be5 100644 --- 
a/tests/test_dynnode2vec.py +++ b/tests/test_dynnode2vec.py @@ -38,9 +38,8 @@ def plain_node2vec_parallel_fixture(): ) -def test_initialize_embeddings(graphs, dynnode2vec_object): - # pylint: disable=protected-access - init_model, init_embeddings = dynnode2vec_object._initialize_embeddings(graphs) +def test_get_node2vec_embeddings(graphs, dynnode2vec_object): + init_model, init_embeddings = dynnode2vec_object.get_node2vec_embeddings(graphs[0]) assert isinstance(init_model, gensim.models.Word2Vec) assert isinstance(init_embeddings[0], dynnode2vec.Embedding) @@ -74,7 +73,7 @@ def test_generate_updated_walks(graphs, dynnode2vec_object): updated_walks = dynnode2vec_object.generate_updated_walks(current, previous) assert isinstance(updated_walks, list) - assert all(node in current.nodes() for walk in updated_walks for node in walk) + assert all(node in current.nodes for walk in updated_walks for node in walk) def test_node2vec_generate_updated_walks(graphs, node2vec_object): @@ -83,7 +82,7 @@ def test_node2vec_generate_updated_walks(graphs, node2vec_object): updated_walks = node2vec_object.generate_updated_walks(current, previous) assert isinstance(updated_walks, list) - assert all(node in current.nodes() for walk in updated_walks for node in walk) + assert all(node in current.nodes for walk in updated_walks for node in walk) def test_compute_embeddings(graphs, dynnode2vec_object):