
Commit dbc94e0

committed Mar 1, 2018
copied files from stellar-evaluation-plugins/scrips/
1 parent 985a296 commit dbc94e0

File tree

3 files changed (+294 -0 lines)


data_splitter_test.py

+54
@@ -0,0 +1,54 @@
import os
from utils.data_splitter import DataSplitter


def run_with_cora_epgm():
    input_dir = os.path.expanduser('../tests/resources/data/cora/cora.epgm')
    output_dir = os.path.expanduser('../tests/resources/data_splitter/cora.epgm.out')

    ds = DataSplitter()

    y = ds.load_data(input_dir, dataset_name='cora', target_attribute='subject', node_type='paper')

    y_train, y_val, y_test = ds.split_data(y, nc=20, test_size=100)

    ds.write_data(output_dir=output_dir, dataset_name='cora', y_train=y_train, y_test=y_test, y_val=y_val)

    print("Done")


def run_with_yelp_epgm():
    input_dir = os.path.expanduser('../tests/resources/data/yelp/yelp.epgm')
    output_dir = os.path.expanduser('../tests/resources/data_splitter/yelp.epgm.out')
    dataset_name = 'small_yelp_example'
    ds = DataSplitter()

    y = ds.load_data(input_dir, dataset_name=dataset_name, target_attribute='elite', node_type='user')

    y_train, y_val, y_test, y_unlabeled = ds.split_data(y, nc=20, test_size=100)

    ds.write_data(output_dir=output_dir, dataset_name=dataset_name,
                  y_train=y_train, y_test=y_test, y_val=y_val, y_unlabeled=y_unlabeled)

    print("Done")


def run_with_yelp_lab():
    input_dir = os.path.expanduser('../tests/resources/data_splitter/yelp.epgm.out/small_yelp_example.lab')
    output_dir = os.path.expanduser('../tests/resources/data_splitter/yelp.epgm.out')
    dataset_name = 'small_yelp_example'
    ds = DataSplitter()

    y = ds.load_data(input_dir, dataset_name=dataset_name, target_attribute='elite', node_type='user')

    y_train, y_val, y_test, y_unlabeled = ds.split_data(y, nc=20, test_size=100)

    ds.write_data(output_dir=output_dir, dataset_name=dataset_name,
                  y_train=y_train, y_test=y_test, y_val=y_val, y_unlabeled=y_unlabeled)

    print("Done")


if __name__ == '__main__':
    run_with_yelp_lab()
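For reference, here is a minimal sketch of the split semantics these calls appear to assume, based on the scheduler's module docstring: nc samples per class go to the train set, test_size samples to the test set, and the remainder to validation (the yelp runs additionally receive a y_unlabeled bucket for rows without a label). DataSplitter itself lives in utils.data_splitter and is not part of this commit, so everything below, including the assumed (node_id, label) layout of y and the index-based return values, is an assumption, not the actual implementation:

    # Hypothetical sketch only; DataSplitter.split_data is not in this commit.
    import numpy as np

    def sketch_split(y, nc=20, test_size=100, seed=42):
        """y: (N, 2) array of (node_id, label). Returns train/val/test row
        indices: nc rows per class for train, test_size rows for test,
        the remainder for validation."""
        rng = np.random.default_rng(seed)
        labels = y[:, 1]
        train = []
        for c in np.unique(labels):
            class_idx = np.flatnonzero(labels == c)
            take = min(nc, class_idx.size)
            train.extend(rng.choice(class_idx, size=take, replace=False))
        train = np.asarray(train)
        rest = rng.permutation(np.setdiff1d(np.arange(len(y)), train))
        test, val = rest[:test_size], rest[test_size:]
        return train, val, test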

nai_scheduler.py

+215
@@ -0,0 +1,215 @@
"""
Basic Node Attribute Inference (NAI) scheduler.

It implements a basic pipeline for inferring missing node labels: the data are first split into train, test,
and validation sets, then dimensionality reduction and metric learning are applied, and finally inference is
performed. Each stage of the pipeline has options that must be specified, such as the number of samples per
class in the train set, the size of the test set, the number of output dimensions for dimensionality reduction,
the metric learning method and its parameters, and finally the classification algorithm to use along with its
related parameters.
"""
from utils.nai_epgm_utils import *
import argparse
import shutil
import copy
import os
from utils.nai_pipeline import NAIPipeline, PluginError


def display_results(results):
    for r in results:
        print(r)


def best_parameters(results):
    """
    Finds and returns the best set of parameters, in terms of highest accuracy, for a series of experiments
    whose results are stored in the given 'results' list of dictionaries.
    :param results: List of dictionaries with parameter and accuracy values.
    :return: The dictionary in results with the highest accuracy entry.
    """
    best_result = None
    highest_accuracy = -1.0
    for r in results:
        if r['accuracy']['acc_test'] > highest_accuracy:
            highest_accuracy = r['accuracy']['acc_test']
            best_result = copy.deepcopy(r)

    return best_result


def parse_args():
    """
    Parses the command line arguments.
    """
    parser = argparse.ArgumentParser(description="Metric Learning for node attribute classification in graph analysis.")

    parser.add_argument('--dataset-name', dest='dataset_name', nargs='?', default='cora',
                        help='Name of dataset.')

    parser.add_argument('--target-node-type', dest='node_type', nargs='?', default=None,
                        help='Type of nodes whose attributes are to be inferred.')

    parser.add_argument('--target-attribute', dest='target_attribute', nargs='?', default='',
                        help='Name of the attribute to infer.')

    parser.add_argument('--attributes-to-ignore', dest='attributes_to_ignore', nargs='*', default=[],
                        help='Names of attributes to ignore as predictors.')

    parser.add_argument('--input-dir', dest='input_dir', nargs='?',
                        default='/Users/eli024/Projects/data/cora/cora.epgm/',
                        help='Input directory where the graph in EPGM format can be found.')

    parser.add_argument('--temp-dir', dest='temp_dir', nargs='?',
                        default='~/temp/',
                        help='Directory for storing temporary files.')

    parser.add_argument('--output-dir', dest='output_dir',
                        nargs='?', default='pred/',
                        help='Directory to write the graph with predicted node labels in EPGM format.')

    parser.add_argument('--pipeline', dest='pipeline_filename',
                        nargs='?', default='',
                        help='JSON formatted file specifying the NAI pipeline and corresponding plugin parameters.')

    parser.add_argument('--convert-epgm', dest='convert_epgm', default=False, action='store_true',
                        help='Extract edge list, .lab, and .att files from EPGM graph format input. Default is False.')

    return parser.parse_args()


def prepare_parameters_dict():
    parameters = {}

    # To tune parameters, use code similar to the commented-out lines below. For each parameter, give a list
    # of values to try:
    # parameters['representation'] = {"p": [0.5, 1.0, 2.0], "q": [0.5, 1.0, 2.0]}
    # parameters['metric'] = {"method": ['lfda', 'lmnn'], "with_pca": [True], "pca_dim": [32, 16, 8]}
    # parameters['inference'] = {"method": ['logistic', 'rforest']}
    #
    parameters['representation'] = {"p": [1.0], "q": [1.0]}
    parameters['metric'] = {"metric": ['lfda'], "with_pca": [True], "pca_dim": [8, 16], "dim": [8]}
    parameters['inference'] = {"method": ['logistic', 'rforest']}

    return parameters


if __name__ == '__main__':

    use_fixed_pipeline = False
    ml_pipeline = []  # initialised here so plugin_names below does not raise a NameError on the fixed-pipeline path

    args = parse_args()

    input_epgm = os.path.expanduser(args.input_dir)
    dataset_name = args.dataset_name  # e.g. 'cora'
    tmp_directory = os.path.expanduser(args.temp_dir)

    # check if the tmp directory exists and if not, create it
    if not os.path.exists(tmp_directory):
        print("Creating temp directory {:s}".format(tmp_directory))
        os.mkdir(tmp_directory)
    else:
        # remove all files (if any) in the temp directory
        print("Deleting files in temp directory {:s}".format(tmp_directory))
        for fname in os.listdir(tmp_directory):
            full_path_fname = os.path.join(tmp_directory, fname)
            try:
                if os.path.isfile(full_path_fname):
                    os.unlink(full_path_fname)
            except Exception as e:
                print(e)

    nai_pipeline = NAIPipeline()

    G_epgm = None
    if args.convert_epgm:
        # remember the epgm graph and use it for output later
        G_epgm, v_map, iv_map, unique_vertex_labels, *_ = convert_from_EPGM(input_epgm, dataset_name, tmp_directory,
                                                                            node_type=args.node_type,
                                                                            target_attribute=args.target_attribute,
                                                                            attributes_to_ignore=args.attributes_to_ignore)
    else:
        # copy the EPGM files from the input_epgm directory to tmp_directory
        the_files = os.listdir(args.input_dir)
        for epgm_file in the_files:
            if epgm_file.endswith('.json'):
                shutil.copy2(os.path.join(args.input_dir, epgm_file), tmp_directory)

    dataset_dir = tmp_directory

    if args.pipeline_filename == '':
        use_fixed_pipeline = True
        print("** Using fixed NAI pipeline **")
    else:
        ml_pipeline, parameters = nai_pipeline.load_pipeline_from_file(args.pipeline_filename)
        print("** Using NAI pipeline from {:s}".format(args.pipeline_filename))
        # set the target-attribute and node-type values in the plugin parameters to the values sent to the
        # scheduler via the command line.
        for plugin in ml_pipeline:
            if 'target_attribute' in plugin['parameters'].keys():
                plugin['parameters']['target_attribute'] = [args.target_attribute]
            if 'node_type' in plugin['parameters'].keys():
                plugin['parameters']['node_type'] = [args.node_type]
            if 'attributes_to_ignore' in plugin['parameters'].keys():
                plugin['parameters']['attributes_to_ignore'] = [args.attributes_to_ignore]

    try:
        if use_fixed_pipeline:
            parameters = prepare_parameters_dict()
            all_results = nai_pipeline.run_fixed_pipeline(dataset_dir=dataset_dir,
                                                          dataset_name=dataset_name, parameters=parameters)
        else:
            all_results = nai_pipeline.run_pipeline(dataset_dir=dataset_dir,
                                                    dataset_name=dataset_name, plugin_parameters=ml_pipeline)
    except PluginError as plugin_error:
        print("***********************")
        print("PluginError raised")
        print(plugin_error.result)
        print("***********************")

    plugin_names = [p['name'] for p in ml_pipeline]

    write_predictions_to_epgm = bool(set(plugin_names).intersection({"inference", "gcn"}))

    if "inference" in plugin_names:
        display_results(all_results)
        params = best_parameters(results=all_results)

        print("\n------------------------------------------\n")
        print("Best set of parameters: ", params)
        print("\n------------------------------------------\n")

        # Now the biggest hack of all:
        # Write the predicted labels to the epgm vertices file.
        # Assume that there is only one file with extension *.pred in the temp directory, read it, and use the
        # inverse vertex map to update the vertices of G_epgm before writing back to disk.
        write_to_epgm(input_epgm, tmp_directory, args.output_dir, G_epgm, iv_map, unique_vertex_labels, args.target_attribute)
    elif "gcn" in plugin_names:
        G_epgm, v_map, iv_map, unique_vertex_labels = convert_from_EPGM(input_epgm, dataset_name, tmp_directory,
                                                                        node_type=args.node_type,
                                                                        target_attribute=args.target_attribute,
                                                                        attributes_to_ignore=args.attributes_to_ignore,
                                                                        write_to_disk=False)

        write_to_epgm(input_epgm,
                      tmp_directory + 'predictions/',
                      args.output_dir,
                      G_epgm,
                      None,
                      None,
                      target_attribute=args.target_attribute)

    # if write_predictions_to_epgm:
    #     if G_epgm is None:
    #         # this is necessary to make write_to_epgm work with results from the GCN plugin. There has to be a
    #         # better way to do this.
    #         G_epgm, v_map, iv_map, unique_vertex_labels = convert_from_EPGM(input_epgm, dataset_name, tmp_directory,
    #                                                                         node_type=args.node_type,
    #                                                                         target_attribute=args.target_attribute,
    #                                                                         attributes_to_ignore=args.attributes_to_ignore,
    #                                                                         write_to_disk=False)
    #     # write_to_epgm(input_epgm, tmp_directory, args.output_dir, G_epgm, iv_map, unique_vertex_labels, args.target_attribute)

    print("Scheduler Finished!")

run_epgm_utils.py

+25
@@ -0,0 +1,25 @@
import os
from utils.nai_epgm_utils import *


def run_with_yelp():
    input_dir = os.path.expanduser('../tests/resources/data/yelp/yelp.epgm')
    output_dir = os.path.expanduser('../tests/resources/data_splitter/yelp.epgm.out')
    dataset_name = 'small_yelp_example'
    attributes_to_ignore = ['yelpId', 'name']

    convert_from_EPGM(source_directory=input_dir,
                      output_directory=output_dir,
                      dataset_name=dataset_name,
                      target_attribute="elite",
                      node_type="user",
                      write_to_disk=True,
                      attributes_to_ignore=attributes_to_ignore)

    print("Done")


if __name__ == '__main__':
    run_with_yelp()
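Per the --convert-epgm help text in nai_scheduler.py, the conversion emits an edge list plus .lab and .att files, and run_with_yelp_lab() in data_splitter_test.py subsequently reads small_yelp_example.lab from this same output directory. A quick sanity check after running the script could look like the snippet below; any file names beyond the .lab file are assumptions:

    import os

    out = os.path.expanduser('../tests/resources/data_splitter/yelp.epgm.out')
    for fname in sorted(os.listdir(out)):
        print(fname)  # expect small_yelp_example.lab among the emitted files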
