"""
Basic Node Attribute Inference (NAI) scheduler.

It implements a basic pipeline for inferring missing node labels: the data are first split into train, test,
and validation sets, then dimensionality reduction and metric learning are applied, and finally inference is
performed. Each stage of the pipeline has options that must be specified, such as the number of samples per
class in the train set, the size of the test set, the number of output dimensions for dimensionality reduction,
the metric learning method and its corresponding parameters, and finally the classification algorithm to use
along with its related parameters.

"""
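# An illustrative invocation (a sketch only: the script name, data paths, node type, and attribute
# name below are placeholders rather than values defined in this repository; the flags themselves
# are declared in parse_args() further down):
#
#   python nai_scheduler.py \
#       --dataset-name cora \
#       --input-dir ~/data/cora/cora.epgm/ \
#       --temp-dir ~/temp/ \
#       --output-dir pred/ \
#       --target-node-type paper \
#       --target-attribute subject \
#       --convert-epgm
#
# Omitting --pipeline runs the fixed pipeline configured in prepare_parameters_dict(); passing
# --pipeline <file>.json runs the plugin pipeline described in that JSON file instead.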
# convert_from_EPGM and write_to_epgm used below come from this wildcard import.
from utils.nai_epgm_utils import *
import argparse
import shutil
import copy
import os
import sys
from utils.nai_pipeline import NAIPipeline, PluginError


def display_results(results):
    """Print each result dictionary on its own line."""
    for r in results:
        print(r)


def best_parameters(results):
    '''
    Finds and returns the best set of parameters, in terms of highest test accuracy, for a series of experiments
    whose results are stored in the given 'results' list of dictionaries.
    :param results: List of dictionaries with parameter and accuracy values
    :return: The dictionary in results with the highest accuracy entry.
    '''
    best_result = None
    highest_accuracy = -1.0
    for r in results:
        if r['accuracy']['acc_test'] > highest_accuracy:
            highest_accuracy = r['accuracy']['acc_test']
            best_result = copy.deepcopy(r)

    return best_result
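
# Usage sketch (illustrative values only; the 'parameters' key is an assumption, while the
# 'accuracy' structure matches what the loop above reads):
#
#   results = [
#       {"parameters": {"method": "logistic"}, "accuracy": {"acc_test": 0.81}},
#       {"parameters": {"method": "rforest"},  "accuracy": {"acc_test": 0.78}},
#   ]
#   best_parameters(results)   # returns a deep copy of the first entry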


def parse_args():
    """
    Parses the command line arguments.
    """
    parser = argparse.ArgumentParser(description="Metric Learning for node attribute classification in graph analysis.")

    parser.add_argument('--dataset-name', dest='dataset_name', nargs='?', default='cora',
                        help='Name of dataset.')

    parser.add_argument('--target-node-type', dest='node_type', nargs='?', default=None,
                        help='Type of nodes whose attributes are to be inferred.')

    parser.add_argument('--target-attribute', dest='target_attribute', nargs='?', default='',
                        help='Name of the attribute to infer.')

    parser.add_argument('--attributes-to-ignore', dest='attributes_to_ignore', nargs='*', default=[],
                        help='Names of attributes to ignore as predictors.')

    parser.add_argument('--input-dir', dest='input_dir', nargs='?',
                        default='/Users/eli024/Projects/data/cora/cora.epgm/',
                        help='Input directory where graph in EPGM format can be found.')

    parser.add_argument('--temp-dir', dest='temp_dir', nargs='?',
                        default='~/temp/',
                        help='Directory for storing temporary files.')

    parser.add_argument('--output-dir', dest='output_dir',
                        nargs='?', default='pred/',
                        help='Directory to write graph with predicted node labels in EPGM format.')

    parser.add_argument('--pipeline', dest='pipeline_filename',
                        nargs='?', default='',
                        help='JSON formatted file specifying the NAI pipeline and corresponding plugin parameters.')

    parser.add_argument('--convert-epgm', dest='convert_epgm', default=False, action='store_true',
                        help='Extract edge list, .lab, and .att files from EPGM graph format input. Default is False.')

    return parser.parse_args()


def prepare_parameters_dict():
    parameters = {}

    # If you want to tune parameters, use code similar to the commented example below: for each
    # parameter, give a list of values to try.
    # parameters['representation'] = {"p": [0.5, 1.0, 2.0], "q": [0.5, 1.0, 2.0]}
    # parameters['metric'] = {"method": ['lfda', 'lmnn'], "with_pca": [True], "pca_dim": [32, 16, 8]}
    # parameters['inference'] = {"method": ['logistic', 'rforest']}
    #
    parameters['representation'] = {"p": [1.0], "q": [1.0]}
    parameters['metric'] = {"metric": ['lfda'], "with_pca": [True], "pca_dim": [8, 16], "dim": [8]}
    parameters['inference'] = {"method": ['logistic', 'rforest']}

    return parameters
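
# How the pipeline plugins expand these per-parameter lists is internal to NAIPipeline and not shown
# here; the standalone sketch below only illustrates the usual way such a grid turns into individual
# parameter combinations (illustrative, not part of the scheduler's control flow):
#
#   from itertools import product
#   grid = {"pca_dim": [8, 16], "dim": [8]}
#   combos = [dict(zip(grid, values)) for values in product(*grid.values())]
#   # -> [{'pca_dim': 8, 'dim': 8}, {'pca_dim': 16, 'dim': 8}]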


if __name__ == '__main__':

    use_fixed_pipeline = False

    args = parse_args()

    input_epgm = os.path.expanduser(args.input_dir)
    dataset_name = args.dataset_name  # 'cora'
    tmp_directory = os.path.expanduser(args.temp_dir)

    # check if the tmp directory exists and if not, create it
    if not os.path.exists(tmp_directory):
        print("Creating temp directory {:s}".format(tmp_directory))
        os.mkdir(tmp_directory)
    else:
        # remove all files (if any) in the temp directory
        print("Deleting files in temp directory {:s}".format(tmp_directory))
        for fname in os.listdir(tmp_directory):
            full_path_fname = os.path.join(tmp_directory, fname)
            try:
                if os.path.isfile(full_path_fname):
                    os.unlink(full_path_fname)
            except Exception as e:
                print(e)

    #
    nai_pipeline = NAIPipeline()

    G_epgm = None
    if args.convert_epgm:
        # remember the epgm graph and use it for output later
        G_epgm, v_map, iv_map, unique_vertex_labels, *_ = convert_from_EPGM(input_epgm, dataset_name, tmp_directory,
                                                                            node_type=args.node_type,
                                                                            target_attribute=args.target_attribute,
                                                                            attributes_to_ignore=args.attributes_to_ignore)
    else:
        # copy the EPGM files from the (expanded) input directory to tmp_directory
        the_files = os.listdir(input_epgm)
        for epgm_file in the_files:
            if epgm_file.endswith('.json'):
                shutil.copy2(os.path.join(input_epgm, epgm_file), tmp_directory)

    dataset_dir = tmp_directory

    if args.pipeline_filename == '':
        use_fixed_pipeline = True
        print("** Using fixed NAI pipeline **")
    else:
        ml_pipeline, parameters = nai_pipeline.load_pipeline_from_file(args.pipeline_filename)
        print("** Using NAI pipeline from {:s}".format(args.pipeline_filename))
        # Set the target_attribute, node_type, and attributes_to_ignore values in the plugin parameters
        # to the values sent to the scheduler via the command line.
        for plugin in ml_pipeline:
            if 'target_attribute' in plugin['parameters'].keys():
                plugin['parameters']['target_attribute'] = [args.target_attribute]
            if 'node_type' in plugin['parameters'].keys():
                plugin['parameters']['node_type'] = [args.node_type]
            if 'attributes_to_ignore' in plugin['parameters'].keys():
                plugin['parameters']['attributes_to_ignore'] = [args.attributes_to_ignore]
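
    # The exact schema accepted by NAIPipeline.load_pipeline_from_file() is defined by that class, not
    # here; the sketch below is only inferred from how ml_pipeline is used in this script (a list of
    # plugins, each carrying a 'name' and a 'parameters' dictionary of value lists) and is illustrative:
    #
    #   [
    #       {"name": "representation", "parameters": {"p": [1.0], "q": [1.0]}},
    #       {"name": "metric", "parameters": {"metric": ["lfda"], "with_pca": [true], "pca_dim": [8]}},
    #       {"name": "inference", "parameters": {"method": ["logistic"], "target_attribute": [""], "node_type": [null]}}
    #   ]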

    try:
        if use_fixed_pipeline:
            parameters = prepare_parameters_dict()
            all_results = nai_pipeline.run_fixed_pipeline(dataset_dir=dataset_dir,
                                                          dataset_name=dataset_name, parameters=parameters)
        else:
            all_results = nai_pipeline.run_pipeline(dataset_dir=dataset_dir,
                                                    dataset_name=dataset_name, plugin_parameters=ml_pipeline)
    except PluginError as plugin_error:
        print("***********************")
        print("PluginError raised")
        print(plugin_error.result)
        print("***********************")
        # No results are available if a plugin failed, so stop here rather than falling through
        # to the post-processing code below.
        sys.exit(1)

    # With a fixed pipeline there is no ml_pipeline list; fall back to the stage names used as keys
    # in the fixed parameters dictionary so the post-processing below still works.
    if use_fixed_pipeline:
        plugin_names = list(parameters.keys())
    else:
        plugin_names = [p['name'] for p in ml_pipeline]

    write_predictions_to_epgm = bool(set(plugin_names).intersection(set(["inference", "gcn"])))

    if "inference" in plugin_names:
        display_results(all_results)
        params = best_parameters(results=all_results)

        print("\n------------------------------------------\n")
        print("Best set of parameters: ", params)
        print("\n------------------------------------------\n")

        # Now the biggest hack of all:
        # Write the predicted labels to the epgm vertices file.
        # Assume there is only one file with the extension *.pred in the temp directory, read it, and use the
        # inverse vertex map to update the vertices of G_epgm before writing back to disk.
        write_to_epgm(input_epgm, tmp_directory, args.output_dir, G_epgm, iv_map, unique_vertex_labels, args.target_attribute)
    elif "gcn" in plugin_names:
        G_epgm, v_map, iv_map, unique_vertex_labels = convert_from_EPGM(input_epgm, dataset_name, tmp_directory,
                                                                        node_type=args.node_type,
                                                                        target_attribute=args.target_attribute,
                                                                        attributes_to_ignore=args.attributes_to_ignore,
                                                                        write_to_disk=False)

        write_to_epgm(input_epgm,
                      tmp_directory + 'predictions/',
                      args.output_dir,
                      G_epgm,
                      None,
                      None,
                      target_attribute=args.target_attribute)

    # if write_predictions_to_epgm:
    #     if G_epgm is None:
    #         # this is necessary to make write_to_epgm work with results from the GCN plugin. There has to be a
    #         # better way to do this.
    #         G_epgm, v_map, iv_map, unique_vertex_labels = convert_from_EPGM(input_epgm, dataset_name, tmp_directory,
    #                                                                         node_type=args.node_type,
    #                                                                         target_attribute=args.target_attribute,
    #                                                                         attributes_to_ignore=args.attributes_to_ignore,
    #                                                                         write_to_disk=False)
    # # write_to_epgm(input_epgm, tmp_directory, args.output_dir, G_epgm, iv_map, unique_vertex_labels, args.target_attribute)


    print("Scheduler Finished!")