simulation_utils.py
"""
Utility functions for simulations.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import json
from sklearn.model_selection import ParameterGrid
import pathlib
import numpy as np
import pandas as pd
import glob
import os
from active_learning_dd.utils.evaluation import eval_on_metrics
from active_learning_dd.database_loaders.prepare_loader import prepare_loader
"""
Helper function to return the maximum achievable hits, cluster hits, and novel
cluster hits from the unlabeled data. Note this is only used in the simulation,
since in a real screening scenario these values are unknown.
"""
def get_unlabeled_maxes(training_loader_params,
                        unlabeled_loader_params,
                        task_names,
                        batch_size):
    if not isinstance(task_names, list):
        task_names = [task_names]
    # load loaders
    training_loader = prepare_loader(data_loader_params=training_loader_params,
                                     task_names=task_names)
    unlabeled_loader = prepare_loader(data_loader_params=unlabeled_loader_params,
                                      task_names=task_names)
    # remove already labeled data
    unlabeled_loader.drop_duplicates_via_smiles(training_loader.get_smiles())
    # now get labels and clusters
    y_unlabeled = unlabeled_loader.get_labels()
    unlabeled_clusters = unlabeled_loader.get_clusters()
    training_clusters = training_loader.get_clusters()
    max_hits_list = np.sum(y_unlabeled, axis=0)
    max_hits_list = [min(batch_size, actives_count) for actives_count in max_hits_list]
    max_cluster_hits_list = [0 for _ in range(len(task_names))]
    max_novel_hits_list = [0 for _ in range(len(task_names))]
    for ti in range(len(task_names)):
        # Get the clusters with actives
        active_indices = np.where(y_unlabeled[:, ti] == 1)[0]
        clusters_with_actives_ti = unlabeled_clusters[active_indices]
        unique_clusters_with_actives_ti = np.unique(clusters_with_actives_ti)
        max_cluster_hits_list[ti] = min(batch_size,
                                        unique_clusters_with_actives_ti.shape[0])
        novel_clusters_with_actives = np.setdiff1d(unique_clusters_with_actives_ti,
                                                   training_clusters)
        max_novel_hits_list[ti] = min(batch_size,
                                      novel_clusters_with_actives.shape[0])
    return max_hits_list, max_cluster_hits_list, max_novel_hits_list
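# Illustrative usage sketch; the loader parameter dicts are placeholders whose exact
# keys depend on what prepare_loader expects (not shown in this module):
#
#   max_hits, max_cluster_hits, max_novel_hits = get_unlabeled_maxes(
#       training_loader_params=pipeline_config['training_data_params'],
#       unlabeled_loader_params=pipeline_config['unlabeled_data_params'],
#       task_names=['task_0'],
#       batch_size=96)
#
# Each returned list has one entry per task, capped at batch_size: the best-case
# counts of hits, unique active clusters, and novel active clusters (clusters not
# already present in the training data) that a single selected batch could achieve.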
"""
Randomly samples a single parameter set from the given parameter grid.
Assumes a uniform distribution: samples an index in the range [0, total_num_parameter_sets).
"""
def get_random_params_int_based(nbs_config,
                                rnd_seed=0):
    # pop the batch_size, since we want to simulate all batch sizes for this param set
    next_batch_selector_params = nbs_config["next_batch_selector_params"]
    batch_sizes = next_batch_selector_params.pop("batch_size", None)
    # sample random param
    param_grid = SimulationParameterGrid(next_batch_selector_params)
    np.random.seed(rnd_seed)
    param_idx = np.random.randint(len(param_grid), size=1, dtype='int64')[0]
    next_batch_selector_params = param_grid[param_idx]
    next_batch_selector_params["batch_size"] = batch_sizes
    return next_batch_selector_params
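# Illustrative sketch of the expected nbs_config shape (keys and values below are
# hypothetical placeholders used only to show the structure):
#
#   nbs_config = {"next_batch_selector_params": {"class": ["SomeSelectorClass"],
#                                                "batch_size": [96, 384],
#                                                "exploration_strategy": ["weighted"],
#                                                "exploration_beta": [0.0, 0.5, 1.0]}}
#   sampled_params = get_random_params_int_based(nbs_config, rnd_seed=123)
#
# "batch_size" is popped before sampling and re-attached unchanged, so the same
# sampled parameter set can be simulated at every batch size; every other key is
# collapsed to a single randomly chosen value via SimulationParameterGrid.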
"""
Randomly samples a parameter set using the per-parameter distributions
given in the config file.
If use_uniform=True, each parameter is instead sampled uniformly.
"""
def get_param_from_dist(nbs_config,
                        rnd_seed=0,
                        use_uniform=False,
                        exploration_strategy='weighted'):
    nbs_params = nbs_config["next_batch_selector_params"]
    nbs_params_probas = nbs_config["nbs_params_probas"]
    # sample random param
    np.random.seed(rnd_seed)
    sorted_params = sorted(nbs_params_probas.keys())
    if exploration_strategy not in nbs_params["exploration_strategy"]:
        raise ValueError('Given exploration strategy not supported in config file.')
    nbs_params["exploration_strategy"] = exploration_strategy
    if exploration_strategy == 'random' or exploration_strategy == 'dissimilar':
        for removable_param, default_value in [('exploration_use_quantile_for_weight', False),
                                               ('exploration_weight_threshold', 0.0),
                                               ('exploration_beta', 0.0),
                                               ('exploration_dissimilarity_lambda', 0.0)]:
            nbs_params[removable_param] = default_value
            sorted_params.remove(removable_param)
    while len(sorted_params) > 0:
        param = sorted_params.pop()
        param_choices = np.array(nbs_params[param])
        param_probas = nbs_params_probas[param]
        if param_choices.ndim > 1:
            param_choices = param_choices.flatten()
        if use_uniform:
            param_probas = [1.0/len(param_choices) for _ in range(len(param_choices))]  # discrete uniform sampling
        param_sampled_choice = np.random.choice(param_choices, size=1, p=param_probas)[0]
        # modify nbs_params dict with sampled choice
        nbs_params[param] = param_sampled_choice
    nbs_params["class"] = nbs_params["class"][0]
    return nbs_params
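# Illustrative sketch of a config with per-parameter sampling probabilities
# (hypothetical keys/values, shown only for the expected shapes):
#
#   nbs_config = {
#       "next_batch_selector_params": {"class": ["SomeSelectorClass"],
#                                      "exploration_strategy": ["weighted", "random"],
#                                      "exploration_beta": [0.0, 0.5, 1.0]},
#       "nbs_params_probas": {"exploration_beta": [0.2, 0.3, 0.5]}
#   }
#   sampled = get_param_from_dist(nbs_config, rnd_seed=7, exploration_strategy='weighted')
#
# Note that nbs_config["next_batch_selector_params"] is modified in place: every key
# listed in nbs_params_probas is replaced by a single sampled value, and "class" is
# collapsed to its first (only) entry.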
"""
Evaluates the selected batch and writes per-iteration metrics, assuming all
selected compounds are predicted active/hits.
"""
def evaluate_selected_batch(exploitation_df, exploration_df,
                            exploitation_array, exploration_array,
                            params_set_results_dir,
                            pipeline_config,
                            iter_num,
                            batch_size,
                            total_selection_time,
                            add_mean_medians=False):
    w_novelty = pipeline_config['common']['metrics_params']['w_novelty']
    perc_vec = pipeline_config['common']['metrics_params']['perc_vec']
    task_names = pipeline_config['common']['task_names']
    cost_col_name = pipeline_config['unlabeled_data_params']['cost_col_name']
    iter_results_dir = params_set_results_dir+'/'+pipeline_config['common']['iter_results_dir'].format(iter_num)
    eval_dest_file = iter_results_dir+'/'+pipeline_config['common']['eval_dest_file']
    pathlib.Path(eval_dest_file).parent.mkdir(parents=True, exist_ok=True)
    cols_names = task_names
    if add_mean_medians:
        cols_names = cols_names+['Mean', 'Median']
    # retrieve max_hits_list, max_cluster_hits_list of the unlabeled data for this iteration
    max_hits_list, max_cluster_hits_list, max_novel_hits_list = get_unlabeled_maxes(training_loader_params=pipeline_config['training_data_params'],
                                                                                    unlabeled_loader_params=pipeline_config['unlabeled_data_params'],
                                                                                    task_names=task_names,
                                                                                    batch_size=batch_size)
    train_clusters = prepare_loader(data_loader_params=pipeline_config['training_data_params'],
                                    task_names=task_names).get_clusters()
    exploitation_batch_size, exploitation_batch_cost = 0, 0
    if exploitation_df is not None:
        exploitation_df.to_csv(iter_results_dir+'/'+pipeline_config['common']['batch_csv'].format('exploitation'),
                               index=False)
        exploitation_metrics_mat, metrics_names = eval_on_metrics(exploitation_df[task_names].values, np.ones_like(exploitation_df[task_names].values),
                                                                  train_clusters, exploitation_array[:, 1],
                                                                  max_hits_list, max_cluster_hits_list, max_novel_hits_list,
                                                                  add_mean_medians, w_novelty, perc_vec)
        exploitation_batch_size = exploitation_df[task_names].shape[0]
        try:
            exploitation_costs = exploitation_df[cost_col_name].values.astype(float)
        except Exception:
            exploitation_costs = np.ones(shape=(exploitation_df.shape[0],))
        exploitation_batch_cost = np.sum(exploitation_costs)
    else:
        exploitation_metrics_mat, metrics_names = eval_on_metrics(None, None,
                                                                  train_clusters, None,
                                                                  max_hits_list, max_cluster_hits_list, max_novel_hits_list,
                                                                  add_mean_medians, w_novelty, perc_vec)
    exploration_batch_size, exploration_batch_cost = 0, 0
    if exploration_df is not None:
        exploration_df.to_csv(iter_results_dir+'/'+pipeline_config['common']['batch_csv'].format('exploration'),
                              index=False)
        exploration_metrics_mat, metrics_names = eval_on_metrics(exploration_df[task_names].values, np.ones_like(exploration_df[task_names].values),
                                                                 train_clusters, exploration_array[:, 1],
                                                                 max_hits_list, max_cluster_hits_list, max_novel_hits_list,
                                                                 add_mean_medians, w_novelty, perc_vec)
        exploration_batch_size = exploration_df[task_names].shape[0]
        try:
            exploration_costs = exploration_df[cost_col_name].values.astype(float)
        except Exception:
            exploration_costs = np.ones(shape=(exploration_df.shape[0],))
        exploration_batch_cost = np.sum(exploration_costs)
    else:
        exploration_metrics_mat, metrics_names = eval_on_metrics(None, None,
                                                                 train_clusters, None,
                                                                 max_hits_list, max_cluster_hits_list, max_novel_hits_list,
                                                                 add_mean_medians, w_novelty, perc_vec)
    # record rest of metrics
    exploitation_metrics_mat = np.vstack([exploitation_metrics_mat, [[exploitation_batch_size], [exploitation_batch_cost]]])
    exploration_metrics_mat = np.vstack([exploration_metrics_mat, [[exploration_batch_size], [exploration_batch_cost]]])
    # construct exploitation + exploration metrics
    total_df = pd.concat([exploitation_df, exploration_df])
    if (exploitation_df is not None) and (exploration_df is not None):
        total_array = np.vstack([exploitation_array, exploration_array])
    elif (exploitation_df is not None) and (exploration_df is None):
        total_array = exploitation_array
    elif (exploitation_df is None) and (exploration_df is not None):
        total_array = exploration_array
    else:
        raise ValueError('Error in evaluating batch: total selection array is empty.')
    total_metrics_mat, metrics_names = eval_on_metrics(total_df[task_names].values, np.ones_like(total_df[task_names].values),
                                                       train_clusters, total_array[:, 1],
                                                       max_hits_list, max_cluster_hits_list, max_novel_hits_list,
                                                       add_mean_medians, w_novelty, perc_vec)
    metrics_names = metrics_names + ['batch_size', 'batch_cost']
    total_batch_size = exploitation_batch_size + exploration_batch_size
    try:
        total_batch_cost = total_df[cost_col_name].values.astype(float)
    except Exception:
        total_batch_cost = np.ones(shape=(total_df.shape[0],))
    total_batch_cost = np.sum(total_batch_cost)
    total_metrics_mat = np.vstack([total_metrics_mat, [[total_batch_size], [total_batch_cost]]])
    total_cherry_picking_time = total_batch_size * pipeline_config['common']['cherry_picking_time_per_cpd']
    screening_time_per_batch = pipeline_config['common']['screening_time_per_batch']
    total_screening_time = total_cherry_picking_time + screening_time_per_batch
    metrics_mat = np.vstack([exploitation_metrics_mat, exploration_metrics_mat, total_metrics_mat,
                             [[total_cherry_picking_time]], [[screening_time_per_batch]], [[total_screening_time]]])
    metrics_names = ['exploitation_'+m for m in metrics_names] + \
                    ['exploration_'+m for m in metrics_names] + \
                    ['total_'+m for m in metrics_names] + \
                    ['total_cherry_picking_time', 'screening_time_per_batch', 'total_screening_time']
    # save to destination
    metrics_df = pd.DataFrame(data=metrics_mat,
                              columns=[iter_num],
                              index=metrics_names).T
    metrics_df.index.name = 'iter_num'
    metrics_df.to_csv(eval_dest_file, index=True)
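# The CSV written above contains a single row indexed by iter_num. Its columns are the
# metric names returned by eval_on_metrics, prefixed with 'exploitation_',
# 'exploration_', and 'total_' (each group also carrying 'batch_size' and 'batch_cost'),
# followed by 'total_cherry_picking_time', 'screening_time_per_batch', and
# 'total_screening_time'.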
"""
Summarizes simulation evaluation results by aggregating per-iteration metrics
into a single summary file.
"""
def summarize_simulation(params_set_results_dir,
                         pipeline_config):
    summary_dest_file = params_set_results_dir+'/'+pipeline_config['common']['summary_dest_file']
    pathlib.Path(summary_dest_file).parent.mkdir(parents=True, exist_ok=True)
    metrics_df_list = []
    iter_dirs = glob.glob(params_set_results_dir+'/*/')
    for i in range(len(iter_dirs)):
        iter_d = params_set_results_dir+'/'+pipeline_config['common']['iter_results_dir'].format(i)
        eval_dest_file = iter_d+'/'+pipeline_config['common']['eval_dest_file']
        if not os.path.exists(eval_dest_file):
            print(eval_dest_file, '\nDoes not exist.')
        else:
            metrics_df_list.append(pd.read_csv(eval_dest_file))
    metrics_df_concat = pd.concat(metrics_df_list)
    metrics_ordering = [m for m in metrics_df_concat.columns if 'ratio' not in m or 'exploration' in m] + \
                       [m for m in metrics_df_concat.columns if 'ratio' in m and 'exploration' not in m]
    summary_df = pd.concat([metrics_df_concat[[m for m in metrics_df_concat.columns if 'ratio' not in m or 'exploration' in m]].sum(),
                            metrics_df_concat[[m for m in metrics_df_concat.columns if 'ratio' in m and 'exploration' not in m]].mean()]).to_frame().T
    summary_df.iloc[-1, 0] = 9999
    summary_df = pd.concat([metrics_df_concat[metrics_ordering], summary_df])
    summary_df.to_csv(summary_dest_file, index=False)
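# Aggregation note: columns without 'ratio' in their name, plus all exploration
# columns, are summed across iterations, while the remaining ratio metrics are
# averaged; the aggregate row is appended after the per-iteration rows with 9999
# as a sentinel iter_num.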
class SimulationParameterGrid(ParameterGrid):
    """
    Custom parameter grid class due to sklearn's ParameterGrid restriction to int32.
    """
    def __getitem__(self, ind):
        """
        Same as sklearn's ParameterGrid.__getitem__ but uses np.product(sizes, dtype='int64').
        """
        # This is used to make discrete sampling without replacement memory
        # efficient.
        for sub_grid in self.param_grid:
            # XXX: could memoize information used here
            if not sub_grid:
                if ind == 0:
                    return {}
                else:
                    ind -= 1
                    continue
            # Reverse so most frequent cycling parameter comes first
            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
            sizes = [len(v_list) for v_list in values_lists]
            total = np.product(sizes, dtype='int64')
            if ind >= total:
                # Try the next grid
                ind -= total
            else:
                out = {}
                for key, v_list, n in zip(keys, values_lists, sizes):
                    ind, offset = divmod(ind, n)
                    out[key] = v_list[offset]
                return out
        raise IndexError('SimulationParameterGrid index out of range')
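# Minimal self-check sketch for SimulationParameterGrid (hypothetical grid values;
# guarded so it only runs when this module is executed directly):
if __name__ == '__main__':
    _demo_grid = SimulationParameterGrid({'alpha': [0.1, 0.5, 1.0],
                                          'batch_size': [96, 384]})
    print('total parameter sets:', len(_demo_grid))    # 3 * 2 = 6
    print('parameter set at index 4:', _demo_grid[4])  # indexed with int64-safe arithmetic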