utils.py
'''
Preliminary functions: submission writing, model initialization,
the training/evaluation loop, and precomputed-kernel loading.
'''
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import cvxpy as cp
import warnings
import time
from numba import jit
from functools import partial

warnings.filterwarnings("ignore", category=DeprecationWarning)

from linear_models import LogisticRegressor, RidgeRegressor
from kernel_models import KernelRidgeRegressor, KernelSVM, KernelMKL

def write_csv(ids, labels, filename):
    """
    inputs:
        - ids: list of ids, should be an increasing list of integers
        - labels: list of corresponding labels, in {-1, 1} or {0, 1}
        - filename: string containing the name that should be given to the submission file
    """
    df = pd.DataFrame({"Id": ids, "Bound": labels})
    df["Bound"] = df["Bound"].replace([-1], 0)  # map -1 labels back to 0 for the submission format
    df.to_csv(filename, sep=',', index=False)
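
# Usage sketch (toy values): -1 labels are mapped back to 0 in the written file.
#
#   write_csv(ids=[0, 1, 2], labels=[1, -1, 1], filename="toy_submission.csv")
#   # -> toy_submission.csv contains: Id,Bound / 0,1 / 1,0 / 2,1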

def init_model(model_name, default_params, X_HMM=None, kernel=None, precomputed_kernel=None, use_grid_search=False):
    """
    Initializes a model (and its grid-search parameter ranges) depending on the parameters specified.
    """
    if model_name == 'logreg':
        params = None
        model = LogisticRegressor()
    elif model_name == 'rr':
        params = {'lamb': np.linspace(0.001, 0.1, 20)}
        model = RidgeRegressor()
    elif model_name == 'krr':
        if precomputed_kernel is not None:
            raise NotImplementedError("Using a precomputed kernel is only available for the Kernel SVM.")
        params = {'lamb': np.linspace(0.1, 2, 2), 'sigma': np.linspace(0.5, 2, 20), 'kernel': ['gaussian']}
        model = KernelRidgeRegressor(lamb=default_params['lamb'], sigma=default_params['sigma'], kernel='gaussian')
    elif model_name == 'ksvm':
        if precomputed_kernel is not None:
            # Only lamb can still be tuned: the other hyperparameters were fixed when the kernel was computed
            params = {'lamb': np.logspace(-10., -7., 4)}
            model = KernelSVM(lamb=default_params['lamb'], precomputed_kernel=precomputed_kernel)
        elif kernel == 'gaussian':
            params = {'lamb': np.logspace(-10., -7., 4), 'sigma': np.logspace(-1., 2., 4), 'kernel': ['gaussian']}
            model = KernelSVM(lamb=default_params['lamb'], sigma=default_params['sigma'], kernel='gaussian')
        elif kernel == 'spectrum':
            params = None
            model = KernelSVM(lamb=default_params['lamb'], k=default_params['k'][0], kernel='spectrum')
        elif kernel == 'substring':
            params = None
            model = KernelSVM(lamb=default_params['lamb'], k=default_params['k'][0], kernel='substring')
        elif kernel == 'fisher':
            params = None
            model = KernelSVM(lamb=default_params['lamb'], k=default_params['k'][0], X_HMM=X_HMM, kernel='fisher')
        else:
            raise ValueError(f"Kernel '{kernel}' is not supported for the Kernel SVM.")
    else:
        raise ValueError(f"Model '{model_name}' is not defined.")
    if use_grid_search and params is not None:
        model = GridSearchCV(model, params)
    return model
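
# Usage sketch (hypothetical hyperparameter values): a spectrum-kernel SVM with
# substring length k=6; fit/predict are then used as for any other model here.
#
#   model = init_model('ksvm', {'lamb': 1e-8, 'k': [6]}, kernel='spectrum')
#   model.fit(X_tr, y_tr)
#   predictions = model.predict(X_eval)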

def run_model(model_name,
              data_folder='data',
              prop_test=0.05,
              kernel=None,
              kernel_savefiles=None,
              K=None,
              sequence=False,
              use_grid_search=False,
              default_params=None,
              use_mkl=False,
              mkl_iterations=1,
              seed=42):
    """
    inputs:
        - model_name (str): name of the model used for classification
        - data_folder (str): relative path to the data folder
        - prop_test (float): proportion of examples to use for testing
        - kernel (str): name of the kernel to use
        - kernel_savefiles (list of dict of str): paths to kernels saved as numpy matrices
        - K (list of dict of arrays): kernels already computed for training and evaluation for each dataset
        - sequence (bool): if True, use the data under the sequence form. If False, use precomputed representations.
        - use_grid_search (bool): set to True if you want to use GridSearchCV
        - default_params (dict): hyperparameters used when no grid search is performed;
          defaults to {'lamb': 15, 'sigma': 1.2, 'k': [4, 5, 6]} and must contain 'step' when use_mkl=True
        - use_mkl (bool): if True, combine the kernels in K with Multiple Kernel Learning (requires sequence=True)
        - mkl_iterations (int): number of iterations of the MKL weight optimization
        - seed (int): random seed for the train/test split
    outputs:
        - array with the predictions over the whole evaluation set
        - dict of train/test accuracies for each dataset
    """
    if default_params is None:  # avoid a mutable default argument
        default_params = {'lamb': 15, 'sigma': 1.2, 'k': [4, 5, 6]}
    Nb_samples = 2000  # number of training examples per dataset
    all_y_eval = []
    accuracies = {}
    np.random.seed(seed)
    for name in [0, 1, 2]:
        # Load training / testing sets
        X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep=' ', index_col=False, header=None).to_numpy()
        mean, std = X.mean(axis=0), X.std(axis=0)
        X = (X - mean) / std
        y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
        if sequence:
            df = pd.read_csv(f'{data_folder}/Xtr{name}.csv')
            X = np.array(df['seq'])
        # Load evaluation set
        X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep=' ', index_col=False, header=None).to_numpy()
        X_eval = (X_eval - mean) / std  # normalize with the training statistics
        if sequence:
            df_eval = pd.read_csv(f'{data_folder}/Xte{name}.csv')
            X_eval = np.array(df_eval['seq'])
        y = y["Bound"].to_numpy()
        if kernel is not None:
            y[y == 0] = -1  # kernel methods expect labels in {-1, 1}
        # Random train/test split
        tr_indices = np.random.choice(Nb_samples, size=int((1 - prop_test) * Nb_samples), replace=False)
        te_indices = np.setdiff1d(np.arange(Nb_samples), tr_indices)
        X_tr, X_te = X[tr_indices], X[te_indices]
        y_tr, y_te = y[tr_indices], y[te_indices]
        assert X_tr.shape[0] + X_te.shape[0] == X.shape[0]
        assert y_tr.shape[0] + y_te.shape[0] == y.shape[0]
        if sequence and kernel_savefiles is not None:
            precomputed_kernel = load_precomputed_kernel(df, df_eval,
                                                         kernel_filename_train=kernel_savefiles[name]['train'],
                                                         kernel_filename_eval=kernel_savefiles[name]['eval'])
        elif not use_mkl and sequence and K is not None:
            precomputed_kernel = load_precomputed_kernel(df, df_eval, K_tr=K[name]['train'], K_ev=K[name]['eval'])
        else:
            precomputed_kernel = None
        if use_mkl:
            get_precomputed_kernels = partial(load_precomputed_kernel, df_train=df, df_eval=df_eval)
            model = KernelMKL(lamb=default_params["lamb"],
                              kernels=K[name],
                              get_precomputed_kernels=get_precomputed_kernels,
                              step=default_params["step"],
                              n_iterations=mkl_iterations)
            # Fitting
            model.fit(X_tr, y_tr, tr_indices)
            print(f"Optimal weights for kernels: {model.eta}")
        else:
            model = init_model(model_name,
                               default_params,
                               kernel=kernel,
                               precomputed_kernel=precomputed_kernel,
                               use_grid_search=use_grid_search)
            # Fitting
            model.fit(X_tr, y_tr)
            if use_grid_search:
                print(model.best_params_)
        # Compute each score once, then report and store it
        train_acc = model.score(X_tr, y_tr)
        test_acc = model.score(X_te, y_te)
        print(f"Accuracy on train set {name}: {train_acc:.2f}")
        print(f"Accuracy on test set {name}: {test_acc:.2f}\n")
        accuracies[f"train_{name}"] = train_acc
        accuracies[f"test_{name}"] = test_acc
        # Prediction on the evaluation set
        y_eval = model.predict(X_eval)
        all_y_eval.append(y_eval)
    all_y_eval = np.hstack(all_y_eval).reshape(-1)
    return all_y_eval, accuracies
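
# End-to-end usage sketch (assumes the data/ folder layout read above; the
# hyperparameter values and the output filename are hypothetical):
#
#   y_eval, accs = run_model('ksvm', kernel='spectrum', sequence=True,
#                            default_params={'lamb': 1e-8, 'sigma': 1.2, 'k': [6]})
#   write_csv(ids=list(range(len(y_eval))), labels=y_eval, filename="submission.csv")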

def load_precomputed_kernel(df_train, df_eval,
                            kernel_filename_train=None, kernel_filename_eval=None,
                            K_tr=None, K_ev=None):
    """
    Creates a function that computes the kernel between datapoints by finding their indices
    in the dataframes and reading the corresponding entries of a precomputed kernel matrix.
    The kernel matrices can be given either as .npy files or directly as arrays.
    """
    if kernel_filename_train is not None and kernel_filename_eval is not None:
        with open(kernel_filename_train, "rb") as f:
            K_tr = np.load(f)
        with open(kernel_filename_eval, "rb") as f:
            K_ev = np.load(f)
    elif K_tr is None and K_ev is None:
        raise ValueError("You need to specify a method for loading a preexisting kernel.")

    def precomputed_kernel(X1, X2, **kwargs):
        # Ids are global across datasets, so reduce them modulo the dataset size
        # (2000 training / 1000 evaluation sequences) to index the kernel matrices.
        # The lookups keep the elements in the order in which they are queried.
        idx1 = [df_train[df_train['seq'] == x]['Id'].iloc[0] % 2000 for x in X1]
        if sum(df_train['seq'].isin(X2)) >= len(X2):  # all elements of X2 are in the training set
            idx2 = [df_train[df_train['seq'] == x]['Id'].iloc[0] % 2000 for x in X2]
            K = K_tr[np.ix_(idx1, idx2)]  # extract the submatrix at the correct indices
        elif sum(df_eval['seq'].isin(X2)) >= len(X2):  # all elements of X2 are in the evaluation set
            idx2 = [df_eval[df_eval['seq'] == x]['Id'].iloc[0] % 1000 for x in X2]
            K = K_ev[np.ix_(idx1, idx2)]
        else:
            raise ValueError("X2 contains sequences that are neither in the training nor in the evaluation set.")
        return K

    return precomputed_kernel
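
# A runnable sketch of load_precomputed_kernel on synthetic data (hypothetical
# toy sequences, with stand-in matrices instead of real precomputed kernels):
if __name__ == "__main__":
    df_tr = pd.DataFrame({"Id": [0, 1, 2], "seq": ["ACGT", "CGTA", "GTAC"]})
    df_ev = pd.DataFrame({"Id": [0, 1], "seq": ["TACG", "TTAA"]})
    K_train = np.eye(3)       # stands in for a (train x train) kernel matrix
    K_eval = np.ones((3, 2))  # stands in for a (train x eval) kernel matrix
    kernel_fn = load_precomputed_kernel(df_tr, df_ev, K_tr=K_train, K_ev=K_eval)
    print(kernel_fn(["ACGT", "CGTA"], ["TACG"]))  # 2x1 block read from K_eval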