Skip to content

Commit bd22352

Browse files
committed
+thesis scripts+linear models
1 parent a8a5886 commit bd22352

39 files changed

+10347
-4308
lines changed

SETTINGS.json

+6-6
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
"preprocessor":{
1111
"highcut" : 180,
1212
"lowcut" : 0.1,
13-
"nfreq_bands": 6,
14-
"win_length_sec": 60,
13+
"nfreq_bands": 8,
14+
"win_length_sec": 30,
1515
"features": "meanlog_std",
16-
"stride_sec": 60
16+
"stride_sec": 30
1717
},
1818

1919
"validation":{
@@ -24,16 +24,16 @@
2424

2525
"model":{
2626
"scale_time": 1,
27-
"use_test": 1,
28-
"overlap" : 9,
27+
"use_test": 0,
28+
"overlap" : 10,
2929
"dropout_prob" : [0.3, 0.6],
3030
"training_batch_size" : 10,
3131
"activation" : ["relu", "relu", "tanh"],
3232
"weights_variance" : 0.01,
3333
"l2_reg" : 0.0001,
3434
"recept_width" : [1, 2],
3535
"pool_width" : [1, 1],
36-
"nkerns" : [32, 64, 512],
36+
"nkerns" : [16, 32, 512],
3737
"stride" : [1, 2],
3838
"global_pooling": 1
3939
}

cnn/conv_net.py

+4
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ def train(self, train_set, max_iter):
7777
while not done_looping:
7878
for train_x, train_y in train_set_iterator:
7979
self.train_model(train_x, train_y)
80+
# if iter % 10 == 0:
81+
# self.batch_size.set_value(train_set[0].shape[0])
82+
# print self.validate_model(train_set[0], train_set[1])
83+
# self.batch_size.set_value(self.training_batch_size)
8084
if iter > max_iter:
8185
done_looping = True
8286
break

cnn/predict.py

+31-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import json, shutil, cPickle, os, csv
2+
import numpy as np
3+
import preprocessors.fft as fft
24
from pandas import read_csv
3-
from sklearn.preprocessing import MinMaxScaler
5+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
46
from theano import config
57
from pandas import DataFrame
68
from cnn.conv_net import ConvNet
@@ -11,21 +13,36 @@
1113
config.floatX = 'float32'
1214

1315

14-
def rescale(probability):
16+
def minmax_rescale(probability):
    """Linearly rescale the given values into the range (1e-9, 1 - 1e-9).

    The input is handed to scikit-learn's ``MinMaxScaler.fit_transform``,
    so the scaler is fitted on the very data it transforms.  The bounds
    stop just short of 0 and 1.
    """
    lower, upper = 0.000000001, 0.999999999
    return MinMaxScaler(feature_range=(lower, upper)).fit_transform(probability)
1719

1820

19-
def merge_csv_data(submission_path, subjects, submission_name, scale=True):
20-
submission_name += '_scaled' if scale else ''
21+
def softmax_rescale(probability):
    """Standardize the values, then squash them with a logistic sigmoid.

    Despite the name, no normalisation across entries is performed: each
    standardized value ``z`` is mapped independently to ``1 / (1 + exp(-z))``.
    """
    standardized = StandardScaler().fit_transform(probability)
    denominator = 1.0 + np.exp(-standardized)
    return 1.0 / denominator
24+
25+
26+
def median_scaler(x):
    """Shift ``x`` so that its median maps to 0.5 and halve its spread.

    The median is taken over all elements of ``x`` (no axis is given).
    """
    centred = x - np.median(x)
    return centred / 2.0 + 0.5
28+
29+
30+
def merge_csv_data(submission_path, subjects, submission_name, scale=None):
    """Concatenate the per-subject prediction CSVs into one submission file.

    The merged file is ``<submission_path>/<submission_name><scale>.csv``
    (the scale name is appended directly, without a separator).  It starts
    with a ``clip,preictal`` header row; each subject's rows are appended
    with the clip name prefixed by ``<subject>_``.

    ``scale`` selects an optional per-subject rescaling of the prediction
    column: ``'minmax'``, ``'softmax'`` or ``'median'``.  Any other value
    (including ``None``) leaves the predictions untouched.
    """
    suffix = scale if scale else ''
    merged_csv = submission_path + '/' + submission_name + suffix + '.csv'

    rescalers = {
        'minmax': minmax_rescale,
        'softmax': softmax_rescale,
        'median': median_scaler,
    }
    rescale = rescalers.get(scale) if scale else None

    # (Re)create the merged file containing only the header row.
    with open(merged_csv, 'wb') as f:
        csv.writer(f).writerow(['clip', 'preictal'])

    for subject in subjects:
        df = read_csv(submission_path + '/' + subject + '.csv')
        df['clip'] = [subject + '_' + clip for clip in df['clip']]
        if rescale is not None:
            # Everything except the clip column is passed to the scaler.
            df['preictal'] = rescale(df.drop('clip', axis=1).values)
        with open(merged_csv, 'a') as f:
            df.to_csv(f, header=False, index=False)
3148

@@ -65,16 +82,24 @@ def run_predictor():
6582
data_path = settings_dict['path']['processed_data_path'] + '/' + create_fft_data_name(settings_dict)
6683
submission_path = model_path + '/submission'
6784
print submission_path
85+
86+
if not os.path.exists(data_path):
87+
fft.run_fft_preprocessor()
88+
6889
if not os.path.exists(submission_path):
6990
os.makedirs(submission_path)
7091
shutil.copy2('SETTINGS.json', submission_path + '/SETTINGS.json')
7192

7293
subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']
94+
#subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4']
7395
for subject in subjects:
7496
print '***********************', subject, '***************************'
7597
predict(subject, data_path, model_path, submission_path)
7698

77-
merge_csv_data(submission_path, subjects, submission_name='submission', scale=True)
99+
merge_csv_data(submission_path, subjects, submission_name='submission', scale='minmax')
100+
merge_csv_data(submission_path, subjects, submission_name='submission', scale='softmax')
101+
merge_csv_data(submission_path, subjects, submission_name='submission', scale='median')
102+
merge_csv_data(submission_path, subjects, submission_name='submission')
78103

79104

80105
if __name__ == '__main__':

cnn/runner.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@
88
settings_files = sorted(os.listdir('settings_dir'))
99
for settings_file in settings_files:
1010
shutil.copy2('settings_dir/'+settings_file, os.getcwd() + '/SETTINGS.json')
11-
#run_trainer()
11+
run_trainer()
1212
run_predictor()

linear_regression/SETTINGS.json linear_models/SETTINGS.json

+4-4
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
"raw_data_path" : "/mnt/sda4/CODING/python/kaggle_data/raw_data",
55
"processed_data_path" :"/mnt/sda4/CODING/python/kaggle_data/processed_data",
66
"model_path" : "/mnt/sda4/CODING/python/kaggle_data/models/LR",
7-
"submission_path" : "/mnt/sda4/CODING/python/kaggle_data/submissions/LR"
7+
"submission_path" : "/mnt/sda4/CODING/python/kaggle_data/submissions/linear_models"
88
},
99

1010
"preprocessor":{
1111
"highcut" : 180,
1212
"lowcut" : 0.1,
13-
"nfreq_bands": 67,
13+
"nfreq_bands": 8,
1414
"features": "meanlog",
15-
"win_length_sec" : 30,
16-
"stride_sec" : 15
15+
"win_length_sec" : 60,
16+
"stride_sec" : 30
1717
}
1818
}
File renamed without changes.

linear_models/commons.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import numpy as np
2+
from sklearn.preprocessing import StandardScaler, MinMaxScaler
3+
from pandas import DataFrame
4+
from collections import defaultdict
5+
6+
7+
def print_cm(cm, labels):
    """Print the matrix ``cm`` as a right-aligned text table.

    ``labels`` supplies both the column headers and the row names; every
    cell is padded to the width of the longest label.  ``cm`` is indexed
    as ``cm[i, j]`` and its entries are formatted as integers.
    """
    width = max([len(name) for name in labels])

    # Header line: an empty corner cell, then one cell per label.
    header = [" " * width] + ["%*s" % (width, name) for name in labels]
    print(" ".join(header))

    # One line per row label, followed by that row's counts.
    for row_idx, row_name in enumerate(labels):
        cells = ["%*s" % (width, row_name)]
        for col_idx in range(len(labels)):
            cells.append("%*d" % (width, cm[row_idx, col_idx]))
        print(" ".join(cells))
20+
21+
22+
def load_test_labels(csv_path):
    """Load test-set labels and group them by subject.

    The CSV at ``csv_path`` must have a header row with ``clip`` and
    ``preictal`` columns.  The subject name is taken to be the first two
    underscore-separated tokens of the clip name (e.g. a clip starting
    with ``Dog_1_`` belongs to ``Dog_1``).

    Returns a ``defaultdict`` mapping each subject name to a DataFrame
    with columns ``['clip', 'preictal']`` holding that subject's rows in
    file order.  Looking up a subject that is absent from the file yields
    an empty list (the defaultdict's default), not a DataFrame.
    """
    # Local import: DataFrame.from_csv is deprecated and absent from
    # current pandas, while this module only imports DataFrame at the top.
    from pandas import read_csv

    labels = read_csv(csv_path)

    rows_by_subject = defaultdict(list)
    for clip, preictal in zip(labels['clip'], labels['preictal']):
        subject_name = '_'.join(clip.split('_', 2)[:2])
        rows_by_subject[subject_name].append((clip, preictal))

    subject_to_df = defaultdict(list)
    for subject_name, subject_rows in rows_by_subject.items():
        subject_to_df[subject_name] = DataFrame(subject_rows, columns=['clip', 'preictal'])
    return subject_to_df
35+
36+
37+
def softmax_scaler(x):
    """Map ``x`` into (0, 1): standardize, then apply a logistic sigmoid.

    ``x`` is first passed through ``StandardScaler().fit_transform`` and
    each resulting value ``z`` is then replaced by ``1 / (1 + exp(-z))``.
    Note that this is an element-wise sigmoid, not a softmax over entries.
    """
    z = StandardScaler().fit_transform(x)
    squashed = 1.0 / (1.0 + np.exp(-z))
    return squashed
40+
41+
42+
def minmax_scaler(x):
    """Rescale ``x`` with a MinMaxScaler fitted on ``x`` itself.

    The target range is (1e-9, 1 - 1e-9), i.e. the output never reaches
    exactly 0 or 1.
    """
    target_range = (0.000000001, 0.999999999)
    return MinMaxScaler(feature_range=target_range).fit_transform(x)
45+
46+
47+
def median_scaler(x):
    """Re-centre ``x`` on 0.5 around its global median at half scale.

    Computes ``(x - median(x)) / 2 + 0.5`` element-wise; the median is
    taken over the flattened input.
    """
    offset_from_median = x - np.median(x)
    half_scaled = offset_from_median / 2.0
    return half_scaled + 0.5
49+
50+
51+
def reshape_data(x, y=None):
    """Flatten a 4-D feature array into one row per (example, timestep).

    ``x`` is indexed as ``(example, channel, frequency bin, timestep)``.
    The result is a float64 array of shape
    ``(n_examples * n_timesteps, n_channels * n_fbins)`` where row
    ``e * n_timesteps + t`` holds the features of example ``e`` at
    timestep ``t``, laid out channel-major (column ``c * n_fbins + f``).

    If ``y`` is given, every label is repeated ``n_timesteps`` times so
    it lines up with the new rows, and ``(x_new, y_new)`` is returned;
    otherwise only ``x_new`` is returned.
    """
    n_examples, n_channels, n_fbins, n_timesteps = x.shape[:4]

    # Move the time axis next to the example axis, then merge the two
    # leading and the two trailing axes.
    by_timestep = np.transpose(x, axes=(0, 3, 1, 2))
    x_new = by_timestep.reshape((n_examples * n_timesteps, n_channels * n_fbins))
    x_new = x_new.astype(np.float64)

    if y is None:
        return x_new
    return x_new, np.repeat(y, n_timesteps)
68+

linear_models/filenames.pickle

+9,078
Large diffs are not rendered by default.

linear_models/lda.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import numpy as np
2+
import json
3+
import os
4+
5+
from pandas import DataFrame
6+
from sklearn.preprocessing import StandardScaler
7+
import matplotlib.pyplot as plt
8+
from sklearn.lda import LDA
9+
10+
import preprocessors.fft as fft
11+
from utils.loader import load_test_data, load_train_data
12+
from utils.config_name_creator import *
13+
from merger import merge_csv_files
14+
from commons import reshape_data
15+
from commons import load_test_labels
16+
17+
18+
def train(subject, data_path, plot=False):
    """Fit an LDA classifier for one subject and return it with its scaler.

    The training data returned by ``load_train_data`` is flattened with
    ``reshape_data`` (one row per example/timestep, features laid out
    channel-major), standardized, and used to fit an ``LDA`` model.

    Parameters:
        subject: subject name passed to ``load_train_data``; also used as
            the figure title when plotting.
        data_path: directory holding the preprocessed data.
        plot: when true, show one bar chart per channel with the absolute
            LDA weight of every frequency bin.

    Returns:
        ``(lda, data_scaler, coefs)`` where ``coefs`` is the weight vector
        reshaped to ``(n_channels, n_fbins)``.
    """
    d = load_train_data(data_path, subject)
    x, y = d['x'], d['y']
    # assumes y holds 0/1 labels with 1 = preictal -- TODO confirm
    print('n_preictal %s' % np.sum(y))
    # Count the zeros; the previous ``np.sum(y - 1)`` printed the negated count.
    print('n_interictal %s' % np.sum(1 - y))
    n_channels = x.shape[1]
    n_fbins = x.shape[2]

    x, y = reshape_data(x, y)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)

    lda = LDA()
    lda.fit(x, y)
    coef = lda.scalings_ * lda.coef_[:1].T

    if plot:
        # Flatten once so slicing yields 1-D bar heights, and compute the
        # shared y-limit a single time instead of once per subplot.
        abs_coef = np.abs(coef).ravel()
        max_y = abs_coef.max() + 0.01
        n_cols = 6 if n_channels == 24 else 4

        fig = plt.figure()
        for i in range(n_channels):
            # Subplot positions are 1-based.
            fig.add_subplot(4, n_cols, i + 1)
            ax = plt.gca()
            ax.set_xlim([0, n_fbins])
            ax.set_xticks(np.arange(0.5, n_fbins + 0.5, 1))
            ax.set_xticklabels(np.arange(0, n_fbins))
            ax.set_ylim([0, max_y])
            ax.set_yticks(np.around(np.arange(0, max_y, max_y / 4.0), decimals=1))
            for label in (ax.get_xticklabels() + ax.get_yticklabels()):
                label.set_fontsize(15)
            # Weights of channel i occupy one contiguous run of n_fbins entries.
            plt.bar(range(0, n_fbins), abs_coef[i * n_fbins:(i + 1) * n_fbins])
        fig.suptitle(subject, fontsize=20)
        plt.show()

    coefs = np.reshape(coef, (n_channels, n_fbins))
    return lda, data_scaler, coefs
61+
62+
63+
def predict(subject, model, data_scaler, data_path, submission_path, test_labels, opt_threshold_train=None):
    """Predict preictal probabilities for a subject's test clips and save them.

    The test data is flattened with ``reshape_data`` (one row per
    clip/timestep), scaled with the already-fitted ``data_scaler`` and
    scored with ``model.predict_proba``.  The class-1 probabilities of all
    timesteps belonging to a clip are averaged into one value per clip, and
    the result is written to ``<submission_path>/<subject>.csv`` with the
    columns ``clip`` and ``preictal``.

    ``test_labels`` and ``opt_threshold_train`` are accepted for interface
    compatibility but are not used here.  ``opt_threshold_train`` defaults
    to ``None`` because the caller in this module does not supply it.
    """
    d = load_test_data(data_path, subject)
    x_test, clip_ids = d['x'], d['id']
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]

    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)

    # One probability per (clip, timestep) row.
    pred_per_step = model.predict_proba(x_test)[:, 1]

    # Rows are grouped clip by clip, so a plain reshape recovers the
    # (clip, timestep) grid before averaging over time.
    pred_per_clip = np.reshape(pred_per_step, (n_test_examples, n_timesteps))
    pred_per_clip = np.mean(pred_per_clip, axis=1)

    # Materialise the pairs: a bare zip is a one-shot iterator on Python 3.
    ans = list(zip(clip_ids, pred_per_clip))
    df = DataFrame(data=ans, columns=['clip', 'preictal'])
    df.to_csv(submission_path + '/' + subject + '.csv', index=False, header=True)
79+
80+
81+
def run_trainer():
    """Train and evaluate one LDA model per subject, then merge the outputs.

    Reads ``SETTINGS.json`` from the current directory, runs the FFT
    preprocessor if the processed-data directory does not exist yet,
    creates the submission directory when needed, and for every subject
    trains a model and writes its test predictions.  Finally the
    per-subject files are merged into the four submission variants.
    """
    with open('SETTINGS.json') as f:
        settings_dict = json.load(f)

    data_path = settings_dict['path']['processed_data_path'] + '/' + create_fft_data_name(settings_dict)
    submission_path = settings_dict['path']['submission_path'] + '/LDA_' + create_fft_data_name(settings_dict)
    print(data_path)

    if not os.path.exists(data_path):
        fft.run_fft_preprocessor()

    if not os.path.exists(submission_path):
        os.makedirs(submission_path)

    # NOTE(review): machine-specific absolute path; consider moving it
    # into SETTINGS.json next to the other paths.
    test_labels_path = '/mnt/sda4/CODING/python/kaggle_data/test_labels.csv'
    test_labels = load_test_labels(test_labels_path)

    subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']
    for subject in subjects:
        print('*********************** %s ***************************' % subject)
        model, data_scaler, _ = train(subject, data_path)
        # predict() declares a seventh parameter (opt_threshold_train) that
        # was never passed here; supply it explicitly so the call matches
        # the signature instead of raising TypeError.
        predict(subject, model, data_scaler, data_path, submission_path,
                test_labels[subject]['preictal'], opt_threshold_train=None)

    for submission_name in ('submission', 'submission_softmax',
                            'submission_minmax', 'submission_median'):
        merge_csv_files(submission_path, subjects, submission_name)
110+
111+
112+
if __name__ == '__main__':
113+
run_trainer()

0 commit comments

Comments
 (0)