IraKorshunova
diff --git a/‎SETTINGS.json
+6-6 b/‎SETTINGS.json
+6-6
diff --git a/‎cnn/conv_net.py
+4 b/‎cnn/conv_net.py
+4
diff --git a/‎cnn/predict.py
+31-6 b/‎cnn/predict.py
+31-6
diff --git a/‎cnn/runner.py
+1-1 b/‎cnn/runner.py
+1-1
diff --git a/‎linear_regression/SETTINGS.json ‎linear_models/SETTINGS.json
+4-4 b/‎linear_regression/SETTINGS.json ‎linear_models/SETTINGS.json
+4-4
diff --git a/‎linear_regression/__init__.py ‎linear_models/__init__.py b/‎linear_regression/__init__.py ‎linear_models/__init__.py
diff --git a/‎linear_models/commons.py
+68 b/‎linear_models/commons.py
+68
diff --git a/‎linear_models/filenames.pickle
+9,078 b/‎linear_models/filenames.pickle
+9,078
diff --git a/‎linear_models/lda.py
+113 b/‎linear_models/lda.py
+113
@@ -10,10 +10,10 @@
     "preprocessor":{
         "highcut" : 180,
         "lowcut" : 0.1,
-        "nfreq_bands": 6,
-        "win_length_sec": 60,
+        "nfreq_bands": 8,
+        "win_length_sec": 30,
         "features": "meanlog_std",
-        "stride_sec": 60
+        "stride_sec": 30
     },
 
     "validation":{
@@ -24,16 +24,16 @@
 
     "model":{
         "scale_time": 1,
-        "use_test": 1,
-        "overlap" : 9,
+        "use_test": 0,
+        "overlap" : 10,
         "dropout_prob" : [0.3, 0.6],
         "training_batch_size" : 10,
         "activation" : ["relu", "relu", "tanh"],
         "weights_variance" : 0.01,
         "l2_reg" : 0.0001,
         "recept_width" : [1, 2],
         "pool_width" : [1, 1],
-        "nkerns" : [32, 64, 512],
+        "nkerns" : [16, 32, 512],
         "stride" : [1, 2],
         "global_pooling": 1
     }
 
@@ -77,6 +77,10 @@ def train(self, train_set, max_iter):
         while not done_looping:
             for train_x, train_y in train_set_iterator:
                 self.train_model(train_x, train_y)
+                # if iter % 10 == 0:
+                #     self.batch_size.set_value(train_set[0].shape[0])
+                #     print self.validate_model(train_set[0], train_set[1])
+                #     self.batch_size.set_value(self.training_batch_size)
                 if iter > max_iter:
                     done_looping = True
                     break
 
@@ -1,6 +1,8 @@
 import json, shutil, cPickle, os, csv
+import numpy as np
+import preprocessors.fft as fft
 from pandas import read_csv
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 from theano import config
 from pandas import DataFrame
 from cnn.conv_net import ConvNet
@@ -11,21 +13,36 @@
 config.floatX = 'float32'
 
 
-def rescale(probability):
+def minmax_rescale(probability):
+    """Linearly rescale the predictions into (1e-9, 1 - 1e-9).
+
+    The MinMaxScaler is fitted on ``probability`` itself, so the smallest and
+    largest values of each column land on the two ends of that range.
+    """
     scaler = MinMaxScaler(feature_range=(0.000000001, 0.999999999))
     return scaler.fit_transform(probability)
 
 
-def merge_csv_data(submission_path, subjects, submission_name, scale=True):
-    submission_name += '_scaled' if scale else ''
+def softmax_rescale(probability):
+    """Standardise the predictions, then squash them with a logistic sigmoid.
+
+    Despite the name this is not a softmax: every value is mapped on its own
+    through 1 / (1 + exp(-z)), where z is the value after a StandardScaler
+    fitted on ``probability`` itself has been applied.
+    """
+    norm_x = StandardScaler().fit_transform(probability)
+    return 1.0 / (1.0 + np.exp(-norm_x))
+
+
+def median_scaler(x):
+    """Move the median of ``x`` to 0.5 and halve the spread around it.
+
+    The median is taken over all elements of ``x`` (np.median with no axis).
+    NOTE(review): nothing is clipped, so the result is guaranteed to stay in
+    [0, 1] only when ``x`` already lies in [0, 1].
+    """
+    return (x - np.median(x))/2.0 + 0.5
+
+
+def merge_csv_data(submission_path, subjects, submission_name, scale=None):
+    """Concatenate the per-subject prediction CSVs into one submission file.
+
+    submission_path -- directory holding the ``<subject>.csv`` files; the
+                       merged file is written into the same directory.
+    subjects        -- subject names, merged in the order given.
+    submission_name -- base name of the output file, without ``.csv``.
+    scale           -- None to keep the raw values, or 'minmax', 'softmax' or
+                       'median' to rescale each subject's predictions
+                       separately. Any other value leaves them unscaled.
+    """
+    # The scale name is appended as-is, with no separator
+    # (e.g. 'submission' + 'minmax' gives 'submissionminmax').
+    submission_name += scale if scale else ''
 
+    # Start a fresh output file that contains only the header row.
     with open(submission_path + '/' + submission_name + '.csv', 'wb') as f:
         writer = csv.writer(f)
         writer.writerow(['clip', 'preictal'])
 
     for subject in subjects:
         df = read_csv(submission_path + '/' + subject + '.csv')
-        df['preictal'] = rescale(df.drop('clip', axis=1).values)
+        # Prefix every clip id with the subject it belongs to.
+        df['clip'] = [subject+'_'+i for i in df['clip']]
+        if scale=='minmax':
+            df['preictal'] = minmax_rescale(df.drop('clip', axis=1).values)
+        elif scale =='softmax':
+            df['preictal'] = softmax_rescale(df.drop('clip', axis=1).values)
+        elif scale =='median':
+            df['preictal'] = median_scaler(df.drop('clip', axis=1).values)
+        # Append this subject's rows below what has been written so far.
         with open(submission_path + '/' + submission_name + '.csv', 'a') as f:
             df.to_csv(f, header=False, index=False)
 
@@ -65,16 +82,24 @@ def run_predictor():
     data_path = settings_dict['path']['processed_data_path'] + '/' + create_fft_data_name(settings_dict)
     submission_path = model_path + '/submission'
     print submission_path
+
+    if not os.path.exists(data_path):
+        fft.run_fft_preprocessor()
+
     if not os.path.exists(submission_path):
         os.makedirs(submission_path)
     shutil.copy2('SETTINGS.json', submission_path + '/SETTINGS.json')
 
     subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']
+    #subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4']
     for subject in subjects:
         print '***********************', subject, '***************************'
         predict(subject, data_path, model_path, submission_path)
 
-    merge_csv_data(submission_path, subjects, submission_name='submission', scale=True)
+    merge_csv_data(submission_path, subjects, submission_name='submission', scale='minmax')
+    merge_csv_data(submission_path, subjects, submission_name='submission', scale='softmax')
+    merge_csv_data(submission_path, subjects, submission_name='submission', scale='median')
+    merge_csv_data(submission_path, subjects, submission_name='submission')
 
 
 if __name__ == '__main__':
 
@@ -8,5 +8,5 @@
     settings_files = sorted(os.listdir('settings_dir'))
     for settings_file in settings_files:
         shutil.copy2('settings_dir/'+settings_file, os.getcwd() + '/SETTINGS.json')
-        #run_trainer()
+        run_trainer()
         run_predictor()
@@ -4,15 +4,15 @@
         "raw_data_path" : "/mnt/sda4/CODING/python/kaggle_data/raw_data",
         "processed_data_path" :"/mnt/sda4/CODING/python/kaggle_data/processed_data",
         "model_path" : "/mnt/sda4/CODING/python/kaggle_data/models/LR",
-        "submission_path" : "/mnt/sda4/CODING/python/kaggle_data/submissions/LR"
+        "submission_path" : "/mnt/sda4/CODING/python/kaggle_data/submissions/linear_models"
     },
 
     "preprocessor":{
         "highcut" : 180,
         "lowcut" : 0.1,
-        "nfreq_bands": 67,
+        "nfreq_bands": 8,
         "features": "meanlog",
-        "win_length_sec" : 30,
-        "stride_sec" : 15
+        "win_length_sec" : 60,
+        "stride_sec" : 30
     }
 }
@@ -0,0 +1,68 @@
+import numpy as np
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from pandas import DataFrame
+from collections import defaultdict
+
+
+def print_cm(cm, labels):
+    """Print the square matrix ``cm`` as a right-aligned text table.
+
+    cm     -- 2-D array indexed as cm[i, j]; entries are formatted with %d.
+    labels -- one label string per row/column; the longest label sets the
+              width of every column.
+
+    NOTE(review): presumably a confusion matrix with true classes as rows --
+    confirm against the caller.
+    """
+    columnwidth = max([len(x) for x in labels])
+    # Print header
+    # (the blank cell pads the corner above the row labels)
+    print " " * columnwidth,
+    for label in labels:
+        print "%{0}s".format(columnwidth) % label,
+    print
+    # Print rows
+    # (row label first, then one cell per column)
+    for i, label1 in enumerate(labels):
+        print "%{0}s".format(columnwidth) % label1,
+        for j in range(len(labels)):
+            print "%{0}d".format(columnwidth) % cm[i, j],
+        print
+
+
+def load_test_labels(csv_path):
+    """Read a labels CSV and split its rows by subject.
+
+    The file must have 'clip' and 'preictal' columns. Returns a mapping from
+    subject name to a DataFrame with those two columns.
+
+    The returned object is still a defaultdict(list), so looking up a subject
+    that is not in the file yields an empty list instead of raising KeyError.
+    """
+    subject_to_df = defaultdict(list)
+    d = DataFrame.from_csv(csv_path, index_col=None)
+    for i in d.index:
+        clip = d['clip'][i]
+        preictal = d['preictal'][i]
+
+        # The subject name is the first two '_'-separated tokens of the clip
+        # name, e.g. 'Dog_1_...' -> 'Dog_1'.
+        subject_name = '_'.join(clip.split('_', 2)[:2])
+        subject_to_df[subject_name].append((clip, preictal))
+
+    # Replace each list of (clip, preictal) tuples by a DataFrame; only
+    # existing keys are reassigned, so the dict size does not change while
+    # iterating.
+    for subject_name, subject_data in subject_to_df.iteritems():
+        subject_to_df[subject_name] = DataFrame(subject_data, columns=['clip', 'preictal'])
+    return subject_to_df
+
+
+def softmax_scaler(x):
+    """Standardise ``x``, then squash it with a logistic sigmoid.
+
+    Despite the name this is not a softmax: every value is mapped on its own
+    through 1 / (1 + exp(-z)), where z is the value after a StandardScaler
+    fitted on ``x`` itself has been applied.
+    """
+    norm_x = StandardScaler().fit_transform(x)
+    return 1.0 / (1.0 + np.exp(-norm_x))
+
+
+def minmax_scaler(x):
+    """Linearly rescale ``x`` into (1e-9, 1 - 1e-9).
+
+    The MinMaxScaler is fitted on ``x`` itself, so the smallest and largest
+    values of each column land on the two ends of that range.
+    """
+    scaler = MinMaxScaler(feature_range=(0.000000001, 0.999999999))
+    return scaler.fit_transform(x)
+
+
+def median_scaler(x):
+    """Move the median of ``x`` to 0.5 and halve the spread around it.
+
+    The median is taken over all elements of ``x`` (np.median with no axis).
+    NOTE(review): nothing is clipped, so the result is guaranteed to stay in
+    [0, 1] only when ``x`` already lies in [0, 1].
+    """
+    return (x - np.median(x)) / 2.0 + 0.5
+
+
+def reshape_data(x, y=None):
+    """Flatten 4-D data into one feature row per time step.
+
+    x -- array indexed as (example, channel, fbin, timestep).
+    y -- optional per-example labels.
+
+    Returns a float64 array of shape
+    (n_examples * n_timesteps, n_channels * n_fbins). Row e * n_timesteps + t
+    holds time step t of example e, and its columns are ordered channel by
+    channel (column c * n_fbins + f is channel c, bin f). When ``y`` is given,
+    a tuple (x_new, y_new) is returned, where every label is repeated
+    n_timesteps times so that it lines up with the rows of x_new.
+    """
+    n_examples = x.shape[0]
+    n_channels = x.shape[1]
+    n_fbins = x.shape[2]
+    n_timesteps = x.shape[3]
+    x_new = np.zeros((n_examples * n_timesteps, n_channels, n_fbins))
+    for i in range(n_channels):
+        # (example, fbin, timestep) -> (example, timestep, fbin), so that the
+        # reshape below stacks the time steps of each example contiguously.
+        xi = np.transpose(x[:, i, :, :], axes=(0, 2, 1))
+        xi = xi.reshape((n_examples * n_timesteps, n_fbins))
+        x_new[:, i, :] = xi
+
+    # Merge the channel and fbin axes into a single feature axis.
+    x_new = x_new.reshape((n_examples * n_timesteps, n_channels * n_fbins))
+    if y is not None:
+        y_new = np.repeat(y, n_timesteps)
+        return x_new, y_new
+    else:
+        return x_new
+
@@ -0,0 +1,113 @@
+import numpy as np
+import json
+import os
+
+from pandas import DataFrame
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+from sklearn.lda import LDA
+
+import preprocessors.fft as fft
+from utils.loader import load_test_data, load_train_data
+from utils.config_name_creator import *
+from merger import merge_csv_files
+from commons import reshape_data
+from commons import load_test_labels
+
+
+def train(subject, data_path, plot=False):
+    """Fit a per-subject LDA classifier on standardised features.
+
+    subject   -- subject name handed to load_train_data.
+    data_path -- directory of preprocessed data handed to load_train_data.
+    plot      -- if true, show one bar chart per channel with the absolute
+                 value of the per-feature coefficients.
+
+    Returns (lda, data_scaler, coefs): the fitted LDA model, the
+    StandardScaler fitted on the training rows (test data has to be
+    transformed with the same scaler) and the coefficients reshaped to
+    (n_channels, n_fbins).
+    """
+    d = load_train_data(data_path, subject)
+    x, y = d['x'], d['y']
+    print 'n_preictal', np.sum(y)
+    # Assumes y holds 0/1 labels, as the preictal count above does too.
+    # Summing (1 - y) counts the zeros; (y - 1) printed the negated count.
+    print 'n_interictal', np.sum(1 - y)
+    n_channels = x.shape[1]
+    n_fbins = x.shape[2]
+
+    x, y = reshape_data(x, y)
+    data_scaler = StandardScaler()
+    x = data_scaler.fit_transform(x)
+
+    lda = LDA()
+    lda.fit(x, y)
+    # One coefficient per feature, in the column order produced by
+    # reshape_data: the n_fbins bins of channel 0, then those of channel 1, ...
+    # (bin 0 is delta, bin 1 theta, ... according to the original author).
+    coef = lda.scalings_ * lda.coef_[:1].T
+
+    if plot:
+        fig = plt.figure()
+        # Shared y-range for all subplots; does not depend on the channel.
+        max_y = max(abs(coef)) + 0.01
+        for i in range(n_channels):
+            # Subplot positions are 1-based, so channel i goes to slot i + 1.
+            # NOTE(review): the 4x4 grid only has room for 16 channels.
+            if n_channels == 24:
+                fig.add_subplot(4, 6, i + 1)
+            else:
+                fig.add_subplot(4, 4, i + 1)
+            ax = plt.gca()
+            ax.set_xlim([0, n_fbins])
+            ax.set_xticks(np.arange(0.5, n_fbins + 0.5, 1))
+            ax.set_xticklabels(np.arange(0, n_fbins))
+            ax.set_ylim([0, max_y])
+            ax.set_yticks(np.around(np.arange(0, max_y, max_y / 4.0), decimals=1))
+            for label in (ax.get_xticklabels() + ax.get_yticklabels()):
+                label.set_fontsize(15)
+            # Coefficients of channel i occupy a contiguous run of n_fbins.
+            plt.bar(range(0, n_fbins), abs(coef[i * n_fbins:i * n_fbins + n_fbins]))
+        fig.suptitle(subject, fontsize=20)
+        plt.show()
+
+    coefs = np.reshape(coef, (n_channels, n_fbins))
+    return lda, data_scaler, coefs
+
+
+def predict(subject, model, data_scaler, data_path, submission_path, test_labels, opt_threshold_train=None):
+    """Write per-clip preictal probabilities for ``subject`` to a CSV file.
+
+    subject         -- subject name handed to load_test_data; also the base
+                       name of the CSV that is written.
+    model           -- fitted classifier exposing predict_proba.
+    data_scaler     -- scaler fitted on the training rows; only transform()
+                       is called on it here.
+    data_path       -- directory of preprocessed data.
+    submission_path -- directory that receives ``<subject>.csv``.
+    test_labels, opt_threshold_train -- not used by this function;
+                       opt_threshold_train is optional so that callers may
+                       leave it out.
+    """
+    d = load_test_data(data_path, subject)
+    # Named clip_ids rather than ``id`` to avoid shadowing the builtin.
+    x_test, clip_ids = d['x'], d['id']
+    n_test_examples = x_test.shape[0]
+    n_timesteps = x_test.shape[3]
+
+    x_test = reshape_data(x_test)
+    x_test = data_scaler.transform(x_test)
+
+    # Probability of class 1 for every time step of every clip.
+    pred_1m = model.predict_proba(x_test)[:, 1]
+
+    # reshape_data keeps the time steps of a clip contiguous, so folding back
+    # to (clips, time steps) and averaging gives one score per clip.
+    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
+    pred_10m = np.mean(pred_10m, axis=1)
+    ans = zip(clip_ids, pred_10m)
+    df = DataFrame(data=ans, columns=['clip', 'preictal'])
+    df.to_csv(submission_path + '/' + subject + '.csv', index=False, header=True)
+
+
+def run_trainer():
+    """Train an LDA model per subject, write predictions and merge them.
+
+    Paths come from SETTINGS.json in the current directory. The FFT
+    preprocessor is run first when the processed data directory is missing,
+    and the submission directory is created when needed.
+    """
+    with open('SETTINGS.json') as f:
+        settings_dict = json.load(f)
+
+    data_path = settings_dict['path']['processed_data_path'] + '/' + create_fft_data_name(settings_dict)
+    submission_path = settings_dict['path']['submission_path'] + '/LDA_' + create_fft_data_name(settings_dict)
+    print data_path
+
+    if not os.path.exists(data_path):
+        fft.run_fft_preprocessor()
+
+    if not os.path.exists(submission_path):
+        os.makedirs(submission_path)
+
+    # NOTE(review): machine-specific absolute path; consider moving it into
+    # SETTINGS.json next to the other paths.
+    test_labels_path = '/mnt/sda4/CODING/python/kaggle_data/test_labels.csv'
+    test_labels = load_test_labels(test_labels_path)
+
+    subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2']
+    # Collected per subject, but not used further in this function.
+    coef_list = []
+    for subject in subjects:
+        print '***********************', subject, '***************************'
+        model, data_scaler, coefs = train(subject, data_path)
+        # predict() takes seven arguments; the last one was missing here,
+        # which raised a TypeError. No threshold is computed in this script,
+        # so None is passed explicitly.
+        predict(subject, model, data_scaler, data_path, submission_path,
+                test_labels[subject]['preictal'], opt_threshold_train=None)
+        coef_list.append(coefs)
+
+    # NOTE(review): merge_csv_files lives in the merger module, which is not
+    # shown here; presumably the name suffix selects the scaling -- confirm.
+    merge_csv_files(submission_path, subjects, 'submission')
+    merge_csv_files(submission_path, subjects, 'submission_softmax')
+    merge_csv_files(submission_path, subjects, 'submission_minmax')
+    merge_csv_files(submission_path, subjects, 'submission_median')
+
+
+# Script entry point: run the whole LDA pipeline when executed directly.
+if __name__ == '__main__':
+    run_trainer()