Added Tor and changed test splitting

nmeheus · nmeheus · commit 6d87c2438b84 · 2017-04-25T15:10:16.000+02:00
diff --git a/Multi_label/CV_hyperparameters.py b/Multi_label/CV_hyperparameters.py
@@ -16,7 +16,7 @@
 import sys
 from sklearn.metrics import confusion_matrix
 
-FEATURE = 'size_IAT' # use burst features or size_IAT ('size_IAT', 'burst' or 'both')
+FEATURE = 'both' # use burst features or size_IAT ('size_IAT', 'burst' or 'both')
 METHOD = 'RF' # options: 'NB' : Naive Bayes, 'RF' : random forest, 'MLP' : , 'LR': logistic regression
 TEST_SIZE = 0.20
 
diff --git a/Multi_label/trace_classification.py b/Multi_label/trace_classification.py
@@ -30,8 +30,8 @@
         parameters = {'size_IAT' : 
             {'RF': {'n_estimators': 48}},
          'burst':
-            {'RF': {'n_estimators': 21}}, 
-         'both': {'RF': {'n_estimators': 21}}
+            {'RF': {'n_estimators': 48}}, 
+         'both': {'RF': {'n_estimators': 39}}
         }
     elif mode == 'ipsec_20':
         parameters = {'size_IAT' : 
diff --git a/Multi_label/trace_visualization.py b/Multi_label/trace_visualization.py
@@ -6,7 +6,7 @@
 import matplotlib.pyplot as plt
 import numpy.random as nprnd
 
-FEATURE = 'size_IAT' # use burst features or size_IAT ('size_IAT' or 'burst')
+FEATURE = 'burst' # use burst features or size_IAT ('size_IAT' or 'burst')
 modes = ['ipsec', 'ipsec_20','ipsec_50','ipsec_100','ipsec_200','ipsec_300','ipsec_400']
 
 if __name__ == "__main__":
diff --git a/Single_label/CV_hyperparameters.py b/Single_label/CV_hyperparameters.py
@@ -14,8 +14,8 @@
 import sys
 from sklearn.metrics import confusion_matrix
 
-FEATURE = 'size_IAT' # use burst features or size_IAT ('size_IAT', 'burst' or 'both')
-METHOD = 'RF' # options: 'NB' : Naive Bayes, 'RF' : random forest, 'MLP' : , 'LR': logistic regression
+#FEATURE = 'size_IAT' # use burst features or size_IAT ('size_IAT', 'burst' or 'both')
+#METHOD = 'LR' # options: 'NB' : Naive Bayes, 'RF' : random forest, 'MLP' : , 'LR': logistic regression
 TEST_SIZE = 0.20
 
 
@@ -26,13 +26,16 @@ def log(s):
 
 if __name__ == "__main__":
     mode = sys.argv[1]
+    FEATURE = sys.argv[2] # use burst features or size_IAT ('size_IAT', 'burst' or 'both')
+    METHOD = sys.argv[3] # options: 'NB' : Naive Bayes, 'RF' : random forest, 'MLP' : multi-layer perceptron, 'LR': logistic regression
     all_traces = load_pickled_traces(mode)
-    windowed_traces = window_all_traces(all_traces)
+    #windowed_traces = window_all_traces(all_traces)
 
-    # Split test set
-    labels = [x.label for x in windowed_traces]
-    X_train_val, X_test, y_train_val, y_test = train_test_split(windowed_traces,labels, stratify=np.array(labels), test_size=TEST_SIZE, random_state=0)
+    # Split test set but keep windows from different traces separated from each other
+    labels = [x.label for x in all_traces]
+    X_train_val, X_test, y_train_val, y_test = train_test_split(all_traces,labels, stratify=np.array(labels), test_size=TEST_SIZE, random_state=0)
 
+    X_train_val = window_all_traces(X_train_val)
 
     if METHOD == 'NB':
         clf = MultinomialNB()
@@ -53,8 +56,8 @@ def log(s):
     for train, val in kf.split(X_train_val):
         log('Started testing hyperparameters for fold ' + str(fold+1)+'.')
         # Seperate train list from val list
-        train_list = [windowed_traces[i] for i in train]
-        val_list = [windowed_traces[i] for i in val]
+        train_list = [X_train_val[i] for i in train]
+        val_list = [X_train_val[i] for i in val]
 
         if FEATURE == 'size_IAT':
             feature_matrix, classes, train_range = build_feature_matrix_size_IAT(train_list)
@@ -63,7 +66,7 @@ def log(s):
             feature_matrix, classes, train_range = build_feature_matrix_burst(train_list)
             feature_matrix_val, classes_val, val_range = build_feature_matrix_burst(val_list, train_range)
         elif FEATURE == 'both':
-            feature_matrix, classes, train_range = build_feature_matrix_both(X_train_val)
+            feature_matrix, classes, train_range = build_feature_matrix_both(train_list)
             feature_matrix_val, classes_val, val_range = build_feature_matrix_both(val_list, train_range)  
 
         for par in list(parameters):
diff --git a/Single_label/background_traffic.py b/Single_label/background_traffic.py
@@ -1,17 +1,17 @@
 import matplotlib.pyplot as plt
 
 packetrate_up = [0,20,50,100,150,200,300,400]
-accuracy_up = [100,98.72,100,93.75,96.30,97.44,96.30,92.5]
+accuracy_up = [96.40,98.88,98.75,95,93.75,90,95,85]
 
 packetrate_ud = [0,40,100,200,300,400,600,800]
-accuracy_ud = [100,98.75,96.25,97.5,96.25,95,95,92.5]
+accuracy_ud = [96.40,100,98.75,98.75,87.5,96.25,86.25,81.25]
 
-plt.ylim([80,101])
+plt.ylim([75,101])
 plt.plot(packetrate_up, accuracy_up, label='Only upstream packets')
 plt.plot(packetrate_ud, accuracy_ud, label='Upstream and downstream packets')
 plt.title('The effect of background traffic on the classification accuracy')
 plt.ylabel('Accuracy on the testset')
-plt.xlabel('# packets of added background traffic')
+plt.xlabel('# packets/s of added background traffic')
 plt.legend(loc=4)
 plt.show()
 
diff --git a/Single_label/feature_extraction.py b/Single_label/feature_extraction.py
@@ -97,11 +97,16 @@
 		'path': 'traces_400_ud/',
 		'object_file': 'traces_400_ud/pickled_traces.dat',
 		'ip': '192.168.0.2'		
+	},
+	'tor':{
+		'path': 'tor_traces/',
+		'object_file': 'tor_traces/pickled_traces.dat',
+		'ip': '192.168.2.2'		
 	}
 }
 
 # Fill this in to determine which kind of traffic to work on
-mode = 'ipsec_400_ud'
+mode = 'tor'
 
 # Load all traces that match the reg exp
 def load_traces():
diff --git a/Single_label/tor_traces/pickled_traces.dat b/Single_label/tor_traces/pickled_traces.dat
diff --git a/Single_label/trace_classification.py b/Single_label/trace_classification.py
diff --git a/Single_label/trace_visualization.py b/Single_label/trace_visualization.py