sebastian-alfers
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎dimensions.py
+76 b/‎dimensions.py
+76
diff --git a/‎implementation.py
+98 b/‎implementation.py
+98
diff --git a/‎iterations.py
+70 b/‎iterations.py
+70
diff --git a/‎load_data.py
+15 b/‎load_data.py
+15
diff --git a/‎output/rp_dimensions_.png
282 KB b/‎output/rp_dimensions_.png
282 KB
diff --git a/‎output/rp_dimensions_50-250_encode.png
241 KB b/‎output/rp_dimensions_50-250_encode.png
241 KB
diff --git a/‎output/rp_iterations_10_.png
272 KB b/‎output/rp_iterations_10_.png
272 KB
diff --git a/‎output/rp_iterations_10_encode.png
227 KB b/‎output/rp_iterations_10_encode.png
227 KB
diff --git a/‎output/rp_iterations_3_.png
236 KB b/‎output/rp_iterations_3_.png
236 KB
@@ -0,0 +1 @@
+*.pyc
@@ -0,0 +1,76 @@
+import numpy as np
+import implementation as impl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import os
+
+
+def compare(origDistances, data, dimensionsRange, encode):
+    """Benchmark every projection in ``impl.actions`` over a range of target dimensions.
+
+    Each action is run once per entry of ``dimensionsRange`` through
+    ``impl.reduceAndMeasure`` and scored with ``impl.measureDistances``.
+    The results are drawn as two stacked line charts (distance on top,
+    duration below) and saved as
+    ``output/rp_dimensions_<min>-<max>_<suffix>.png`` next to this file.
+
+    origDistances   -- forwarded unchanged to ``impl.measureDistances``
+    data            -- dataset; ``np.shape(data)[1]`` is taken as the original dimension
+    dimensionsRange -- non-empty sequence of target dimensions, also used as x axis
+    encode          -- truthy appends "encode" to the file name, otherwise the suffix is empty
+    """
+    orig_dimension = np.shape(data)[1]
+    x = dimensionsRange
+
+    results = dict()
+    for key in impl.actions:
+        print(key)
+        action = impl.actions[key]
+        durations = list()
+        distances = list()
+
+        for dimension in dimensionsRange:
+            print(" %s" % dimension)
+            reduced, duration = impl.reduceAndMeasure(action, data, orig_dimension, dimension)
+            distance = impl.measureDistances(origDistances, data, reduced, key)
+            # Every dimension is measured exactly once, so the value is stored
+            # as-is; averaging it with the results of other dimensions would
+            # make the point at x=dimension depend on all smaller dimensions.
+            durations.append(duration)
+            distances.append(distance)
+
+        results[key] = {"durations": durations, "distances": distances}
+
+    # Draw on a dedicated figure so repeated calls do not pile their curves
+    # onto the axes left behind by an earlier call.
+    figure = plt.figure()
+    panels = ((211, "mean distance", "distances"),
+              (212, "mean duration", "durations"))
+    for position, ylabel, series in panels:
+        plt.subplot(position)
+        plt.grid()
+        plt.xlabel("dimensions")
+        plt.ylabel(ylabel)
+        for key in results:
+            plt.plot(x, results[key][series], label=key)
+        plt.legend(loc="best")
+
+    outputFolder = os.path.dirname(os.path.abspath(__file__))
+    outputFolder = "%s/output" % outputFolder
+    if not os.path.isdir(outputFolder):
+        os.makedirs(outputFolder)
+
+    e = "encode" if encode else ""
+    d = "%s-%s" % (np.min(dimensionsRange), np.max(dimensionsRange))
+
+    plt.savefig("%s/rp_dimensions_%s_%s.png" % (outputFolder, d, e), dpi=320, bbox_inches="tight")
+    plt.close(figure)
@@ -0,0 +1,98 @@
+from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
+import numpy as np
+import time
+from scipy.spatial.distance import euclidean
+import random
+
+# run one reduction and time it
+def reduceAndMeasure(action, data, orig_dimension, new_dimension):
+    """Call ``action(data, orig_dimension, new_dimension)`` and time the call.
+
+    Returns the tuple ``(reduced, duration)``: the value produced by
+    ``action`` and the elapsed time in seconds, taken as the difference of
+    two ``time.time()`` readings around the call.
+    """
+    started_at = time.time()
+    outcome = action(data, orig_dimension, new_dimension)
+    elapsed = time.time() - started_at
+    return outcome, elapsed
+
+# scikit-learn implementation: gaussian matrix
+def gaussianRP(data, orig_dimension, new_dimension):
+    """Project ``data`` with scikit-learn's ``GaussianRandomProjection``.
+
+    ``new_dimension`` is passed as ``n_components``; ``orig_dimension`` is
+    accepted so all actions share one signature but is not used here.
+    Returns the result of ``fit_transform(data)``.
+    """
+    return GaussianRandomProjection(n_components=new_dimension).fit_transform(data)
+
+# scikit-learn implementation: sparse matrix
+def sparseRP(data, orig_dimension, new_dimension):
+    """Project ``data`` with scikit-learn's ``SparseRandomProjection``.
+
+    ``new_dimension`` is passed as ``n_components``; ``orig_dimension`` is
+    accepted so all actions share one signature but is not used here.
+    Returns the result of ``fit_transform(data)``.
+    """
+    return SparseRandomProjection(n_components=new_dimension).fit_transform(data)
+
+# take only the random matrix from scikit-learn and multiply by hand
+def otherScikitImpl(data, orig_dimension, new_dimension):
+    """Project ``data`` with a matrix obtained from ``GaussianRandomProjection``.
+
+    The estimator is never fitted; its private ``_make_random_matrix`` method
+    is called with ``(new_dimension, orig_dimension)`` and the product
+    ``matrix * data^T`` is transposed back so the result has one row per
+    row of ``data``.  Returns an ``np.matrix``.
+    """
+    projector = GaussianRandomProjection(n_components=new_dimension)
+    # NOTE: _make_random_matrix is a private scikit-learn method.
+    projection = np.mat(projector._make_random_matrix(new_dimension, orig_dimension))
+    columns = projection * np.mat(data).transpose()
+    return columns.transpose()
+
+# random = np.random.mtrand._rand
+
+# naive implementation of the random matrix
+def custom1(data, orig_dimension, new_dimension):
+    """Project ``data`` with a hand-built sparse random matrix of -1, 0 and 1.
+
+    One ``random.random()`` draw is made per matrix entry, row by row.
+    A draw below 0.1 yields -1, a draw of 0.9 or more yields 1 and anything
+    in between yields 0.
+
+    data           -- dataset with ``orig_dimension`` columns
+    orig_dimension -- number of rows of the random matrix
+    new_dimension  -- number of columns of the random matrix
+
+    Returns ``np.mat(data) * matrix`` as an ``np.matrix``.
+    """
+    lower = 0.1  # draws below this become -1
+    upper = 0.9  # draws at or above this become 1
+
+    def entry(draw):
+        if draw < lower:
+            return -1.0
+        if draw >= upper:
+            return 1.0
+        return 0.0
+
+    # Drawing into a flat list and reshaping in C order consumes the random
+    # stream in the same row-major order as nested row/column loops would.
+    count = orig_dimension * new_dimension
+    values = [entry(random.random()) for _ in range(count)]
+    m = np.array(values, dtype=float).reshape(orig_dimension, new_dimension)
+
+    return np.mat(data) * m
+
+# deliberately meaningless projection, used as a baseline for comparison
+def custom2(data, orig_dimension, new_dimension):
+    """Project ``data`` with a matrix of raw ``random.random()`` values.
+
+    The matrix has ``orig_dimension`` rows and ``new_dimension`` columns and
+    is filled row by row, one draw per entry.  Returns
+    ``np.mat(data) * matrix`` as an ``np.matrix``.
+    """
+    weights = np.empty((orig_dimension, new_dimension))
+    for row in weights:
+        # each row is a view into ``weights``, so writing to it fills the matrix
+        for column in range(len(row)):
+            row[column] = random.random()
+
+    return np.mat(data) * weights
+
+
+
+
+# Registry of the projection strategies that get compared.
+# Keys are human-readable names; values are the functions defined above,
+# all of which take (data, orig_dimension, new_dimension).
+actions = {
+    "gaussian RP": gaussianRP,
+    "sparse RP": sparseRP,
+    "manual scikit": otherScikitImpl,
+    "custom 1": custom1,
+    "custom 2": custom2
+}
+
+
+# score a reduction by the mean pairwise distance of the reduced data
+def measureDistances(origDistances, data, reduced, desc):
+    """Return the mean pairwise Euclidean distance between the rows of ``reduced``.
+
+    The mean is taken over the full n x n distance matrix, so it includes
+    the zero diagonal and counts every unordered pair twice.
+
+    origDistances -- accepted for interface compatibility, currently unused
+    data          -- original dataset; only its number of instances is checked
+    reduced       -- reduced dataset with one row per instance
+    desc          -- label used in the error message
+
+    Raises Exception when ``data`` and ``reduced`` differ in their number of
+    instances.  Returns NaN when there are no instances at all.
+    """
+    from scipy.spatial.distance import pdist
+
+    a = np.shape(data)
+    b = np.shape(reduced)
+    if a[0] != b[0]:
+        raise Exception("%s: same amount of instances required. data: %s, reduced: %s" % (desc, a,b))
+
+    # NOTE(review): origDistances is never read.  The earlier expression
+    # ``np.mean( - newDistancs)`` looks like it once subtracted from it;
+    # confirm with the author before wiring it back in.
+    count = b[0]
+    if count == 0:
+        # the mean of an empty distance matrix is undefined
+        return np.nan
+
+    # A plain 2-D ndarray is needed: rows of an np.matrix stay 2-D.
+    points = np.asarray(reduced, dtype=float)
+    if points.ndim == 1:
+        points = points[:, np.newaxis]
+
+    # pdist yields every unordered pair once; the full matrix holds each of
+    # them twice plus ``count`` zeros on the diagonal.
+    return 2.0 * np.sum(pdist(points)) / (count * count)
+
@@ -0,0 +1,70 @@
+import numpy as np
+import implementation as impl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import os
+
+def compare(origDistances, data, iterations, encode, new_dimension=10):
+    """Benchmark every projection in ``impl.actions`` over repeated runs.
+
+    Each action is run ``iterations`` times at a fixed target dimension.
+    After every run the running mean of all distances and durations so far
+    is recorded, so the curves show how the averages settle.  Two stacked
+    line charts (distance on top, duration below) are saved as
+    ``output/rp_iterations_<iterations>_<suffix>.png`` next to this file.
+
+    origDistances -- forwarded unchanged to ``impl.measureDistances``
+    data          -- dataset; ``np.shape(data)[1]`` is taken as the original dimension
+    iterations    -- number of repetitions per action
+    encode        -- truthy appends "encode" to the file name, otherwise the suffix is empty
+    new_dimension -- target dimension handed to every action (default 10)
+    """
+    orig_dimension = np.shape(data)[1]
+    print(iterations)
+
+    # 1..iterations inclusive, so exactly ``iterations`` runs are performed.
+    x = np.arange(1, iterations + 1, 1)
+
+    results = dict()
+    for key in impl.actions:
+        print(key)
+        action = impl.actions[key]
+        durations = list()
+        distances = list()
+        di = list()
+        du = list()
+        for i in x:
+            print(" %s" % i)
+
+            reduced, d = impl.reduceAndMeasure(action, data, orig_dimension, new_dimension)
+            dist = impl.measureDistances(origDistances, data, reduced, key)
+            du.append(d)
+            di.append(dist)
+
+            # running means over all repetitions seen so far
+            durations.append(np.mean(du))
+            distances.append(np.mean(di))
+
+        results[key] = {"durations": durations, "distances": distances}
+
+    # Draw on a dedicated figure so repeated calls do not pile their curves
+    # onto the axes left behind by an earlier call.
+    figure = plt.figure()
+    panels = ((211, "mean distance", "distances"),
+              (212, "mean duration", "durations"))
+    for position, ylabel, series in panels:
+        plt.subplot(position)
+        plt.grid()
+        plt.xlabel("iterations")
+        plt.ylabel(ylabel)
+        for key in results:
+            plt.plot(x, results[key][series], label=key)
+        plt.legend(loc="best")
+
+    outputFolder = os.path.dirname(os.path.abspath(__file__))
+    outputFolder = "%s/output" % outputFolder
+    if not os.path.isdir(outputFolder):
+        os.makedirs(outputFolder)
+
+    e = "encode" if encode else ""
+
+    plt.savefig("%s/rp_iterations_%s_%s.png" % (outputFolder, iterations, e), dpi=320, bbox_inches="tight")
+    plt.close(figure)
+
@@ -0,0 +1,15 @@
+import data_factory as df
+from sklearn.preprocessing import OneHotEncoder
+
+def load(binary_encode = False):
+    """Load the dataset returned by ``df.loadFirstCancerDataset()``.
+
+    Only the first element of the returned 4-tuple is used.  The value of
+    ``binary_encode`` is printed.  When it is truthy the data is passed
+    through a ``OneHotEncoder`` fitted on that same data and returned as a
+    dense array; otherwise the data is returned as loaded.
+    """
+    features, _label, _desc, _size = df.loadFirstCancerDataset()
+
+    print(binary_encode)
+
+    if not binary_encode:
+        return features
+
+    encoder = OneHotEncoder()
+    encoder.fit(features)
+    return encoder.transform(features).toarray()