Skip to content

Commit 4d329f9

Browse files
author
Sebastian Alfers
committed
cli api, compare by dimensions
1 parent c97b3ab commit 4d329f9

11 files changed

+326
-183
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.pyc

dimensions.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import numpy as np
2+
import implementation as impl
3+
import matplotlib
4+
matplotlib.use('Agg')
5+
import matplotlib.pyplot as plt
6+
import os
7+
8+
9+
def compare(origDistances, data, dimensionsRange, encode):
    """Benchmark every reduction in ``impl.actions`` over a range of target dimensions.

    For each registered action and each value in ``dimensionsRange`` the data
    is reduced, the reduction is timed and its distance score is measured.
    Two stacked plots (distance score and duration, both against the target
    dimension) are written to ``<module dir>/output/rp_dimensions_<min>-<max>_<e>.png``
    where ``<e>`` is ``"encode"`` when ``encode`` is truthy and empty otherwise.

    Parameters:
        origDistances: forwarded unchanged to ``impl.measureDistances``.
        data: 2-D input; its second axis is taken as the original dimension.
        dimensionsRange: iterable of target dimensions; also used as x-axis.
        encode: only influences the output file name.

    Returns:
        None. The result is the image written to disk.
    """
    orig_dimension = np.shape(data)[1]
    x = dimensionsRange

    results = dict()
    for key in impl.actions:
        print(key)
        action = impl.actions[key]

        durations = list()
        distances = list()
        di = list()
        du = list()

        for dimension in dimensionsRange:
            print(" %s" % dimension)

            reduced, d = impl.reduceAndMeasure(action, data, orig_dimension, dimension)
            dist = impl.measureDistances(origDistances, data, reduced, key)
            du.append(d)
            di.append(dist)

            # NOTE(review): these are running means over all dimensions seen
            # so far, not the value for this dimension alone. Kept as in the
            # original; confirm this smoothing is intended for a
            # per-dimension comparison.
            durations.append(np.mean(du))
            distances.append(np.mean(di))

        results[key] = {"durations": durations, "distances": distances}

    outputFolder = os.path.dirname(os.path.abspath(__file__))
    outputFolder = "%s/output" % outputFolder
    # savefig does not create missing directories.
    if not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    e = ""
    if encode:
        e = "encode"

    d = "%s-%s" % (np.min(dimensionsRange), np.max(dimensionsRange))

    # Draw on a fresh figure so repeated calls in one process do not pile
    # their curves onto the same implicit global figure.
    fig = plt.figure()
    try:
        panels = (
            (211, "distances", "mean distance"),
            (212, "durations", "mean duration"),
        )
        for position, metric, ylabel in panels:
            plt.subplot(position)
            plt.grid()
            # The x-axis carries the target dimension, not an iteration count.
            plt.xlabel("dimensions")
            plt.ylabel(ylabel)

            for key in results:
                plt.plot(x, results[key][metric], label=key)

            plt.legend(loc="best")

        plt.savefig("%s/rp_dimensions_%s_%s.png" % (outputFolder, d, e), dpi=320, bbox_inches="tight")
    finally:
        plt.close(fig)

implementation.py

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
2+
import numpy as np
3+
import time
4+
from scipy.spatial.distance import euclidean
5+
import random
6+
7+
# measure duration and run the reduction
8+
def reduceAndMeasure(action, data, orig_dimension, new_dimension):
    """Run ``action(data, orig_dimension, new_dimension)`` and time the call.

    Returns:
        A ``(reduced, duration)`` tuple: whatever ``action`` returned and the
        elapsed wall-clock time in seconds as measured with ``time.time()``.
    """
    started_at = time.time()
    projected = action(data, orig_dimension, new_dimension)
    elapsed = time.time() - started_at
    return projected, elapsed
13+
14+
# scikit-learn implementation: gaussian matrix
15+
def gaussianRP(data, orig_dimension, new_dimension):
    """Project ``data`` with scikit-learn's ``GaussianRandomProjection``.

    ``orig_dimension`` is accepted only so that all actions share one
    signature; it is not used here.

    Returns:
        The result of ``fit_transform(data)`` for a projection with
        ``n_components=new_dimension``.
    """
    return GaussianRandomProjection(n_components=new_dimension).fit_transform(data)
18+
19+
# scikit-learn implementation: sparse matrix
20+
def sparseRP(data, orig_dimension, new_dimension):
    """Project ``data`` with scikit-learn's ``SparseRandomProjection``.

    ``orig_dimension`` is accepted only so that all actions share one
    signature; it is not used here.

    Returns:
        The result of ``fit_transform(data)`` for a projection with
        ``n_components=new_dimension``.
    """
    return SparseRandomProjection(n_components=new_dimension).fit_transform(data)
23+
24+
# just extract the random matrix from the api
25+
def otherScikitImpl(data, orig_dimension, new_dimension):
    """Project ``data`` by hand using scikit-learn's gaussian random matrix.

    Only the random matrix is taken from ``GaussianRandomProjection``; the
    multiplication itself is done here with ``np.matrix`` arithmetic.

    Returns:
        An ``np.matrix`` equal to ``(m * data^T)^T`` where ``m`` is the
        matrix produced by ``_make_random_matrix(new_dimension, orig_dimension)``.
    """
    rp = GaussianRandomProjection(n_components=new_dimension)
    # NOTE(review): _make_random_matrix is a private scikit-learn method and
    # may change between releases - confirm against the installed version.
    m = rp._make_random_matrix(new_dimension, orig_dimension)
    # np.asmatrix is the function np.mat used to alias; np.mat itself was
    # removed in NumPy 2.0.
    m = np.asmatrix(m)
    reduced = m * np.asmatrix(data).transpose()
    return reduced.transpose()
32+
33+
# random = np.random.mtrand._rand
34+
35+
# naive implementation of the random matrix
36+
def custom1(data, orig_dimension, new_dimension):
    """Project ``data`` with a hand-built sparse random sign matrix.

    Every entry of the ``(orig_dimension, new_dimension)`` matrix is drawn
    independently with ``random.random()``: ``-1`` when the draw is below
    0.1, ``+1`` when it is at least 0.9 and ``0`` otherwise. Entries are
    drawn row by row, so results are reproducible under ``random.seed``.

    Returns:
        An ``np.matrix`` holding ``data * m``.
    """
    minusOne = 0.1
    one = 0.9

    # Start from zeros so only the two non-zero outcomes need assigning.
    m = np.zeros((orig_dimension, new_dimension))
    for i in range(orig_dimension):
        for j in range(new_dimension):
            rand = random.random()
            if rand < minusOne:
                m[i, j] = -1
            elif rand >= one:
                m[i, j] = 1

    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    return np.asmatrix(data) * m
54+
55+
# nonsense implementation for comparison
56+
def custom2(data, orig_dimension, new_dimension):
    """Project ``data`` with a matrix of raw ``random.random()`` values.

    The ``(orig_dimension, new_dimension)`` matrix is filled row by row with
    independent draws from ``random.random()``, so results are reproducible
    under ``random.seed``. Serves as a baseline for the other actions.

    Returns:
        An ``np.matrix`` holding ``data * m``.
    """
    m = np.empty((orig_dimension, new_dimension))
    for i in range(orig_dimension):
        for j in range(new_dimension):
            m[i, j] = random.random()

    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    return np.asmatrix(data) * m
64+
65+
66+
67+
68+
# Registry of the available reductions: display name -> callable taking
# (data, orig_dimension, new_dimension) and returning the reduced data.
actions = {
    "gaussian RP": gaussianRP,
    "sparse RP": sparseRP,
    "manual scikit": otherScikitImpl,
    "custom 1": custom1,
    "custom 2": custom2,
}
75+
76+
77+
# compare original data with reduced data
78+
def measureDistances(origDistances, data, reduced, desc):
    """Score a reduction by comparing pairwise distances before and after.

    The pairwise euclidean distances between all rows of ``reduced`` are
    computed and subtracted element-wise from ``origDistances``; the absolute
    value of the mean difference is returned.

    Parameters:
        origDistances: square matrix of pairwise distances of the original
            data, one row/column per instance. ``None`` skips the comparison
            and yields the mean pairwise distance of ``reduced`` instead.
        data: original data; only its number of rows is checked.
        reduced: reduced data with one row per instance (array or matrix).
        desc: label used in error messages.

    Returns:
        ``abs(mean(origDistances - newDistances))`` as a float.

    Raises:
        Exception: if ``data`` and ``reduced`` differ in their number of rows,
            or if ``origDistances`` is not an ``n x n`` matrix for ``n`` rows.
    """
    a = np.shape(data)
    b = np.shape(reduced)
    if a[0] != b[0]:
        raise Exception("%s: same amount of instances required. data: %s, reduced: %s" % (desc, a, b))

    # Indexing an np.matrix yields 2-D rows, which euclidean() does not
    # accept as vectors; a plain ndarray gives 1-D rows.
    rows = np.asarray(reduced)
    count = b[0]

    # The distance matrix is symmetric with a zero diagonal, so each pair is
    # computed once and mirrored.
    newDistances = np.zeros((count, count))
    for i in range(count):
        for j in range(i + 1, count):
            distance = euclidean(rows[i], rows[j])
            newDistances[i][j] = distance
            newDistances[j][i] = distance

    if origDistances is None:
        return np.abs(np.mean(newDistances))

    original = np.asarray(origDistances)
    if np.shape(original) != (count, count):
        raise Exception("%s: original distances must have shape %s, got %s"
                        % (desc, (count, count), np.shape(original)))

    # compare item by item
    return np.abs(np.mean(original - newDistances))
98+

iterations.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import numpy as np
2+
import implementation as impl
3+
import matplotlib
4+
matplotlib.use('Agg')
5+
import matplotlib.pyplot as plt
6+
import os
7+
8+
def compare(origDistances, data, iterations, encode):
    """Benchmark every reduction in ``impl.actions`` over repeated runs.

    Each registered action reduces ``data`` to 10 dimensions ``iterations``
    times. After every run the running mean of the measured durations and
    distance scores is recorded, and both series are plotted against the run
    number. The figure is written to
    ``<module dir>/output/rp_iterations_<iterations>_<e>.png`` where ``<e>``
    is ``"encode"`` when ``encode`` is truthy and empty otherwise.

    Parameters:
        origDistances: forwarded unchanged to ``impl.measureDistances``.
        data: 2-D input; its second axis is taken as the original dimension.
        iterations: number of runs per action.
        encode: only influences the output file name.

    Returns:
        None. The result is the image written to disk.
    """
    print(iterations)

    new_dimension = 10
    orig_dimension = np.shape(data)[1]

    # Run numbers 1..iterations inclusive; the previous upper bound stopped
    # one short and produced an empty plot for iterations == 1.
    x = np.arange(1, iterations + 1, 1)

    results = dict()
    for key in impl.actions:
        print(key)
        action = impl.actions[key]

        durations = list()
        distances = list()
        di = list()
        du = list()
        for i in x:
            print(" %s" % i)

            reduced, d = impl.reduceAndMeasure(action, data, orig_dimension, new_dimension)
            dist = impl.measureDistances(origDistances, data, reduced, key)
            du.append(d)
            di.append(dist)

            # Running mean over all runs so far.
            durations.append(np.mean(du))
            distances.append(np.mean(di))

        results[key] = {"durations": durations, "distances": distances}

    outputFolder = os.path.dirname(os.path.abspath(__file__))
    outputFolder = "%s/output" % outputFolder
    # savefig does not create missing directories.
    if not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    e = ""
    if encode:
        e = "encode"

    # Draw on a fresh figure so repeated calls in one process do not pile
    # their curves onto the same implicit global figure.
    fig = plt.figure()
    try:
        panels = (
            (211, "distances", "mean distance"),
            (212, "durations", "mean duration"),
        )
        for position, metric, ylabel in panels:
            plt.subplot(position)
            plt.grid()
            plt.xlabel("iterations")
            plt.ylabel(ylabel)

            for key in results:
                plt.plot(x, results[key][metric], label=key)

            plt.legend(loc="best")

        plt.savefig("%s/rp_iterations_%s_%s.png" % (outputFolder, iterations, e), dpi=320, bbox_inches="tight")
    finally:
        plt.close(fig)
70+

load_data.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import data_factory as df
2+
from sklearn.preprocessing import OneHotEncoder
3+
4+
def load(binary_encode = False):
    """Load the first cancer dataset, optionally one-hot encoded.

    Parameters:
        binary_encode: when truthy, the data is run through scikit-learn's
            ``OneHotEncoder`` and returned as a dense array; otherwise the
            data is returned as loaded.

    Returns:
        The (possibly encoded) data. Labels, description and size returned
        by ``df.loadFirstCancerDataset()`` are discarded.
    """
    data, _label, _desc, _size = df.loadFirstCancerDataset()

    print(binary_encode)

    if not binary_encode:
        return data

    encoder = OneHotEncoder()
    encoder.fit(data)
    # transform() returns a sparse matrix; toarray() makes it dense.
    return encoder.transform(data).toarray()

output/rp_dimensions_.png

282 KB
Loading
241 KB
Loading

output/rp_iterations_10_.png

272 KB
Loading

output/rp_iterations_10_encode.png

227 KB
Loading

output/rp_iterations_3_.png

236 KB
Loading

0 commit comments

Comments
 (0)