-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorrelation2.py
132 lines (112 loc) · 3.47 KB
/
correlation2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python
from sys import argv, stderr
from getopt import getopt, GetoptError
from is_close import isclose
import math
import re
#import crash_on_ipy
# Settings parsed from the command line; extract_args() stores the keys
# 'verbose', 'x', 'y' and 'file' here and config() reads them back.
cfg = {}
def help():
    """Print the two-line command-line usage summary to stdout.

    The program name shown in the usage line is taken from argv[0].
    """
    usage = 'Usage: %s -x=<XColumn> -y=<YColumn> [-v/--verbose] <CSVFilePath>'
    print(usage % argv[0])
    print('Note, value of specified columns shall be numeric.')
def config(key, default=None):
    """Look up a setting in the module-level `cfg` dict.

    A stored value is returned only when it is truthy; a missing or falsy
    entry falls back to `default`.  Because None doubles as the "no default
    supplied" marker, a missing entry without a non-None default raises
    Exception('Missing config key <key>').
    """
    stored = cfg.get(key)
    if stored:
        return stored
    if default is None:
        raise Exception('Missing config key %s' % key)
    return default
# Raw table filled by load_csv(): header name -> list of cell strings.
csv = {}
# Header names in the order load_csv() registered them.
columns = []
# Numeric samples read by compute_rel(); x[i] is paired with y[i].
x = []
y = []
def isdecimal(text):
    """Test whether `text` is written as a plain decimal number.

    The accepted shape is an optional leading '-', one or more digits, an
    optional '.', and optional fraction digits.  The `re` match object is
    returned on success and None otherwise, so callers should rely on the
    truthiness of the result.
    """
    number_pattern = r'^-?\d+\.?\d*$'
    found = re.match(number_pattern, text)
    return found
def load_csv(fileobj):
    """Read comma-separated text from `fileobj` into `csv` and `columns`.

    The first line supplies the header names.  Every non-empty header is
    appended to the global `columns` list and gets a fresh list in the
    global `csv` dict; each following line appends one cell string per
    registered column.  Cells are stored as text, no numeric conversion
    happens here.  Parsing is a plain split on ',' (no quoting support).

    Guarantees that keep the columns usable together:
      * a data cell is always taken from the same position as its header,
        even when some header cells are empty and therefore skipped;
      * a row with too few cells contributes '' for the missing columns,
        so index i refers to the same row in every column;
      * blank lines (including the one after a trailing newline) are
        ignored, and a trailing '\\r' from CRLF files is removed;
      * when a header name is repeated only its first occurrence is used.

    Args:
        fileobj: object with a read() method returning the whole text.
    """
    # Split on '\n' only (as before) but drop the '\r' that CRLF files
    # leave behind, otherwise the last header/cell of each line carries it.
    lines = [line.rstrip('\r') for line in fileobj.read().split('\n')]

    # Remember where each registered header sits so data cells are looked
    # up by their true position rather than by their rank among the
    # non-empty headers.
    seen = set()
    slots = []
    for position, name in enumerate(lines[0].split(',')):
        if not name or name in seen:
            continue
        seen.add(name)
        csv[name] = []
        columns.append(name)
        slots.append((position, name))

    for line in lines[1:]:
        if not line.strip():
            continue
        cells = line.split(',')
        for position, name in slots:
            # Pad short rows so all columns keep the same length.
            csv[name].append(cells[position] if position < len(cells) else '')
def process_xy(x_col, y_col):
    """Fill the global `x` and `y` sample lists from two loaded columns.

    Rows of csv[x_col] / csv[y_col] are walked in order (stopping at the
    end of the shorter column) and filtered as follows:
      * rows where either cell is not a decimal number (per isdecimal)
        are skipped;
      * rows where either value is <= 0 are skipped;
      * a row whose x is close (per isclose) to the last accepted x does
        not add a new sample; it only raises the last accepted y when the
        new y is at least as large;
      * any other row is accepted only when its y is not smaller than the
        last accepted y, so the accepted y values never decrease.

    This function was previously wrapped in a string literal and therefore
    never defined, although the script's entry point calls it.

    Args:
        x_col: header name of the column providing x values.
        y_col: header name of the column providing y values.

    Raises:
        KeyError: if either header name is not present in `csv`.
    """
    have_sample = False
    for x_text, y_text in zip(csv[x_col], csv[y_col]):
        if not isdecimal(x_text) or not isdecimal(y_text):
            continue
        # Convert once; the cells are stored as text.
        x_val = float(x_text)
        y_val = float(y_text)
        if x_val <= 0 or y_val <= 0:
            continue
        if have_sample:
            if isclose(x[-1], x_val):
                # Same x as the previous sample: keep a single point and
                # let it carry the larger y.
                if y[-1] <= y_val:
                    y[-1] = y_val
                continue
            if y[-1] > y_val:
                # y dropped below the last accepted value: discard row.
                continue
        x.append(x_val)
        y.append(y_val)
        have_sample = True
def compute_rel():
    """Fit a least-squares line to the global samples and correlate them.

    Uses the module-level lists `x` and `y` (expected to have the same
    length) and population statistics, i.e. sums divided by n:
        b = cov(x, y) / var(x)            slope
        a = mean(y) - b * mean(x)         intercept
        r = cov(x, y) / (sd(x) * sd(y))   correlation coefficient

    Returns:
        Tuple (b, a, r, n) where n is the number of samples.

    Raises:
        SystemExit: with status 1, after writing a message to stderr,
            when there are fewer than two samples or when the variance of
            x or y is (close to) zero.  The status is non-zero so callers
            of the script can tell the failure from a successful run.
    """
    n = len(x)
    if n <= 1:
        stderr.write('\033[1;31m Insufficient samples - Cannot compute.\033[0m\n')
        raise SystemExit(1)

    # float(n) keeps true division even if the samples happen to be ints
    # under Python 2.
    count = float(n)
    x_bar = sum(x) / count
    y_bar = sum(y) / count

    div_x = sum((xi - x_bar) ** 2 for xi in x) / count
    div_y = sum((yi - y_bar) ** 2 for yi in y) / count
    cov_xy = sum((xi - x_bar) * (yi - y_bar) for xi, yi in zip(x, y)) / count

    # A constant column makes both the slope and r undefined.
    if isclose(div_x, 0.0) or isclose(div_y, 0.0):
        stderr.write('\033[1;31m Variance is zero.\033[0m\n')
        raise SystemExit(1)

    b = cov_xy / div_x
    a = y_bar - b * x_bar
    r = cov_xy / (math.sqrt(div_x) * math.sqrt(div_y))
    # Rounding can push a perfect correlation marginally outside [-1, 1].
    r = max(-1.0, min(1.0, r))
    return b, a, r, n
def extract_args(args):
    """Parse command-line arguments into the module-level `cfg` dict.

    Recognised options:
        -v / --verbose       sets cfg['verbose'] = True
        -x COL / --x=COL     sets cfg['x']
        -y COL / --y=COL     sets cfg['y']
    The first positional argument is stored as cfg['file'].

    The usage text advertises the short options as `-x=<col>`; getopt
    hands that over as the value '=<col>', so one leading '=' is removed
    from short-option values to make the advertised form work.

    On an unknown option, a missing option value, or a missing file
    argument the usage text is printed via help() and the function
    returns normally; options parsed before the problem stay in `cfg`.

    Args:
        args: list of argument strings, without the program name.
    """
    try:
        opts, positional = getopt(args, 'vx:y:', ['verbose', 'x=', 'y='])
        for key, value in opts:
            if key in ('-x', '-y') and value.startswith('='):
                value = value[1:]
            if key in ('-v', '--verbose'):
                cfg['verbose'] = True
            elif key in ('-x', '--x'):
                cfg['x'] = value
            elif key in ('-y', '--y'):
                cfg['y'] = value
        # Raises IndexError when no input file was given.
        cfg['file'] = positional[0]
    except (GetoptError, KeyError, IndexError):
        help()
if __name__ == '__main__':
    # Command line -> cfg, then read the input table.
    extract_args(argv[1:])
    file_path = config('file')
    # The with-block closes the file even if load_csv raises.
    with open(file_path, 'r') as f:
        load_csv(f)

    process_xy(config('x'), config('y'))
    b, a, r, n = compute_rel()

    if config('verbose', default=False):
        # Dump the samples that went into the computation.
        print('\tx\t,\ty')
        for x_val, y_val in zip(x, y):
            print('%f,\t%f' % (x_val, y_val))

    # Summary record: file, x column, y column, sample count, slope (b),
    # intercept (a), correlation coefficient (r).
    print('%s,%s,%s,%d,%f,%f,%f' % (file_path, config('x'), config('y'),
                                    n, b, a, r))