-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFeatureExtraction.py
91 lines (84 loc) · 4.05 KB
/
FeatureExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Module-level author attribution.
__author__ = 'elizajasin'
def makeAtribut(data):
    """Build an empty unigram feature table.

    Args:
        data: Sequence of sentences, each a sequence of hashable tokens.

    Returns:
        A dict with one key per distinct token (in order of first
        appearance), each mapped to its own empty list.
    """
    # A repeated token keeps its first position and is simply re-bound to a
    # fresh empty list, so every key ends up with an independent list.
    return {token: [] for sentence in data for token in sentence}
def makeAtributBigram(data):
    """Build an empty bigram feature table.

    Keys have the form ``'<token>|<previous token>'``. The first token of a
    sentence is paired with the start marker (``'<token>|<s>'``) and the last
    token yields an end-of-sentence key (``'</s>|<last token>'``).

    Args:
        data: Sequence of sentences, each a sequence of string tokens.

    Returns:
        A dict mapping every bigram key found in ``data`` to its own empty
        list. Empty sentences contribute no keys.
    """
    atribut = {}
    for sentence in data:
        for j, token in enumerate(sentence):
            # Only the first token pairs with '<s>'. Looking at index j - 1
            # when j == 0 would wrap around to the last token of the sentence.
            if j == 0:
                key = token + '|<s>'
            else:
                key = token + '|' + sentence[j - 1]
            atribut.setdefault(key, [])
        # The end marker is registered once, after the last token; a test on
        # the loop index reaching len(sentence) can never succeed.
        if sentence:
            atribut.setdefault('</s>|' + sentence[-1], [])
    return atribut
def makeAtributTrigram(data):
    """Build an empty trigram feature table.

    Keys have the form ``'<token>|<token two back>,<token one back>'``. The
    second token of a sentence uses the start marker
    (``'<token>|<s>,<first token>'``) and the final two tokens yield an
    end-of-sentence key (``'</s>|<second to last>,<last>'``).

    Args:
        data: Sequence of sentences, each a sequence of string tokens.

    Returns:
        A dict mapping every trigram key found in ``data`` to its own empty
        list. Sentences with fewer than two tokens contribute no keys.
    """
    atribut = {}
    for sentence in data:
        for j in range(1, len(sentence)):
            # Only position 1 pairs with '<s>'. Looking at index j - 2 when
            # j == 1 would wrap around to the last token of the sentence.
            if j == 1:
                key = sentence[j] + '|<s>,' + sentence[j - 1]
            else:
                key = sentence[j] + '|' + sentence[j - 2] + ',' + sentence[j - 1]
            atribut.setdefault(key, [])
        # The end marker is registered once, after the last token; a test on
        # the loop index reaching len(sentence) can never succeed.
        if len(sentence) >= 2:
            atribut.setdefault('</s>|' + sentence[-2] + ',' + sentence[-1], [])
    return atribut
def sumFE(data, atribut):
    """Count unigram occurrences per sentence.

    For every sentence in ``data`` one new entry is appended to each list in
    ``atribut``; the entry holds how many times that key occurs in the
    sentence. Tokens that are not keys of ``atribut`` are ignored.

    Args:
        data: Sequence of sentences, each a sequence of hashable tokens.
        atribut: Dict mapping feature key to a list of per-sentence counts.
            It is modified in place.

    Returns:
        The same ``atribut`` dict that was passed in.
    """
    for sentence in data:
        for counts in atribut.values():
            counts.append(0)
        for token in sentence:
            if token in atribut:
                # Write to the slot appended above rather than to the
                # sentence index, so lists that already contain rows are
                # extended instead of having earlier rows modified.
                atribut[token][-1] += 1
    return atribut
def sumFEBigram(data, atribut):
    """Flag which bigram features occur in each sentence.

    For every sentence in ``data`` one new entry is appended to each list in
    ``atribut``. The entry is 1 when the bigram key occurs in the sentence
    and 0 otherwise (presence, not frequency). Keys use the forms
    ``'<token>|<s>'`` for the first token, ``'<token>|<previous token>'`` for
    later tokens and ``'</s>|<last token>'`` for the sentence end. Bigrams
    that are not keys of ``atribut`` are ignored.

    Args:
        data: Sequence of sentences, each a sequence of string tokens.
        atribut: Dict mapping bigram key to a list of per-sentence flags.
            It is modified in place.

    Returns:
        The same ``atribut`` dict that was passed in.
    """
    for sentence in data:
        for flags in atribut.values():
            flags.append(0)
        for j, token in enumerate(sentence):
            # Only the first token pairs with '<s>'. Looking at index j - 1
            # when j == 0 would wrap around to the last token of the sentence.
            if j == 0:
                key = token + '|<s>'
            else:
                key = token + '|' + sentence[j - 1]
            if key in atribut:
                # [-1] is the slot appended above for this sentence.
                atribut[key][-1] = 1
        # The end marker is handled once, after the last token; a test on the
        # loop index reaching len(sentence) can never succeed.
        if sentence:
            end_key = '</s>|' + sentence[-1]
            if end_key in atribut:
                atribut[end_key][-1] = 1
    return atribut
def sumFETrigram(data, atribut):
    """Flag which trigram features occur in each sentence.

    For every sentence in ``data`` one new entry is appended to each list in
    ``atribut``. The entry is 1 when the trigram key occurs in the sentence
    and 0 otherwise (presence, not frequency). Keys use the forms
    ``'<token>|<s>,<first token>'`` for the second token,
    ``'<token>|<token two back>,<token one back>'`` for later tokens and
    ``'</s>|<second to last>,<last>'`` for the sentence end. Trigrams that
    are not keys of ``atribut`` are ignored.

    Args:
        data: Sequence of sentences, each a sequence of string tokens.
        atribut: Dict mapping trigram key to a list of per-sentence flags.
            It is modified in place.

    Returns:
        The same ``atribut`` dict that was passed in.
    """
    for sentence in data:
        for flags in atribut.values():
            flags.append(0)
        for j in range(1, len(sentence)):
            # Only position 1 pairs with '<s>'. Looking at index j - 2 when
            # j == 1 would wrap around to the last token of the sentence.
            if j == 1:
                key = sentence[j] + '|<s>,' + sentence[j - 1]
            else:
                key = sentence[j] + '|' + sentence[j - 2] + ',' + sentence[j - 1]
            if key in atribut:
                # [-1] is the slot appended above for this sentence.
                atribut[key][-1] = 1
        # The end marker is handled once, after the last token; a test on the
        # loop index reaching len(sentence) can never succeed.
        if len(sentence) >= 2:
            end_key = '</s>|' + sentence[-2] + ',' + sentence[-1]
            if end_key in atribut:
                atribut[end_key][-1] = 1
    return atribut
def normalisasi(atribut):
    """Divide every feature column by its own total.

    Args:
        atribut: Dict mapping feature key to a list of numeric values.

    Returns:
        A new dict with the same keys. Each value is a list of strings, one
        per input entry, holding the entry divided by the column total and
        rounded to two decimals. A column whose total is zero is returned as
        all ``'0.0'`` instead of raising ``ZeroDivisionError``.
    """
    norm_atribut = {}
    for key, values in atribut.items():
        total = sum(values)
        if total == 0:
            # Nothing to scale by: report every entry as zero rather than
            # dividing by zero.
            norm_atribut[key] = [str(0.0) for _ in values]
        else:
            norm_atribut[key] = [str(round(value / total, 2)) for value in values]
    return norm_atribut