
Commit eff7bd6

Add files via upload
1 parent ccf92a8 commit eff7bd6

8 files changed, +985 −0 lines changed

CNNs/BoW.py

+107
@@ -0,0 +1,107 @@
#-*- coding: utf-8 -*-

#BoW.py
#Naive Bayes for comparison

import os
import re
import locale
from collections import Counter
import numpy as np

#directories for training and test
directory_train = '/Users/semihakbayrak/ConvNet4/training'
directory_test = '/Users/semihakbayrak/ConvNet4/test'

#categories
categorylist = []
category_vocabularies = {}
for filename in os.listdir(directory_train):
    categorylist.append(filename)
for category in categorylist:
    if category == '.DS_Store':
        categorylist.remove('.DS_Store')

priors = np.array([])
conclistall = []
for category in categorylist:
    directory_category = directory_train+'/'+str(category)
    textlist = []

    for filename in os.listdir(directory_category):
        textlist.append(filename)

    for text in textlist:
        if text == '.DS_Store':
            textlist.remove('.DS_Store')

    priors = np.append(priors,1.0*len(textlist)) #prior probabilities for categories
    conclist1 = []

    #tokenization and training
    for text in textlist:
        directory_text = directory_category+'/'+str(text)
        textfile = open(directory_text,'r').read()
        tok = textfile.decode('utf-8','ignore')
        tok2 = tok.lower()
        tok3 = re.sub(r"\d+",'',tok2,flags=re.U) #remove numbers
        tok4 = re.sub(r"\W+",'\n',tok3,flags=re.U) #replace non-alphanumerics with new lines
        conclist2 = tok4.split()
        conclist1 = conclist1 + conclist2 #all the words used by the category, in a list

    category_vocabularies[str(category)] = Counter(conclist1) #word frequencies per category, kept in dictionaries
    conclistall = conclistall + conclist1 #all the words used by all categories, in a list

vocabulary = Counter(conclistall) #vocabulary with word frequencies in a dictionary
priors = priors/sum(priors) #prior normalization

#number of words used by each category
wordcountscategory = []
for i in range(len(categorylist)):
    wordcountscategory.append(sum(category_vocabularies[str(categorylist[i])].values()))

counttrue = 0
countfalse = 0

for category in categorylist:
    directory_category = directory_test+'/'+str(category)
    textlist = []
    for filename in os.listdir(directory_category):
        textlist.append(filename)
    for text in textlist:
        if text == '.DS_Store':
            textlist.remove('.DS_Store')
    for text in textlist:
        #tokenization for test inputs
        directory_text = directory_category+'/'+str(text)
        textfile = open(directory_text,'r').read()
        tok = textfile.decode('utf-8','ignore')
        tok2 = tok.lower()
        tok3 = re.sub(r"\d+",'',tok2,flags=re.U)
        tok4 = re.sub(r"\W+",'\n',tok3,flags=re.U)
        tlist = tok4.split()
        #multinomial naive bayes with laplace smoothing
        logprob = np.log2(priors)
        alfa = 0.01
        for word in tlist:
            for i in range(len(categorylist)):
                if word in category_vocabularies[str(categorylist[i])]:
                    probw = 1.0*(category_vocabularies[str(categorylist[i])][word]+alfa)/(wordcountscategory[i]+alfa*len(vocabulary))
                else:
                    probw = 1.0*alfa/(wordcountscategory[i]+alfa*len(vocabulary))
                logprob[i] = logprob[i] + np.log2(probw)
        print categorylist[np.argmax(logprob)]
        #count correct and incorrect instances
        if categorylist[np.argmax(logprob)]==category:
            print '1'
            counttrue = counttrue + 1
        else:
            print '0'
            countfalse = countfalse + 1


accuracy = 1.0*counttrue/(counttrue+countfalse)
print accuracy
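Note: the scoring loop above accumulates the Laplace-smoothed multinomial estimate P(w|c) = (count(w,c) + alfa) / (N_c + alfa*|V|) in log2 space, together with the log prior, and predicts the argmax category. The following is a minimal self-contained sketch of that same scoring rule with toy counts; the names cat_counts, priors, vocab_size, and score are illustrative and not part of this commit.

    # Sketch of the Laplace-smoothed scoring used in BoW.py (toy data, hypothetical names).
    import numpy as np
    from collections import Counter

    cat_counts = {'spor': Counter({'mac': 4, 'gol': 2}),        # toy word counts per category
                  'ekonomi': Counter({'borsa': 3, 'dolar': 3})}
    priors = {'spor': 0.5, 'ekonomi': 0.5}
    vocab_size = 4                                              # |V| over all categories
    alfa = 0.01                                                 # smoothing constant, as in BoW.py

    def score(words, cat):
        total = sum(cat_counts[cat].values())                   # N_c: words seen in this category
        logp = np.log2(priors[cat])
        for w in words:
            # Counter returns 0 for unseen words, so both branches of BoW.py collapse into one expression
            probw = (cat_counts[cat][w] + alfa) / float(total + alfa * vocab_size)
            logp += np.log2(probw)
        return logp

    doc = ['mac', 'gol', 'borsa']
    print(max(cat_counts, key=lambda c: score(doc, c)))         # predicted category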

CNNs/CNN_final_report.pdf

270 KB
Binary file not shown.

CNNs/convNet4.py

+259
@@ -0,0 +1,259 @@
#-*- coding: utf-8 -*-

#convNet4.py
#training part of CNN

from inputCreator3 import InOut
import os
import re
import locale
from collections import Counter
import numpy as np
import gensim

#Preparation of inputs and outputs
directory = '/Users/semihakbayrak/ConvNet4/training'
category_to_be_trained = 'spor'
objInOut = InOut(directory,category_to_be_trained)
inp_train,out_train = objInOut.inout_return()

#Stop words
fileNameSW = '/Users/semihakbayrak/ConvNet4/turkish_stopwords.txt'
textfile = open(fileNameSW,'r').read()
textfile = textfile.decode('utf-8')
textfile = textfile.split()
stop_words = [w for w in textfile]

#Previously trained word2vec 50-dimensional vectors
fname = "/Users/semihakbayrak/ConvNet4/42bin_haber_w2v_2"
model = gensim.models.Word2Vec.load(fname)
vocab = list(model.vocab.keys())

def weight_init(dim):
    w = np.random.uniform(-0.01,0.01,dim)
    return w

def func_sigmoid(x):
    f = 1.0/(1.0+np.exp(-x))
    return f

def func_threshold(x):
    if x>=0:
        return 1.0
    else:
        return 0

def forward_conv3_prop(M,weight1,weight2,weight3,weight0):
    b = np.zeros((len(M)-2))
    for i in range(len(b)):
        b[i] = np.inner(M[i],weight1) + np.inner(M[i+1],weight2) + np.inner(M[i+2],weight3) + weight0[0]
    c = func_sigmoid(b)
    return c

def forward_conv4_prop(M,weight1,weight2,weight3,weight4,weight0):
    b = np.zeros((len(M)-3))
    for i in range(len(b)):
        b[i] = np.inner(M[i],weight1) + np.inner(M[i+1],weight2) + np.inner(M[i+2],weight3) + np.inner(M[i+3],weight4) + weight0[0]
    c = func_sigmoid(b)
    return c

def forward_conv5_prop(M,weight1,weight2,weight3,weight4,weight5,weight0):
    b = np.zeros((len(M)-4))
    for i in range(len(b)):
        b[i] = np.inner(M[i],weight1) + np.inner(M[i+1],weight2) + np.inner(M[i+2],weight3) + np.inner(M[i+3],weight4) + np.inner(M[i+4],weight5) + weight0[0]
    c = func_sigmoid(b)
    return c

def training(inp,out,numepoch,direc,stopWords,nu1=0.1,nu2=0.2):
    #sentence level weight initialization for the filter with window size 3
    u1 = weight_init(50)
    u2 = weight_init(50)
    u3 = weight_init(50)
    u0 = weight_init(1)
    #sentence level weight initialization for the filter with window size 4
    v1 = weight_init(50)
    v2 = weight_init(50)
    v3 = weight_init(50)
    v4 = weight_init(50)
    v0 = weight_init(1)
    #sentence level weight initialization for the filter with window size 5
    w1 = weight_init(50)
    w2 = weight_init(50)
    w3 = weight_init(50)
    w4 = weight_init(50)
    w5 = weight_init(50)
    w0 = weight_init(1)
    #last layer weight initialization
    p = weight_init(15)
    p0 = weight_init(1)[0]
    for epoch in range(numepoch):
        count_valid_docs = 0
        y_store = np.array([])
        r_store = np.array([])
        #going over all shuffled documents in the training set
        for doc in range(len(inp)):
            documentVector = np.array([]) #document level array
            documentIndexVector = np.array([]) #to keep index values of max vals for each sentence
            documentVector3 = np.array([]) #document level array for the window-size-3 filter
            documentIndexVector3 = np.array([]) #indices of max vals per sentence for the window-size-3 filter
            documentVector4 = np.array([]) #document level array for the window-size-4 filter
            documentIndexVector4 = np.array([]) #indices of max vals per sentence for the window-size-4 filter
            documentVector5 = np.array([]) #document level array for the window-size-5 filter
            documentIndexVector5 = np.array([]) #indices of max vals per sentence for the window-size-5 filter
            sM_keeper = {} #to keep sentence matrices, used during back propagation
            cA3_keeper = {} #to keep c arrays, used during back propagation
            cA4_keeper = {} #to keep c arrays, used during back propagation
            cA5_keeper = {} #to keep c arrays, used during back propagation
            doc_dir = direc+'/'+str(inp[doc][0])
            doc_name = doc_dir+'/'+str(inp[doc][1])
            textfile = open(doc_name,'r').read()
            textfile = textfile.decode('utf-8','ignore')
            doclist = [ line for line in textfile ]
            docstr = ''.join(doclist)
            sentences = re.split(r'[.!?]', docstr)
            #going over all sentences in the document
            count_valid_sentences = 0
            for s in range(len(sentences)):
                sentence = sentences[s]
                sentence = sentence.lower() #lower-case all the characters
                sentence = re.sub(r"\d+",'',sentence,flags=re.U) #remove numbers
                sentence = re.sub(r"\W+",'\n',sentence,flags=re.U) #replace non-alphanumerics with new lines
                sentence = sentence.split() #array formed by the words in the sentence
                words = [w for w in sentence if w not in stopWords] #eliminating stop words
                sentence = ' '.join(words)
                wordlist = sentence.split() #array formed by the remaining words in the sentence
                sentenceMatrix = np.empty((0,50),float) #sentence level matrix
                for word_index in range(len(wordlist)):
                    if len(wordlist[word_index])>=2:
                        if (wordlist[word_index]) in vocab:
                            sentenceMatrix = np.append(sentenceMatrix,[model[wordlist[word_index]]],axis=0) #filling matrix with word vectors
                #forward propagation in sentence level
                if len(sentenceMatrix)>=6:
                    c3_array = forward_conv3_prop(sentenceMatrix,u1,u2,u3,u0)
                    c4_array = forward_conv4_prop(sentenceMatrix,v1,v2,v3,v4,v0)
                    c5_array = forward_conv5_prop(sentenceMatrix,w1,w2,w3,w4,w5,w0)
                    #max-pooling
                    m3 = c3_array.max()
                    m3_index = c3_array.argmax()
                    m4 = c4_array.max()
                    m4_index = c4_array.argmax()
                    m5 = c5_array.max()
                    m5_index = c5_array.argmax()
                    documentVector3 = np.append(documentVector3,m3) #filling array with sentence representations
                    documentIndexVector3 = np.append(documentIndexVector3,m3_index) #filling array with indices of max vals
                    documentVector4 = np.append(documentVector4,m4) #filling array with sentence representations
                    documentIndexVector4 = np.append(documentIndexVector4,m4_index) #filling array with indices of max vals
                    documentVector5 = np.append(documentVector5,m5) #filling array with sentence representations
                    documentIndexVector5 = np.append(documentIndexVector5,m5_index) #filling array with indices of max vals
                    sM_keeper[count_valid_sentences] = sentenceMatrix #keep valid sentence matrix for back propagation
                    cA3_keeper[count_valid_sentences] = c3_array[m3_index] #keep max c vals for back propagation
                    cA4_keeper[count_valid_sentences] = c4_array[m4_index] #keep max c vals for back propagation
                    cA5_keeper[count_valid_sentences] = c5_array[m5_index] #keep max c vals for back propagation
                    count_valid_sentences = count_valid_sentences + 1
            documentVector = np.concatenate((documentVector3,documentVector4),axis=0)
            documentVector = np.concatenate((documentVector,documentVector5),axis=0)
            if len(documentVector)>=21:
                #max-5-pooling in document level
                s3_index = documentVector3.argsort()[-5:][::-1]
                s3_array = np.zeros((5))
                s4_index = documentVector4.argsort()[-5:][::-1]
                s4_array = np.zeros((5))
                s5_index = documentVector5.argsort()[-5:][::-1]
                s5_array = np.zeros((5))
                for i in range(5):
                    s3_array[i] = documentVector3[s3_index[i]]
                    s4_array[i] = documentVector4[s4_index[i]]
                    s5_array[i] = documentVector5[s5_index[i]]
                s_array = np.concatenate((s3_array,s4_array),axis=0)
                s_array = np.concatenate((s_array,s5_array),axis=0)
                s_index = np.concatenate((s3_index,s4_index),axis=0)
                s_index = np.concatenate((s_index,s5_index),axis=0)
                o = np.inner(s_array,p) + p0
                y = func_sigmoid(o)
                #backpropagation
                r = out[doc]
                #delta_p
                delta_p = nu1*(r-y)*s_array
                delta_p0 = nu1*(r-y)
                #delta_u, delta_v, delta_w
                delta_u1 = np.zeros((50))
                delta_u2 = np.zeros((50))
                delta_u3 = np.zeros((50))
                delta_u0 = 0
                delta_v1 = np.zeros((50))
                delta_v2 = np.zeros((50))
                delta_v3 = np.zeros((50))
                delta_v4 = np.zeros((50))
                delta_v0 = 0
                delta_w1 = np.zeros((50))
                delta_w2 = np.zeros((50))
                delta_w3 = np.zeros((50))
                delta_w4 = np.zeros((50))
                delta_w5 = np.zeros((50))
                delta_w0 = 0
                for h in range(15):
                    j = s_index[h]
                    sM = sM_keeper[j]
                    if h<5:
                        cA = cA3_keeper[j]
                        i = int(documentIndexVector3[j]) #cast pooled index to int for row indexing
                        delta_u1 = delta_u1 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i]
                        delta_u2 = delta_u2 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+1]
                        delta_u3 = delta_u3 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+2]
                        delta_u0 = delta_u0 + nu2*(r-y)*p[h]*cA*(1-cA)
                    elif h<10:
                        cA = cA4_keeper[j]
                        i = int(documentIndexVector4[j]) #cast pooled index to int for row indexing
                        delta_v1 = delta_v1 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i]
                        delta_v2 = delta_v2 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+1]
                        delta_v3 = delta_v3 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+2]
                        delta_v4 = delta_v4 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+3]
                        delta_v0 = delta_v0 + nu2*(r-y)*p[h]*cA*(1-cA)
                    else:
                        cA = cA5_keeper[j]
                        i = int(documentIndexVector5[j]) #cast pooled index to int for row indexing
                        delta_w1 = delta_w1 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i]
                        delta_w2 = delta_w2 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+1]
                        delta_w3 = delta_w3 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+2]
                        delta_w4 = delta_w4 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+3]
                        delta_w5 = delta_w5 + nu2*(r-y)*p[h]*cA*(1-cA)*sM[i+4]
                        delta_w0 = delta_w0 + nu2*(r-y)*p[h]*cA*(1-cA)
                #updating weights
                u1 = u1 + delta_u1
                u2 = u2 + delta_u2
                u3 = u3 + delta_u3
                u0 = u0 + delta_u0
                v1 = v1 + delta_v1
                v2 = v2 + delta_v2
                v3 = v3 + delta_v3
                v4 = v4 + delta_v4
                v0 = v0 + delta_v0
                w1 = w1 + delta_w1
                w2 = w2 + delta_w2
                w3 = w3 + delta_w3
                w4 = w4 + delta_w4
                w5 = w5 + delta_w5
                w0 = w0 + delta_w0
                p = p + delta_p
                p0 = p0 + delta_p0
                #store y and r values
                y_store = np.append(y_store,y)
                r_store = np.append(r_store,r)
                count_valid_docs = count_valid_docs + 1
                print r
                print y
                print doc
                if count_valid_docs==50:
                    cross_entropy = -1*np.inner(r_store,np.log(y_store)) #Cross-Entropy
                    count_valid_docs = 0
                    y_store = np.array([])
                    r_store = np.array([])
                    print "CROSS-ENTROPY"
                    print cross_entropy
    #save the learned weights to a txt file
    with open(category_to_be_trained+'_weights.txt',"w") as cat_f:
        cat_f.write("\n".join(" ".join(map(str, x)) for x in (u1,u2,u3,u0,v1,v2,v3,v4,v0,w1,w2,w3,w4,w5,w0,p,np.array([p0]))))


training(inp_train,out_train,1,directory,stop_words)
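Note: each forward_convK_prop helper above slides a window of K consecutive 50-dimensional word vectors over the sentence matrix, takes the dot product with K filter rows plus a bias, and applies a sigmoid, i.e. a 1-D convolution over the sentence. The snippet below is a vectorized sketch of the window-3 case for illustration only; forward_conv3_vec is a hypothetical helper and is not part of this commit.

    # Vectorized sketch equivalent to forward_conv3_prop (illustrative, not from the repository).
    import numpy as np

    def forward_conv3_vec(M, w1, w2, w3, w0):
        # M: (num_words, 50) sentence matrix; w1, w2, w3: (50,) filter rows; w0: (1,) bias
        b = M[:-2].dot(w1) + M[1:-1].dot(w2) + M[2:].dot(w3) + w0[0]
        return 1.0 / (1.0 + np.exp(-b))  # sigmoid, same as func_sigmoid

    # Toy shape check: a 7-word sentence yields 7 - 3 + 1 = 5 window activations.
    M = np.random.uniform(-1.0, 1.0, (7, 50))
    w1 = np.random.uniform(-0.01, 0.01, 50)
    w2 = np.random.uniform(-0.01, 0.01, 50)
    w3 = np.random.uniform(-0.01, 0.01, 50)
    w0 = np.random.uniform(-0.01, 0.01, 1)
    print(forward_conv3_vec(M, w1, w2, w3, w0).shape)  # (5,)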
