| 1 | +#-*- coding: utf-8 -*- |
| 2 | + |
| 3 | +#convNet4.py |
| 4 | +#training part of CNN |
| 5 | + |
| 6 | +from inputCreator3 import InOut |
| 7 | +import os |
| 8 | +import re |
| 9 | +import locale |
| 10 | +from collections import Counter |
| 11 | +import numpy as np |
| 12 | +import gensim |
| 13 | + |
# Preparation of inputs and outputs.
# InOut is a project class; inout_return() supplies the (inputs, outputs) pair
# that is handed to training() at the bottom of this file.
directory = '/Users/semihakbayrak/ConvNet4/training'
category_to_be_trained = 'spor'
objInOut = InOut(directory, category_to_be_trained)
inp_train, out_train = objInOut.inout_return()

# Stop words: a UTF-8 text file of whitespace-separated tokens.
fileNameSW = '/Users/semihakbayrak/ConvNet4/turkish_stopwords.txt'
# Use a context manager so the file handle is closed as soon as it is read
# (the previous open(...).read() left closing to the garbage collector).
with open(fileNameSW, 'r') as stopword_file:
    textfile = stopword_file.read()
textfile = textfile.decode('utf-8')  # Python 2: byte string -> unicode
textfile = textfile.split()
stop_words = list(textfile)

# Previously trained word2vec model (50-dimensional vectors).
fname = "/Users/semihakbayrak/ConvNet4/42bin_haber_w2v_2"
model = gensim.models.Word2Vec.load(fname)
# All words known to the model; training() uses this for membership tests.
vocab = list(model.vocab.keys())
| 31 | + |
def weight_init(dim):
    """Return freshly drawn initial weights.

    Values are sampled uniformly from [-0.01, 0.01) with ``np.random.uniform``.

    Args:
        dim: size (or shape) passed straight through as the ``size`` argument.

    Returns:
        A numpy array of ``dim`` random values.
    """
    low, high = -0.01, 0.01
    return np.random.uniform(low, high, dim)
| 35 | + |
def func_sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``.
    This version only ever exponentiates a non-positive number, so it is
    safe for any finite input.

    Args:
        x: a scalar or an array-like of numbers.

    Returns:
        A numpy scalar for scalar input, otherwise an array of the same shape
        as ``x``, with values in [0, 1].
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] turns a 0-d array back into a scalar and leaves n-d arrays as is.
    return result[()]
| 39 | + |
def func_threshold(x):
    """Hard threshold (step) activation.

    Args:
        x: a scalar number.

    Returns:
        1.0 when ``x >= 0``, otherwise 0.0. Both branches return a float so
        callers always get the same type.
    """
    return 1.0 if x >= 0 else 0.0
| 45 | + |
def forward_conv3_prop(M, weight1, weight2, weight3, weight0):
    """Slide a 3-row filter over ``M`` and apply the sigmoid.

    For each start row ``i`` the pre-activation is
    ``M[i].weight1 + M[i+1].weight2 + M[i+2].weight3 + weight0[0]``.

    Args:
        M: sequence of row vectors (presumably the sentence matrix, one word
            vector per row -- confirm against the caller).
        weight1, weight2, weight3: one weight vector per window position.
        weight0: indexable bias holder; only ``weight0[0]`` is read.

    Returns:
        Array of ``len(M) - 2`` sigmoid activations, one per window position.
    """
    taps = (weight1, weight2, weight3)
    pre_activation = np.zeros(len(M) - len(taps) + 1)
    for start in range(len(pre_activation)):
        total = np.inner(M[start], taps[0])
        for offset in range(1, len(taps)):
            total = total + np.inner(M[start + offset], taps[offset])
        pre_activation[start] = total + weight0[0]
    return func_sigmoid(pre_activation)
| 52 | + |
def forward_conv4_prop(M, weight1, weight2, weight3, weight4, weight0):
    """Slide a 4-row filter over ``M`` and apply the sigmoid.

    For each start row ``i`` the pre-activation is
    ``M[i].weight1 + M[i+1].weight2 + M[i+2].weight3 + M[i+3].weight4
    + weight0[0]``.

    Args:
        M: sequence of row vectors (presumably the sentence matrix, one word
            vector per row -- confirm against the caller).
        weight1 .. weight4: one weight vector per window position.
        weight0: indexable bias holder; only ``weight0[0]`` is read.

    Returns:
        Array of ``len(M) - 3`` sigmoid activations, one per window position.
    """
    taps = (weight1, weight2, weight3, weight4)
    pre_activation = np.zeros(len(M) - len(taps) + 1)
    for start in range(len(pre_activation)):
        total = np.inner(M[start], taps[0])
        for offset in range(1, len(taps)):
            total = total + np.inner(M[start + offset], taps[offset])
        pre_activation[start] = total + weight0[0]
    return func_sigmoid(pre_activation)
| 59 | + |
def forward_conv5_prop(M, weight1, weight2, weight3, weight4, weight5, weight0):
    """Slide a 5-row filter over ``M`` and apply the sigmoid.

    For each start row ``i`` the pre-activation is the sum of
    ``M[i+k].weight(k+1)`` for k = 0..4, plus ``weight0[0]``.

    Args:
        M: sequence of row vectors (presumably the sentence matrix, one word
            vector per row -- confirm against the caller).
        weight1 .. weight5: one weight vector per window position.
        weight0: indexable bias holder; only ``weight0[0]`` is read.

    Returns:
        Array of ``len(M) - 4`` sigmoid activations, one per window position.
    """
    taps = (weight1, weight2, weight3, weight4, weight5)
    pre_activation = np.zeros(len(M) - len(taps) + 1)
    for start in range(len(pre_activation)):
        total = np.inner(M[start], taps[0])
        for offset in range(1, len(taps)):
            total = total + np.inner(M[start + offset], taps[offset])
        pre_activation[start] = total + weight0[0]
    return func_sigmoid(pre_activation)
| 66 | + |
def _binary_cross_entropy(targets, outputs, eps=1e-12):
    """Return -sum(r*log(y) + (1-r)*log(1-y)); y is clipped away from 0 and 1."""
    targets = np.asarray(targets, dtype=float)
    outputs = np.clip(np.asarray(outputs, dtype=float), eps, 1.0 - eps)
    return -(np.inner(targets, np.log(outputs))
             + np.inner(1.0 - targets, np.log(1.0 - outputs)))


def _save_weights(path, filters, p, p0):
    """Write all weights to ``path``: one space-separated vector per line."""
    rows = []
    for taps, bias in filters:
        rows.extend(taps)   # u1..u3 / v1..v4 / w1..w5
        rows.append(bias)   # u0 / v0 / w0
    rows.append(p)
    rows.append(np.array([p0]))
    with open(path, "w") as cat_f:
        cat_f.write("\n".join(" ".join(map(str, x)) for x in rows))


def training(inp, out, numepoch, direc, stopWords, nu1=0.1, nu2=0.2):
    """Train the sentence/document network one document at a time.

    Pipeline per document:
      1. The file ``direc/<inp[doc][0]>/<inp[doc][1]>`` is read, decoded as
         UTF-8 (undecodable bytes ignored) and split into sentences on
         ``.``, ``!`` and ``?``.
      2. Each sentence is lower-cased, stripped of digits and split on
         non-word characters. Stop words, words shorter than 2 characters
         and words missing from the word2vec vocabulary are dropped; the
         rest become rows of a sentence matrix of word vectors.
      3. Sentences with at least 6 rows go through the three convolution
         filters (window sizes 3, 4 and 5); each filter output is max-pooled
         to a single value per sentence.
      4. Documents with at least 7 such sentences keep the 5 largest pooled
         values per filter (15 features), which feed one sigmoid output unit.
      5. All weights are updated immediately from the error ``r - y``.

    Every 50 trained documents the cross-entropy of that batch is printed and
    the weights are written to ``<category_to_be_trained>_weights.txt``; the
    weights are written once more after the last epoch if updates happened
    since the last save.

    Reads the module-level names ``model`` (word -> vector lookup), ``vocab``
    and ``category_to_be_trained``.

    Args:
        inp: per-document pairs; element 0 is the sub-directory and element 1
            the file name, both converted with ``str``.
        out: per-document targets, indexed like ``inp``.
            NOTE(review): assumed to be 0/1 labels -- confirm against InOut.
        numepoch: number of passes over ``inp``.
        direc: root directory of the training documents.
        stopWords: iterable of words to discard.
        nu1: learning rate of the output layer.
        nu2: learning rate of the convolution filters.

    Returns:
        None. Results are printed and saved to the weights file.
    """
    vec_dim = 50          # dimensionality of the word vectors
    windows = (3, 4, 5)   # filter window sizes
    forward_props = (forward_conv3_prop, forward_conv4_prop, forward_conv5_prop)
    top_k = 5             # pooled values kept per filter at document level
    min_words = 6         # minimum word vectors for a sentence to be used
    min_features = 21     # minimum pooled values (3 per sentence) per document
    report_every = 50     # documents between cross-entropy reports / saves

    # Weight initialisation. The order of weight_init calls matches the
    # historical one (taps then bias for each filter, then p, then p0) so a
    # seeded run draws the same initial values.
    filters = []
    for width in windows:
        taps = [weight_init(vec_dim) for _ in range(width)]
        bias = weight_init(1)
        filters.append([taps, bias])
    p = weight_init(top_k * len(windows))
    p0 = weight_init(1)[0]

    # Sets give O(1) membership tests; the lists were scanned for every word.
    stop_set = set(stopWords)
    known_words = set(vocab)
    weights_path = category_to_be_trained + '_weights.txt'
    unsaved_updates = False

    for epoch in range(numepoch):
        count_valid_docs = 0
        y_store = []
        r_store = []
        # going over all documents in the training set
        for doc in range(len(inp)):
            doc_dir = direc + '/' + str(inp[doc][0])
            doc_name = doc_dir + '/' + str(inp[doc][1])
            with open(doc_name, 'r') as doc_file:  # closed right after reading
                docstr = doc_file.read()
            docstr = docstr.decode('utf-8', 'ignore')

            pooled = [[] for _ in windows]    # max activation per sentence, per filter
            argmaxes = [[] for _ in windows]  # window start of that maximum (int)
            sentence_matrices = []            # kept for back propagation

            # going over all sentences in the document
            for sentence in re.split(r'[.!?]', docstr):
                sentence = sentence.lower()
                sentence = re.sub(r"\d+", '', sentence, flags=re.U)     # remove numbers
                sentence = re.sub(r"\W+", '\n', sentence, flags=re.U)   # non-word runs -> separator
                rows = [model[word] for word in sentence.split()
                        if word not in stop_set
                        and len(word) >= 2
                        and word in known_words]
                if len(rows) < min_words:
                    continue
                sentence_matrix = np.array(rows, dtype=float)
                # forward propagation + max-pooling at sentence level
                for k, forward in enumerate(forward_props):
                    taps, bias = filters[k]
                    c_array = forward(sentence_matrix, *(taps + [bias]))
                    # Stored as a plain int: it is used as a row index during
                    # back propagation and NumPy rejects float indices.
                    best = int(c_array.argmax())
                    pooled[k].append(c_array[best])
                    argmaxes[k].append(best)
                sentence_matrices.append(sentence_matrix)

            if len(windows) * len(sentence_matrices) < min_features:
                continue

            # max-5-pooling at document level
            s_array = np.zeros(top_k * len(windows))
            top_sentences = []
            for k in range(len(windows)):
                doc_vector = np.array(pooled[k])
                order = doc_vector.argsort()[-top_k:][::-1]  # largest first
                s_array[k * top_k:(k + 1) * top_k] = doc_vector[order]
                top_sentences.append(order)
            y = func_sigmoid(np.inner(s_array, p) + p0)

            # back propagation
            r = out[doc]
            error = r - y
            delta_p = nu1 * error * s_array
            delta_p0 = nu1 * error
            deltas = []
            for k, width in enumerate(windows):
                delta_taps = [np.zeros(vec_dim) for _ in range(width)]
                delta_bias = 0
                for slot in range(top_k):
                    h = k * top_k + slot          # position in s_array / p
                    j = top_sentences[k][slot]    # sentence that produced it
                    sM = sentence_matrices[j]
                    cA = pooled[k][j]
                    i = argmaxes[k][j]            # first row of the winning window
                    # error signal through the output weight and the sigmoid
                    grad = nu2 * error * p[h] * cA * (1 - cA)
                    for offset in range(width):
                        delta_taps[offset] = delta_taps[offset] + grad * sM[i + offset]
                    delta_bias = delta_bias + grad
                deltas.append((delta_taps, delta_bias))

            # updating weights (all deltas were computed with the old weights)
            for k, (delta_taps, delta_bias) in enumerate(deltas):
                taps, bias = filters[k]
                new_taps = [tap + delta for tap, delta in zip(taps, delta_taps)]
                filters[k] = [new_taps, bias + delta_bias]
            p = p + delta_p
            p0 = p0 + delta_p0
            unsaved_updates = True

            # store y and r values
            y_store.append(y)
            r_store.append(r)
            count_valid_docs = count_valid_docs + 1
            print(r)
            print(y)
            print(doc)
            if count_valid_docs == report_every:
                # Full binary cross-entropy, including the (1-r)*log(1-y) term.
                cross_entropy = _binary_cross_entropy(r_store, y_store)
                count_valid_docs = 0
                y_store = []
                r_store = []
                print("CROSS-ENTROPY")
                print(cross_entropy)
                # save the found weights to txt file
                _save_weights(weights_path, filters, p, p0)
                unsaved_updates = False

    # Keep the updates made after the last 50-document checkpoint.
    if unsaved_updates:
        _save_weights(weights_path, filters, p, p0)
| 256 | + |
| 257 | + |
| 258 | + |
# Run a single training epoch over the prepared training set.
training(
    inp=inp_train,
    out=out_train,
    numepoch=1,
    direc=directory,
    stopWords=stop_words,
)
0 commit comments