# Exploratory script: NER- and WordNet-based similarity features for question pairs.
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.wsd import lesk
from scipy.optimize import linear_sum_assignment
from nltk import pos_tag, ne_chunk
import nltk.tag.stanford as st
# Local Stanford NER install (3-class model: LOCATION, PERSON, ORGANIZATION);
# adjust these paths to your machine.
classifier = '/home/gautam/Desktop/Courses/MTL785/project/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = '/home/gautam/Desktop/Courses/MTL785/project/stanford-ner-2017-06-09/stanford-ner.jar'
s = st.StanfordNERTagger(classifier, jar)

# One-time NLTK downloads, if missing: 'wordnet' (lesk/WordNet), 'punkt'
# (word_tokenize), 'stopwords', and 'averaged_perceptron_tagger' (pos_tag).
# nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
stop_word = set(stopwords.words('english'))

# Question-pairs training file: column 3 is question1, column 4 is question2.
df = pd.read_csv('data/train.csv')
# print(df.columns.values)
question2_total = df.iloc[:, 4].values
question1_total = df.iloc[:, 3].values
# question1_total = ['what is your name']
# question2_total = ['what should I call you']
# print(question1_total)
question1 = word_tokenize(question1_total[0])
question2 = word_tokenize(question2_total[0])
print(question1)
print(question2)

nerq1 = s.tag(question1)
nerq2 = s.tag(question2)
print("##########")

# Pull out the first LOCATION and PERSON entity from each question. Note the
# 3-class model tags LOCATION, PERSON, and ORGANIZATION; there is no "NAME"
# tag, and the variables must be initialized in case no entity is found.
loc1 = name1 = None
for i in nerq1:
    # print(i[1])
    if i[1] == "LOCATION":
        loc1 = i[0]
    if i[1] == "PERSON":
        name1 = i[0]
print(loc1)

loc2 = name2 = None
for i in nerq2:
    # print(i[1])
    if i[1] == "LOCATION":
        loc2 = i[0]
    if i[1] == "PERSON":
        name2 = i[0]
# print(nerq2)
print("##########")

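# loc1/name1 and loc2/name2 are extracted but never consumed below. A minimal
# sketch of one possible use (an assumption, not part of the original
# pipeline): flag pairs whose questions mention different entities of the
# same type.
def entity_mismatch(e1, e2):
    """True when both questions carry an entity of this type but they differ."""
    return e1 is not None and e2 is not None and e1.lower() != e2.lower()

print('location mismatch:', entity_mismatch(loc1, loc2))
print('person mismatch:', entity_mismatch(name1, name2))
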
tagged1 = nltk.pos_tag(question1)
tagged2 = nltk.pos_tag(question2)
# filtered_q1 = [w for w in tagged1 if not w[0] in stop_word]
# filtered_q2 = [w for w in tagged2 if not w[0] in stop_word]
# q1 = [(stemmer.stem(w[0]), w[1]) for w in tagged1]
# q2 = [(stemmer.stem(w[0]), w[1]) for w in tagged2]

# print(tagged1)
# print(tagged2)
common_words = [word for word in question1 if word in question2]
# print(common_words)

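# common_words is also never consumed; one plausible use (an assumption, not
# in the original) is a normalized lexical-overlap feature in [0, 1].
overlap = len(common_words) / max(len(question1), len(question2))
print('lexical overlap:', overlap)
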
# Disambiguate each word with the Lesk algorithm. The context argument must
# be the token list itself, not the (word, tag) tuples, or the definition
# overlap is always empty.
# list = []
list1 = []
list2 = []
for item in tagged1:
    list1.append(lesk(question1, item[0]))
    # print(item[0])
    # print(item[1])
print(list1)
for item in tagged2:
    list2.append(lesk(question2, item[0]))
    # print(item[0])
    # print(item[1])
print(list2)

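# lesk() also accepts a WordNet part of speech to narrow the candidate
# synsets. A hedged refinement (assumption, not in the original): map the
# Penn Treebank tags already produced by pos_tag to WordNet's POS letters.
def penn_to_wn(tag):
    """Map a Penn Treebank tag to a WordNet POS, or None if there is no match."""
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None

# Example: list1 = [lesk(question1, w, penn_to_wn(t)) for w, t in tagged1]
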
# Pairwise Wu-Palmer similarity between the disambiguated senses. Both lesk
# and wup_similarity can return None, so guard before writing into the
# float matrix.
R = np.zeros((len(list1), len(list2)))
for i in range(len(list1)):
    for j in range(len(list2)):
        if list1[i] and list2[j]:
            sim = list1[i].wup_similarity(list2[j])
            if sim is not None:
                R[i][j] = sim
print(R)
# row_ind, col_ind = linear_sum_assignment(-R)  # scipy minimizes cost, so negate the similarities
# print(row_ind)
# print(col_ind)
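# A runnable sketch of the idea commented out above (my reading of the
# intent, not the original code): align the two token lists one-to-one so
# that total Wu-Palmer similarity is maximal, via the Hungarian algorithm.
if R.size:
    row_ind, col_ind = linear_sum_assignment(-R)  # negate: the solver minimizes
    for r, c in zip(row_ind, col_ind):
        print(question1[r], '<->', question2[c], R[r][c])
    # Normalize by the longer question so the score lies in [0, 1].
    print('alignment score:', R[row_ind, col_ind].sum() / max(len(list1), len(list2)))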
# Earlier all-pairs variant: compare the first WordNet synset of every word
# pair instead of the Lesk-disambiguated senses. (Note: 's' here would shadow
# the StanfordNERTagger above if uncommented.)
# for word1 in question1:
#     for word2 in question2:
#         wordFromList1 = wordnet.synsets(word1)
#         wordFromList2 = wordnet.synsets(word2)
#         # print(wordFromList1)
#         # print(wordFromList2)
#         if wordFromList1 and wordFromList2:  # Thanks to @alexis' note
#             s = wordFromList1[0].wup_similarity(wordFromList2[0])
#             # print()
#             if s > 0.8:
#                 print(s)
#                 print(word1)
#                 print(word2)
#                 list.append(s)
#                 break
#     break
# print(list)
# print(max(list))
# print(lesk())