forked from FerreroJeremy/ln2sql
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathStopwordFilter.py
executable file
·37 lines (28 loc) · 1002 Bytes
/
StopwordFilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
import io
import sys
import unicodedata
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit byte-string <-> unicode conversions do not fall back to ASCII.
# ``reload(sys)`` is needed first because ``setdefaultencoding`` is removed
# from ``sys`` during interpreter start-up. Python 3 has no ``reload`` builtin
# and already defaults to UTF-8, so the hack is skipped there instead of
# raising NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding("utf-8")
class StopwordFilter:
    """Remove stopwords from a sequence of words.

    Words are compared after accent stripping and lower-casing, so entries in
    the stopword list only match when they are stored in that same form.
    """

    # Text type of the running interpreter: ``unicode`` on Python 2, ``str``
    # on Python 3 (where the ``unicode`` builtin no longer exists).
    try:
        _text_type = unicode  # noqa: F821 -- Python 2 builtin
    except NameError:
        _text_type = str

    def __init__(self):
        """Create a filter with an empty stopword list."""
        # The attribute name shadows the ``list`` builtin; it is kept because
        # it is publicly reachable on instances.
        self.list = []

    def add_stopword(self, word):
        """Append *word* to the stopword list exactly as given.

        No normalisation is applied here, whereas ``filter`` compares
        accent-free, lower-cased words; pass the word in that form for it to
        match.
        """
        self.list.append(word)

    def get_stopword_list(self):
        """Return the internal stopword list (the same object, not a copy)."""
        return self.list

    def filter(self, sentence):
        """Return the normalised words of *sentence* that are not stopwords.

        :param sentence: iterable of words.
        :return: list of accent-free, lower-cased words, in input order, with
            every word found in the stopword list removed.
        """
        # Build the lookup set once per call instead of scanning the list
        # for every word.
        stopwords = set(self.list)
        normalised = (self.remove_accents(word).lower() for word in sentence)
        return [word for word in normalised if word not in stopwords]

    def remove_accents(self, string):
        """Return *string* as text with combining marks (accents) removed.

        :param string: text, UTF-8 encoded bytes, or any object convertible
            to text.
        :return: text whose characters were decomposed with NFKD and stripped
            of combining characters.
        :raises UnicodeDecodeError: if *string* is bytes that are not valid
            UTF-8.
        """
        if isinstance(string, bytes):
            # Decode explicitly rather than relying on the process-wide
            # default encoding.
            text = string.decode('utf-8')
        elif isinstance(string, self._text_type):
            text = string
        else:
            text = self._text_type(string)
        decomposed = unicodedata.normalize('NFKD', text)
        return u"".join(c for c in decomposed if not unicodedata.combining(c))

    def load(self, lang):
        """Append the stopwords of ``./stopwords/<lang>.txt`` to the list.

        The file is read as UTF-8 with one stopword per line. Each entry is
        stripped of surrounding whitespace, accent-stripped and lower-cased;
        blank lines are skipped so that no empty stopword is stored.

        :param lang: file name stem, resolved relative to the current working
            directory.
        :raises IOError: if the file cannot be opened.
        """
        with io.open('./stopwords/' + lang + '.txt', encoding='utf-8') as f:
            for line in f:
                # strip() also drops a stray '\r' left by CRLF line endings.
                stopword = self.remove_accents(line.strip()).lower()
                if stopword:
                    self.list.append(stopword)