-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocab.py
113 lines (97 loc) · 3.53 KB
/
vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
# _*_coding:utf-8 _*_
# @Time :2022/7/22 22:42
# @Author :Abner Wong
# @Software: PyCharm
import re
from config import Config
class Vocab:
    """Word-level vocabulary: maps tokens to integer ids and back.

    Usage: call fit() on each tokenized sentence to accumulate counts,
    then build_vocab() once to freeze the word->id table.
    """

    UNK_TAG = "<UNK>"  # marker for out-of-vocabulary words
    PAD_TAG = "<PAD>"  # marker used to pad short sequences
    PAD = 0            # id reserved for PAD_TAG
    UNK = 1            # id reserved for UNK_TAG

    def __init__(self):
        # word -> id mapping; ids 0 and 1 are pre-assigned to PAD/UNK
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}  # word -> raw frequency, accumulated by fit()

    def fit(self, sentence):
        """
        Accumulate word frequencies from one tokenized sentence.
        :param sentence: [str, str, str]
        :return: None
        """
        for word in sentence:
            # after fitting every sentence, self.count holds the full frequency table
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min_count=1, max_count=None, max_features=None):
        """
        Build the word->id dictionary from the accumulated counts.
        :param min_count: minimum frequency for a word to be kept
        :param max_count: maximum frequency for a word to be kept
        :param max_features: keep only the N most frequent words
        :return: None
        """
        if min_count is not None:
            self.count = {word: count for word, count in self.count.items() if count >= min_count}
        if max_count is not None:
            self.count = {word: count for word, count in self.count.items() if count <= max_count}
        if max_features is not None:
            # BUG FIX: sorted() takes the comparison function as the keyword-only
            # `key` argument; passing it positionally raised TypeError.
            # [(k, v), (k, v)....] ---> {k: v, k: v}
            self.count = dict(
                sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            )
        for word in self.count:
            self.dict[word] = len(self.dict)  # assign the next free id to each surviving word
        # inverted mapping: id -> word, used by inverse_transform()
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def tokenize(self, sentence):
        """
        Split raw text into lowercase tokens, stripping punctuation/control chars.
        :param sentence: str
        :return: [str, str, str]
        """
        # BUG FIX: '^' was unescaped, so it acted as a zero-width anchor instead of
        # matching literal carets; regex metachars now use raw-string escapes.
        fileters = ['!', '"', '#', '&', r'\(', r'\)', r'\*', r'\+', r'\:', ';', '<', '=', '>',
                    r'\[', '\\\\', r'\]', r'\^', '`', r'\{', r'\|', r'\}', '~', '\t', '\n',
                    '\x97', '\x96', '”', '“', '//', '/', r'\.']
        sentence = re.sub("|".join(fileters), " ", sentence.replace('{', ' '))
        result = [i for i in sentence.lower().split(" ") if len(i) > 0]
        return result

    def transform(self, sentence, max_len=None):
        """
        Convert a tokenized sentence into a list of word ids.
        :param sentence: [str, str, str]
        :param max_len: truncate/pad to this length; None leaves the length as-is
        :return: [int, int, int]
        """
        # BUG FIX: the original compared len(sentence) > max_len even when max_len
        # was None (the default), which raises TypeError in Python 3.
        if max_len is not None:
            if len(sentence) > max_len:
                sentence = sentence[:max_len]
            else:
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))  # pad with PAD
        # unknown words map to UNK (named constant instead of a magic 1)
        return [self.dict.get(i, self.UNK) for i in sentence]

    def inverse_transform(self, incides):
        """
        Convert a list of word ids back into words.
        :param incides: [int, int, int]
        :return: [str, str, str]
        """
        return [self.inverse_dict.get(i, self.UNK_TAG) for i in incides]

    def __len__(self):
        # number of entries in the word->id table (including PAD/UNK)
        return len(self.dict)
if __name__ == '__main__':
    import pandas as pd
    import joblib
    from tqdm import tqdm

    # Build a vocabulary over the SMS training corpus and persist the
    # word->id mapping to the path given by the project Config.
    cfg = Config()
    data_path = "/mnt/abner/project/prompt/tmp/sms_data/"
    vocab = Vocab()
    frame = pd.read_csv(data_path + "sms_train_data.csv")
    # De-duplicate message bodies before counting; bail out (with a diagnostic
    # print of the offending row) on the first body that fails to tokenize.
    for text in tqdm(set(frame.body)):
        try:
            vocab.fit(vocab.tokenize(text))
        except Exception as err:
            print(err)
            print(text)
            break
    vocab.build_vocab()
    print(len(vocab.dict))
    joblib.dump(vocab.dict, cfg.vocab_path)