forked from JerryLead/CorgiPile-PyTorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess.py
55 lines (39 loc) · 1.92 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from datasets import run_doc_prepro, run_sent_prepro
from utils import parse_opt
if __name__ == '__main__':
    # Dataset / model selection is hard-coded; the commented-out lines are
    # the alternative values that can be swapped in by hand.
    # data_name = 'ag_news'
    data_name = 'yelp_review_full'
    # model_name = 'textcnn'
    model_name = 'han'

    config = parse_opt(data_name, model_name)

    # Models listed here go through the document-level preprocessing entry
    # point, which takes a sentence limit on top of the shared arguments.
    doc_level_models = ('han',)

    # Keyword arguments accepted by both preprocessing entry points.
    shared_kwargs = {
        'csv_folder': config.dataset_path,
        'output_folder': config.output_path,
        'word_limit': config.word_limit,
        'min_word_count': config.min_word_count,
    }

    if config.model_name in doc_level_models:
        run_doc_prepro(sentence_limit=config.sentence_limit, **shared_kwargs)
    else:
        run_sent_prepro(**shared_kwargs)
'''
[lijie@db4ai-1 nlp_dl_bench]$ python3 preprocess.py
Training data: reading and preprocessing...
100%|████████████████████████████████████████████████████████████████████| 120000/120000 [01:11<00:00, 1680.54it/s]
Training data: discarding words with counts less than 5, the size of the vocabulary is 26933.
Training data: word map saved to /ssddisk/data/text_data/outputs/ag_news/docs.
Training data: encoding and padding...
Training data: saving...
Training data: encoded, padded data saved to /ssddisk/data/text_data/outputs/ag_news/docs.
Test data: reading and preprocessing...
100%|████████████████████████████████████████████████████████████████████████| 7600/7600 [00:04<00:00, 1754.40it/s]
Test data: encoding and padding...
Test data: saving...
Test data: encoded, padded data saved to /ssddisk/data/text_data/outputs/ag_news/docs.
All done!
'''