"""
@file : data_helper.py
@author : xiaolu
@email : [email protected]
@time : 2022-01-07
"""
import torch
from torch.utils.data import Dataset


def load_data(path):
    """Load training data where each line is tab-separated: sentence1, sentence2, label.

    Both sentences of a pair share the same label, so the pair is flattened into
    a single sentence list with the label duplicated.
    """
    sentence, label = [], []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.strip().split('\t')
            try:
                lab = int(parts[2])
            except (IndexError, ValueError):
                # Skip malformed lines (missing columns or non-integer label).
                continue
            sentence.extend([parts[0], parts[1]])
            label.extend([lab, lab])
    return sentence, label


def load_test_data(path):
    """Load evaluation data, keeping the two sentences of each pair separate."""
    sent1, sent2, label = [], [], []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.strip().split('\t')
            sent1.append(parts[0])
            sent2.append(parts[1])
            label.append(int(parts[2]))
    return sent1, sent2, label


class CustomDataset(Dataset):
    """Wraps sentence and label lists; tokenization happens per item."""

    def __init__(self, sentence, label, tokenizer):
        self.sentence = sentence
        self.label = label
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, index):
        # Tokenize a single sentence; padding is deferred to collate_fn.
        inputs = self.tokenizer.encode_plus(
            text=self.sentence[index],
            text_pair=None,
            add_special_tokens=True,
            return_token_type_ids=True
        )
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'token_type_ids': inputs['token_type_ids'],
            'label': self.label[index]
        }


def pad_to_maxlen(input_ids, max_len, pad_value=0):
    """Truncate or right-pad a sequence to exactly max_len."""
    if len(input_ids) >= max_len:
        input_ids = input_ids[:max_len]
    else:
        input_ids = input_ids + [pad_value] * (max_len - len(input_ids))
    return input_ids


def collate_fn(batch):
    # Pad dynamically to the longest sequence in the current batch,
    # capped at the 512-token limit.
    max_len = max([len(d['input_ids']) for d in batch])
    if max_len > 512:
        max_len = 512
    # Alternatively, use a fixed global max_len:
    # max_len = 128
    input_ids, attention_mask, token_type_ids, labels = [], [], [], []
    for item in batch:
        input_ids.append(pad_to_maxlen(item['input_ids'], max_len=max_len))
        attention_mask.append(pad_to_maxlen(item['attention_mask'], max_len=max_len))
        token_type_ids.append(pad_to_maxlen(item['token_type_ids'], max_len=max_len))
        labels.append(item['label'])
    all_input_ids = torch.tensor(input_ids, dtype=torch.long)
    all_input_mask = torch.tensor(attention_mask, dtype=torch.long)
    all_segment_ids = torch.tensor(token_type_ids, dtype=torch.long)
    all_label_ids = torch.tensor(labels, dtype=torch.float)
    return all_input_ids, all_input_mask, all_segment_ids, all_label_ids
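

# Minimal usage sketch (not part of the original module): it assumes a
# Hugging Face tokenizer (e.g. BertTokenizer) and a hypothetical TSV file
# 'train.tsv' with one 'sentence1<TAB>sentence2<TAB>label' record per line,
# and shows how load_data, CustomDataset and collate_fn plug into a DataLoader.
if __name__ == '__main__':
    from torch.utils.data import DataLoader
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # assumed checkpoint
    sentences, labels = load_data('train.tsv')                      # hypothetical path
    dataset = CustomDataset(sentences, labels, tokenizer)
    loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    for input_ids, attention_mask, token_type_ids, batch_labels in loader:
        # Each tensor has shape (batch_size, batch_max_len); labels are float.
        print(input_ids.shape, batch_labels.shape)
        break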