from datasets.conala.evaluator import ConalaEvaluator
from datasets.conala.util import *

+ assert astor.__version__ == '0.7.1'

def preprocess_conala_dataset(train_file, test_file, grammar_file, src_freq=3, code_freq=3,
                              mined_data_file=None, vocab_size=20000, num_mined=0, out_dir='data/conala'):
@@ -33,11 +34,23 @@ def preprocess_conala_dataset(train_file, test_file, grammar_file, src_freq=3, c
    dev_examples = train_examples[:200]
    train_examples = train_examples[200:]

+     mined_examples = None
    if mined_data_file and num_mined > 0:
        print("use mined data: ", num_mined)
        print("from file: ", mined_data_file)
        mined_examples = preprocess_dataset(mined_data_file, name='mined', transition_system=transition_system,
                                            firstk=num_mined)
+         # mined_src_vocab = VocabEntry.from_corpus([e.src_sent for e in train_examples], size=vocab_size,
+         #                                          freq_cutoff=src_freq)
+         # mined_primitive_tokens = [map(lambda a: a.action.token,
+         #                               filter(lambda a: isinstance(a.action, GenTokenAction), e.tgt_actions))
+         #                           for e in train_examples]
+         # mined_primitive_vocab = VocabEntry.from_corpus(mined_primitive_tokens, size=vocab_size, freq_cutoff=code_freq)
+         #
+         # # generate vocabulary for the code tokens!
+         # mined_code_tokens = [transition_system.tokenize_code(e.tgt_code, mode='decoder') for e in train_examples]
+         # mined_code_vocab = VocabEntry.from_corpus(mined_code_tokens, size=vocab_size, freq_cutoff=code_freq)
+
        pickle.dump(mined_examples, open(os.path.join(out_dir, 'pre_{}.bin'.format(num_mined)), 'wb'))
        train_examples += mined_examples
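The vocabulary-construction block added above is left commented out. For reference, a minimal sketch of what it would do if enabled, rewritten as a standalone helper: the name `build_mined_vocab` is hypothetical, list comprehensions replace the original `map`/`filter` calls so the results are materialized under Python 3, and applying it to `mined_examples` rather than `train_examples` is an assumption about the intent suggested by the variable names.

```python
def build_mined_vocab(mined_examples, transition_system, vocab_size, src_freq, code_freq):
    # Hypothetical helper: builds source / primitive / code vocabularies over mined examples only.
    src_vocab = VocabEntry.from_corpus([e.src_sent for e in mined_examples],
                                       size=vocab_size, freq_cutoff=src_freq)

    # Primitive tokens are the tokens produced by GenTokenAction steps in the target action sequence.
    primitive_tokens = [[a.action.token for a in e.tgt_actions if isinstance(a.action, GenTokenAction)]
                        for e in mined_examples]
    primitive_vocab = VocabEntry.from_corpus(primitive_tokens, size=vocab_size, freq_cutoff=code_freq)

    # Code-token vocabulary, tokenized the same way the decoder sees the target code.
    code_tokens = [transition_system.tokenize_code(e.tgt_code, mode='decoder') for e in mined_examples]
    code_vocab = VocabEntry.from_corpus(code_tokens, size=vocab_size, freq_cutoff=code_freq)

    return Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)
```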
@@ -57,6 +70,7 @@ def preprocess_conala_dataset(train_file, test_file, grammar_file, src_freq=3, c

    # generate vocabulary for the code tokens!
    code_tokens = [transition_system.tokenize_code(e.tgt_code, mode='decoder') for e in train_examples]
+
    code_vocab = VocabEntry.from_corpus(code_tokens, size=vocab_size, freq_cutoff=code_freq)

    vocab = Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)
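For context, a rough sketch of how this preprocessing entry point might be invoked; every path below (CoNaLa JSON files, mined-pairs file, ASDL grammar) and the `num_mined` value are assumptions for illustration, not something this commit specifies.

```python
if __name__ == '__main__':
    # Hypothetical driver; all file paths and num_mined are assumptions.
    preprocess_conala_dataset(train_file='data/conala/conala-train.json',
                              test_file='data/conala/conala-test.json',
                              mined_data_file='data/conala/conala-mined.jsonl',
                              grammar_file='asdl/lang/py3/py3_asdl.simplified.txt',
                              src_freq=3, code_freq=3,
                              vocab_size=20000, num_mined=50000,
                              out_dir='data/conala')
```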