forked from ThiagoCF05/webnlg
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
37 lines (29 loc) · 955 Bytes
/
preprocess.py
File metadata and controls
37 lines (29 loc) · 955 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
__author__='thiagocastroferreira'
"""
Author: Thiago Castro Ferreira
Date: 15/07/2018
Description:
Preprocessing script: for the train and dev sets, parse the files, order the English part and preprocess it for translation
PYTHON VERSION: 2.7
"""
import order
import parser
from nmt import NMT
import cPickle as p
def run(set_path, entry_path, _set):
    """Parse, order and preprocess one corpus split, then pickle the entries.

    :param set_path: location of the split, handed to ``parser.run_parser``.
    :param entry_path: path of the pickle file the entry set is written to.
    :param _set: name of the split (this script uses 'dev' and 'train'),
        forwarded to ``NMT``.
    :return: None; the result is persisted at ``entry_path``.
    """
    entryset = parser.run_parser(set_path)  # parse
    entryset = order.run(entryset, 'en')  # order english part

    # Preprocess for the translation step (to German, per the original
    # author's note). NOTE(review): whether preprocess() mutates entryset or
    # only writes its own artifacts is not visible from here -- confirm.
    nmt = NMT(entryset, _set)
    nmt.preprocess()

    # Use a context manager so the handle is flushed and closed even if
    # dumping fails; the original left the file object to the garbage
    # collector. The mode stays 'w' so the on-disk format is unchanged for
    # existing readers.
    with open(entry_path, 'w') as f:
        p.dump(entryset, f)
if __name__ == '__main__':
    # One row per corpus split:
    # (progress message, delexicalized input path, pickle output path, split name)
    JOBS = (
        ('Preparing devset...',
         'corpus/delexicalized/dev', 'corpus/dev.cPickle', 'dev'),
        ('Preparing trainset....',
         'corpus/delexicalized/train', 'corpus/train.cPickle', 'train'),
    )

    # Process the dev split first, then the train split.
    for message, set_path, entry_path, split in JOBS:
        # Single-argument form prints the same text under Python 2.7.
        print(message)
        run(set_path, entry_path, split)