main_tokenization.py
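
"""Tokenization driver.

Reads the Google sentence-compression corpus and a set of process-description
files, annotates them via stages.tokenization, and writes one feature CSV per
source file.
"""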
import json
from stages.tokenization import annotate_data, build_matrizes, build_csv_matrix


def read_google_data(data_source):
    # Records are separated by blank lines, one JSON object per record.
    with open(data_source, 'rb') as f:
        data_string = f.read().decode('utf-8')
    sentences = data_string.split('\n\n')
    print('Number of sentences:', len(sentences))
    json_objects = []
    for chunk in sentences:
        try:
            obj = json.loads(chunk)
            current = {'sentence': obj['graph']['sentence'],
                       'compression': obj['compression']['text']}
            json_objects.append(current)
        except json.decoder.JSONDecodeError:
            print('Found decode error:', chunk)
            continue
    return json_objects
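
# Each record parsed by read_google_data is expected to look roughly like the
# sketch below (an assumption based only on the keys accessed above; the real
# corpus files carry additional fields):
#
#   {
#       "graph": {"sentence": "Original sentence text ..."},
#       "compression": {"text": "Compressed sentence text ..."}
#   }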


def read_process_description_data(data_source):
    with open(data_source, 'rb') as f:
        data_string = f.read().decode('utf-8')
    json_objects = json.loads(data_string)
    print('Number of sentences:', len(json_objects))
    return json_objects
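
# Unlike the Google corpus, each process-description file is assumed to hold a
# single JSON array of sentence/compression objects, so one json.loads call
# parses the whole file.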


if __name__ == '__main__':
    try:
        # Google sentence-compression corpus
        GOOGLE_DATA_SOURCES = ['sent-comp.train01.json', 'sent-comp.train02.json',
                               'sent-comp.train03.json', 'sent-comp.train04.json',
                               'sent-comp.train05.json', 'sent-comp.train06.json',
                               'sent-comp.train07.json', 'sent-comp.train08.json',
                               'sent-comp.train09.json', 'sent-comp.train10.json',
                               'comp-data.eval.json']
        for data_source in GOOGLE_DATA_SOURCES:
            print('Reading', data_source)
            data = read_google_data('ressources/google/' + data_source)
            sentences, compressions = annotate_data(data)
            X = build_matrizes(sentences, compressions)
            header_row = ['word', 'pos', 'dependency_label', 'id', 'parent', 'EOS', 'kept']
            data = build_csv_matrix(header_row, X)
            data.to_csv('csv/google/' + data_source + '.csv', sep=',',
                        encoding='utf-8', na_rep='N/A')
            print('Total saved:', len(X))

        # Process description corpus
        PD_DATA_SOURCES = ['process1.json', 'process2.json', 'process3.json']
        for data_source in PD_DATA_SOURCES:
            print('Reading', data_source)
            data = read_process_description_data('ressources/process_descriptions/' + data_source)
            sentences, compressions = annotate_data(data)
            X = build_matrizes(sentences, compressions)
            header_row = ['word', 'pos', 'dependency_label', 'id', 'parent', 'EOS', 'kept']
            data = build_csv_matrix(header_row, X)
            data.to_csv('csv/process_descriptions/' + data_source + '.csv', sep=',',
                        encoding='utf-8', na_rep='N/A')
            print('Total saved:', len(X))
    except FileNotFoundError:
        print('Download the full corpus first and add it to '
              'ressources/google and ressources/process_descriptions')
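
# A minimal usage sketch, assuming the script is run from the repository root
# with the corpora already downloaded into ressources/:
#
#   python main_tokenization.py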