-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess-text-with-udpipe.py
More file actions
133 lines (110 loc) · 5.16 KB
/
process-text-with-udpipe.py
File metadata and controls
133 lines (110 loc) · 5.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
This script provides an example of how to use the UDPipe toolkit for sentence breaking,
tokenisation, (morphological) tagging and (syntactic) parsing.
"""
import sys
import os
import argparse
import datetime
import ufal.udpipe
import cPickle
# Directory that contains this script; used to locate the bundled dependencies
# independently of the current working directory.
script_path=os.path.dirname(os.path.realpath(__file__))
# The truecaser code is looked up under <script dir>/dependencies/truecaser.
truecaser_script_dir = os.path.join(script_path,"dependencies","truecaser")
# Make that directory importable; this has to run before the import below.
sys.path.insert(1,truecaser_script_dir)
# Star import: presumably the source of getTrueCase() used further down in
# this file -- verify against Truecaser.py.
from Truecaser import *
#Model implementation from https://github.com/ufal/udpipe/tree/master/bindings/python/examples
class Model:
    """Convenience wrapper around a loaded ``ufal.udpipe.Model``.

    Bundles the steps used by this script: reading/tokenising text into
    ``ufal.udpipe.Sentence`` objects, tagging and parsing them in place, and
    serialising them back to text.
    """

    def __init__(self, path):
        """Load the UDPipe model stored in the file ``path``.

        Raises:
            Exception: if ``ufal.udpipe.Model.load`` returns a false value.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the model cannot provide a tokenizer, or if reading
                the sentences reports an error.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the input format cannot be created, or if reading
                the sentences reports an error.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields.

        ``input_format`` is either a tokenizer or an input-format reader; both
        are driven through ``setText`` / ``nextSentence``.

        Raises:
            Exception: with the reader's message if it reports an error.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # nextSentence() fills the object it is given, so a fresh Sentence is
        # needed for every iteration; reusing one would alias all entries.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical).

        Returns:
            The serialised sentences followed by the format's document
            terminator, as a single string.

        Raises:
            Exception: if the output format cannot be created.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(format)
        # Mirror the check done in read(): fail with a clear message instead
        # of an AttributeError on None when the format name is not recognised.
        if not output_format:
            raise Exception("Cannot create output format '%s'" % format)

        # Collect the pieces and join once rather than growing a string.
        parts = [output_format.writeSentence(sentence) for sentence in sentences]
        parts.append(output_format.finishDocument())
        return ''.join(parts)
"""Main method - performs processing of text from a file (handle) and writes the result in a conll format in a file (handle)"""
def main(input_file, output_file, language="en",verbose=True):
    """Tokenise, truecase, tag and parse a text and write it out as CoNLL-U.

    Args:
        input_file: open file handle; its ``read()`` result is decoded as
            UTF-8 and its ``name`` is used in the progress message.
        output_file: open file handle that receives the UTF-8 encoded
            CoNLL-U output.
        language: language code used to pick the model files
            ``models/<language>.model.output`` and
            ``models/truecasing-model.<language>`` next to this script.
        verbose: when true, the raw text and the processed sentences are
            also printed to standard output.

    Raises:
        Exception: if the UDPipe model cannot be loaded or tokenisation fails
            (raised by ``Model``).
    """
    sys.stderr.write("Starting to process text in file {0} at {1}\n".format(input_file.name,str(datetime.datetime.now())))
    text = input_file.read().decode('utf-8')
    if verbose:
        print("============= Raw text: =============")
        print(text)

    # Needed to figure out the path of dependencies when executing from different directories.
    script_path = os.path.dirname(os.path.realpath(__file__))
    model_file = os.path.join(script_path, "models", language + ".model.output")

    # Read the truecasing model (this is the slowest part in this example and could be removed).
    # NOTE: unpickling can execute arbitrary code; only use truecasing model
    # files that come from a trusted source.
    truecasing_model = os.path.join(script_path, "models", "truecasing-model." + language)
    # The five distributions are stored as consecutive pickles, so the load
    # order matters. "with" closes the file even if one of the loads fails.
    with open(truecasing_model, 'rb') as f:
        uniDist = cPickle.load(f)
        backwardBiDist = cPickle.load(f)
        forwardBiDist = cPickle.load(f)
        trigramDist = cPickle.load(f)
        wordCasingLookup = cPickle.load(f)

    # Read the UDPipe model.
    model = Model(model_file)

    # First, we tokenize the text and split it into sentences.
    sentences = model.tokenize(text)

    # Then, we truecase the text (this is a bit ad-hoc).
    for s in sentences:
        tokens = [w.form.lower() for w in s.words]
        # Index 1 is used as the first token of the sentence (UDPipe documents
        # index 0 as a technical root node). Skip sentences that have no such
        # token instead of failing with an IndexError.
        if len(tokens) < 2:
            continue
        truecased_tokens = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
        # We truecase only the first token in a sentence!
        s.words[1].form = truecased_tokens[1]

    # Then, we perform tagging and parsing for each sentence.
    for s in sentences:
        model.tag(s)  # inplace tagging
        model.parse(s)  # inplace parsing
    conllu = model.write(sentences, "conllu")

    # Save the result in the output file.
    if verbose:
        print("========= Processed sentences: =========")
        print(conllu)
    sys.stderr.write("... processing completed at {0}\n".format(str(datetime.datetime.now())))
    output_file.write(conllu.encode('utf-8'))
# Command-line entry point: parse the options and run main() on the chosen
# input/output handles (standard input/output unless paths are given).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i',
                        type=argparse.FileType('r'),
                        default=sys.stdin, metavar='PATH',
                        help="Input file (default: standard input)")
    parser.add_argument('--output', '-o', type=argparse.FileType('w'),
                        default=sys.stdout, metavar='PATH',
                        help="Output file (default: standard output)")
    # Help text previously ended in an unbalanced "))".
    parser.add_argument('--language', '-l', type=str, default='en',
                        help="Language (default: %(default)s)")
    args = parser.parse_args()
    main(args.input, args.output, args.language)