-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaseline.py
More file actions
82 lines (72 loc) · 2.06 KB
/
baseline.py
File metadata and controls
82 lines (72 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import operator
import os
import os.path
import nltk
from nltk.corpus import stopwords
# Output file name: the CSV that print_to_file() appends the predictions to.
kaggleTest = "kaggleTest.csv"
def training(fname):
    """Build a word -> {tag: count} lexicon from a training file.

    The file is read as consecutive three-line records: line 1 holds the
    whitespace-separated tokens, line 2 is ignored here (presumably POS
    tags -- confirm against the data format), and line 3 holds one tag per
    token.

    Args:
        fname: Path of the training file.

    Returns:
        A dict mapping each token to a dict of ``{tag: occurrences}``.
    """
    with open(fname) as f:
        content = f.readlines()

    lexicon = {}
    # Stop at len - 2 so a trailing partial record (e.g. a final blank
    # line) is skipped instead of raising IndexError on content[i + 2].
    for i in range(0, len(content) - 2, 3):
        words = content[i].split()
        tags = content[i + 2].split()
        # zip() stops at the shorter list, so a record with fewer tags than
        # words cannot index past the end of the tag line.
        for word, tag in zip(words, tags):
            counts = lexicon.setdefault(word, {})
            counts[tag] = counts.get(tag, 0) + 1
    return lexicon
def test(fname, lexicon):
    """Tag a test file using each known word's most frequent training tag.

    The file is read as consecutive three-line records: line 1 holds the
    whitespace-separated tokens, line 2 is ignored here (presumably POS
    tags -- confirm against the data format), and line 3 holds one
    identifier per token (presumably the token's index -- confirm).

    Words missing from ``lexicon`` and words whose most frequent tag is
    ``'O'`` are skipped. For every other word the first two characters of
    the tag are dropped (e.g. ``'B-PER'`` -> ``'PER'``) and the token's
    identifier is appended to that type's list.

    The collected predictions are also passed to ``print_to_file``.

    Args:
        fname: Path of the test file.
        lexicon: Dict of ``{word: {tag: count}}`` as built by ``training``.

    Returns:
        A dict with keys ``'PER'``, ``'LOC'``, ``'ORG'`` and ``'MISC'``,
        each mapping to the list of identifiers predicted for that type.

    Raises:
        KeyError: If a word's most frequent tag, minus its first two
            characters, is not one of the four types above.
    """
    predictions = {
        'PER': [],
        'LOC': [],
        'ORG': [],
        'MISC': []
    }
    with open(fname) as f:
        content = f.readlines()

    # Stop at len - 2 so a trailing partial record (e.g. a final blank
    # line) is skipped instead of raising IndexError on content[i + 2].
    for i in range(0, len(content) - 2, 3):
        words = content[i].split()
        numbers = content[i + 2].split()
        # zip() stops at the shorter list, so a malformed record cannot
        # index past the end of the identifier line.
        for word, number in zip(words, numbers):
            tag_counts = lexicon.get(word)
            if not tag_counts:
                continue
            # Select the tag the word was most often used as in the training
            # corpus. max(d, key=d.get) runs on Python 2 and 3 (iteritems()
            # is Python 2 only); ties go to the first tag in dict order.
            maxtag = max(tag_counts, key=tag_counts.get)
            if maxtag != 'O':
                predictions[maxtag[2:]].append(number)

    print_to_file(predictions)
    return predictions
def _format_ranges(indices):
    """Collapse runs of consecutive indices into 'first-last' spans.

    ``indices`` is a sequence of integer-like tokens. Each maximal run in
    which every value is exactly one more than the previous becomes
    ``first-last``; spans are joined with single spaces. A lone value
    ``n`` becomes ``n-n``. An empty sequence yields an empty string.
    """
    spans = []
    first = prev = None
    for idx in indices:
        if first is None:
            first = prev = idx
        elif int(idx) - int(prev) == 1:
            # Still inside the current consecutive run.
            prev = idx
        else:
            spans.append(str(first) + "-" + str(prev))
            first = prev = idx
    if first is not None:
        spans.append(str(first) + "-" + str(prev))
    return " ".join(spans)


def print_to_file(predictions):
    """Append the predictions to the ``kaggleTest`` CSV file.

    One line is written per key of ``predictions`` in the form
    ``TYPE, first-last first-last ...``, where consecutive identifiers are
    merged into a single span. A type with no predictions gets a line with
    an empty prediction field instead of raising IndexError.

    The ``Type,Prediction`` header is written only when the file does not
    exist yet or is empty, so repeated calls no longer duplicate it.

    Args:
        predictions: Dict mapping an entity type to a list of integer-like
            identifiers (strings or ints).
    """
    needs_header = (not os.path.exists(kaggleTest)
                    or os.path.getsize(kaggleTest) == 0)
    with open(kaggleTest, 'a') as f:
        if needs_header:
            f.write("Type,Prediction\n")
        for k in predictions:
            f.write(k + ", " + _format_ranges(predictions[k]) + "\n")
# Script driver: learn the lexicon from the training corpus, then tag the
# test corpus (test() also hands the predictions to print_to_file()).
lexicon = training(fname='train.txt')
test(fname='test.txt', lexicon=lexicon)