-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhw2.py
More file actions
113 lines (95 loc) · 2.97 KB
/
hw2.py
File metadata and controls
113 lines (95 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import math
import sys
import bigrams
import dicts
def addOneSmoothing(u, b, w1, w2):
    """Return the add-one (Laplace) smoothed estimate of P(w2 | w1).

    Computes ``(count(w1, w2) + 1) / (count(w1) + V)`` where ``V`` is the
    number of distinct first tokens recorded in the bigram table.

    Args:
        u: mapping ``token -> count`` (unigram counts).
        b: nested mapping ``first_token -> {second_token -> count}``
            (bigram counts).
        w1: the conditioning token (a character or a word).
        w2: the token that follows ``w1``.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: if ``b`` is empty and ``w1`` has no unigram count.
    """
    # V is taken from the bigram table's top-level keys, as in the original.
    vocab_size = len(b)
    # Use .get so that unseen tokens/bigrams count as zero instead of raising
    # KeyError. It also avoids inserting new keys when the tables are
    # defaultdicts, which would silently grow V between calls.
    bigram_count = b.get(w1, {}).get(w2, 0)
    unigram_count = u.get(w1, 0)
    return (bigram_count + 1.) / (unigram_count + vocab_size)
# Command-line arguments: three files handed to the bigrams.file2* loaders
# (one per language), the file whose lines are classified, and a mode flag.
ENGLISH_FILENAME = sys.argv[1]
FRENCH_FILENAME = sys.argv[2]
ITALIAN_FILENAME = sys.argv[3]
TEST_FILENAME = sys.argv[4]
OPTION = sys.argv[5]  # "-l" selects letter bigrams; anything else, word bigrams


def _sequence_log_prob(unigram, bigram, tokens):
    """Sum the log add-one-smoothed probabilities of each adjacent token pair."""
    return sum(
        (math.log(addOneSmoothing(unigram, bigram, w1, w2))
         for w1, w2 in zip(tokens, tokens[1:])),
        0.0,
    )


def _best_language(models, tokens):
    """Return the name of the model giving `tokens` the highest log-probability.

    `models` is an iterable of (name, unigram, bigram) triples. Ties go to the
    earliest model because the comparison is strict.
    """
    best_name = ""
    best_prob = float("-Inf")
    for name, unigram, bigram in models:
        prob = _sequence_log_prob(unigram, bigram, tokens)
        if prob > best_prob:
            best_prob = prob
            best_name = name
    return best_name


if OPTION == "-l":
    ########## QUESTION 1 ##########
    ######## letter bigrams ########
    _output_filename = 'output_letter.txt'
    _load_bigrams = bigrams.file2bigrams_letter
    _load_unigrams = bigrams.file2unigrams_letter
    _tokenize = list  # one token per character of the line
else:
    ########## QUESTION 2 ##########
    ######## word bigrams ########
    _output_filename = 'output_word.txt'
    _load_bigrams = bigrams.file2bigrams_word
    _load_unigrams = bigrams.file2unigrams_word
    _tokenize = str.split  # one token per whitespace-separated word

# Load the count tables in the same order as before: bigrams, then unigrams.
english_bigrams = _load_bigrams(ENGLISH_FILENAME)
french_bigrams = _load_bigrams(FRENCH_FILENAME)
italian_bigrams = _load_bigrams(ITALIAN_FILENAME)
english_unigrams = _load_unigrams(ENGLISH_FILENAME)
french_unigrams = _load_unigrams(FRENCH_FILENAME)
italian_unigrams = _load_unigrams(ITALIAN_FILENAME)

# Order matters: on equal scores the first language listed wins.
_MODELS = (
    ("English", english_unigrams, english_bigrams),
    ("French", french_unigrams, french_bigrams),
    ("Italian", italian_unigrams, italian_bigrams),
)

# Context managers make sure both files are closed (and the output flushed)
# even if classification raises part-way through.
with open(TEST_FILENAME) as _test_file:
    _test_lines = _test_file.read().split('\n')

with open(_output_filename, 'w') as _out:
    # Line numbers are 1-based and count empty lines, which are skipped
    # in the output.
    for _line_number, _line in enumerate(_test_lines, 1):
        if _line == '':
            continue
        _language = _best_language(_MODELS, _tokenize(_line))
        _out.write(str(_line_number) + " " + _language + "\n")