python-project/lab1.py at master · UNA8211/python-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
import argparse
import collections
import random
import re

# Replace the values below with your own name and your collaborators' names.
ME = 'Ryan Brand'
COLLABORATORS = ["nobody"]

def process_file(infile):
    """Build a next-word predictor from a file of song records.

    Every line is searched for three ``<SEP>`` markers; the text after the
    last marker on the line is taken as the song title.  Titles are cleaned
    (bracketed and parenthesised spans removed, the characters ``! & . ? ;``
    dropped, lower-cased), printed, and de-duplicated.  Adjacent word pairs
    (bigrams) are then counted across the distinct cleaned titles.

    Args:
        infile: An iterable of text lines, e.g. an open text file.

    Returns:
        A function ``most_common_word(word)`` that returns the word that
        most often follows ``word`` in the cleaned titles.  Ties are broken
        at random.  ``None`` is returned when no word ever follows ``word``.
    """
    # Compiled once instead of on every iteration.  ``.*`` is greedy, so
    # group 3 captures everything after the LAST <SEP> on the line.
    record_pattern = re.compile(r'<SEP>(.*)<SEP>(.*)<SEP>(.*)', re.M | re.I)
    titles = set()

    # Loop through each line of the file
    for line in infile:
        match = record_pattern.search(line)
        if match is None:
            # Blank or malformed line: skip it rather than crash on
            # ``None.group``.
            continue
        title = match.group(3)

        # Drop bracketed / parenthesised annotations, then punctuation.
        title = re.sub(r'\[.*\]', '', title)
        title = re.sub(r'\(.*\)', '', title)
        title = re.sub(r'[!&\.\?;]', '', title)

        # adds the clean line to the titles
        lower_case_clean_title = title.lower()
        print(lower_case_clean_title)
        titles.add(lower_case_clean_title)

    # bigram_count[first][second] == number of distinct titles-occurrences
    # in which `second` immediately follows `first`.
    bigram_count = collections.defaultdict(collections.Counter)
    for title in titles:
        words = title.split()
        for first, second in zip(words, words[1:]):
            bigram_count[first][second] += 1

    # using bigram_count, find most common word following 'word'
    def most_common_word(word):
        """Return the most frequent follower of ``word`` (or ``None``)."""
        # Titles were lower-cased, so normalise the query the same way.
        # ``.get`` avoids inserting empty entries into the defaultdict.
        followers = bigram_count.get(word.lower())
        if not followers:
            return None
        best = max(followers.values())
        choices = [nxt for nxt, count in followers.items() if count == best]
        return random.choice(choices)

    # return most common word
    return most_common_word


# DON'T WORRY ABOUT CODE BELOW HERE, IT JUST MAKES YOUR LIFE EASIER
def get_file_name():
    """Return the single positional ``file_name`` command-line argument."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('file_name')
    parsed = arg_parser.parse_args()
    return parsed.file_name


def main():
    """Print the submission banner, then process the file named on the CLI."""
    print('CSCI 305 Lab 1 submitted by %s' % ME)
    helpers = ', '.join(COLLABORATORS)
    print('  with help from %s\n\n' % helpers)
    # The path is read from the command line only after the banner is shown.
    path = get_file_name()
    with open(path, 'r') as infile:
        process_file(infile)


# Run the lab only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()