From 7b4017be7cdb80cb6aea020f43abb9d765093c0f Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 13:24:36 -0400 Subject: [PATCH 01/10] added word_frequency.py --- word_frequency.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 word_frequency.py diff --git a/word_frequency.py b/word_frequency.py new file mode 100644 index 0000000..b199df5 --- /dev/null +++ b/word_frequency.py @@ -0,0 +1 @@ +import re From b6bfd1d953e8cb71462c14cf8a5c1c5bd6277ef1 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 13:32:14 -0400 Subject: [PATCH 02/10] Added basic program flow in comments --- word_frequency.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/word_frequency.py b/word_frequency.py index b199df5..8ba857b 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -1 +1,7 @@ import re + + +# Read file. +# Create Dictionary. +# Keep top 20 words from dictionary. +# Print out the top 20 words. From 6c8f5b15e33238fa6372600d87e924756e577bfc Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 13:59:21 -0400 Subject: [PATCH 03/10] Added a function to reformat the input from a file --- word_frequency.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/word_frequency.py b/word_frequency.py index 8ba857b..b1016b1 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -1,7 +1,22 @@ import re +# Functions + + +# Takes a list of strings, joins them, and removes any punctuation. +def reformat(text): + file_text = ' '.join(file_text) + # Remove whitespace. + file_text = re.sub(r'[^A-Za-z1-9 ]', "", file_text) + file_text = re.sub(r'[ *]', " ", file_text) + + # Read file. +with open('sample.txt') as f: + file_text = f.readlines() + +file_text = reformat(file_text) # Create Dictionary. # Keep top 20 words from dictionary. # Print out the top 20 words. From a02d84f1ab6649eaaa80c53e371cc52becf694d8 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 14:18:09 -0400 Subject: [PATCH 04/10] added word_frequency function and modified reformat to take a string --- word_frequency.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/word_frequency.py b/word_frequency.py index b1016b1..05d7ff6 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -1,22 +1,34 @@ -import re +from collections import defaultdict +from re import sub # Functions -# Takes a list of strings, joins them, and removes any punctuation. +# Takes a string, keeps characters and numbers, and lowercases them. def reformat(text): - file_text = ' '.join(file_text) - # Remove whitespace. - file_text = re.sub(r'[^A-Za-z1-9 ]', "", file_text) - file_text = re.sub(r'[ *]', " ", file_text) + text = re.sub(r'[\n]', " ", text) + text = re.sub(r'[^A-Za-z1-9 ]', "", text) + text = re.sub(r'[ *]', " ", text) + text.lower() + return text + + +# Takes a string and returns a defaultdict(int) histogram of the words +def word_frequency(file_text): + file_text = reformat(file_text) + word_list = file_text.split() + histogram = defaultdict() + for word in word_list: + histogram[word] += 1 + return histogram # Read file. with open('sample.txt') as f: - file_text = f.readlines() + file_text = f.read() + -file_text = reformat(file_text) # Create Dictionary. # Keep top 20 words from dictionary. # Print out the top 20 words. From 7452bbcba41d17be27ecf7029e7d7c9bb601cf91 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 14:48:49 -0400 Subject: [PATCH 05/10] fixed issue with imports, and modified word_frequency to return a dict --- word_frequency.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/word_frequency.py b/word_frequency.py index 05d7ff6..23345b7 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -1,10 +1,9 @@ +import re from collections import defaultdict -from re import sub # Functions - # Takes a string, keeps characters and numbers, and lowercases them. def reformat(text): text = re.sub(r'[\n]', " ", text) @@ -18,16 +17,18 @@ def reformat(text): def word_frequency(file_text): file_text = reformat(file_text) word_list = file_text.split() - histogram = defaultdict() + histogram = defaultdict(int) for word in word_list: histogram[word] += 1 - return histogram + return dict(histogram) # Read file. with open('sample.txt') as f: file_text = f.read() +histogram_dict = word_frequency(file_text) + # Create Dictionary. # Keep top 20 words from dictionary. From 5442c5b2a3d3ef918dfdae0bd7257e54d910a12e Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 15:34:38 -0400 Subject: [PATCH 06/10] added print_word function --- word_frequency.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/word_frequency.py b/word_frequency.py index 23345b7..1da50a8 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -9,11 +9,11 @@ def reformat(text): text = re.sub(r'[\n]', " ", text) text = re.sub(r'[^A-Za-z1-9 ]', "", text) text = re.sub(r'[ *]', " ", text) - text.lower() + text = text.lower() return text -# Takes a string and returns a defaultdict(int) histogram of the words +# Takes a string and returns a dict histogram of the words. def word_frequency(file_text): file_text = reformat(file_text) word_list = file_text.split() @@ -23,13 +23,29 @@ def word_frequency(file_text): return dict(histogram) +# Takes a dictionary and returns a list of tuples of the top number of words. +def top_words(word_dictionary, number): + word_list = [] + for key, value in word_dictionary.items(): + word_list.append((key, value)) + word_list.sort(key=lambda tuple_: -tuple_[1]) + return word_list[:number] + + +# Takes a list of tuples and prints them. +def print_results(word_list): + for word, count in word_list: + print("{} {}".format(word, count)) + # Read file. with open('sample.txt') as f: file_text = f.read() +# Create dictionary. histogram_dict = word_frequency(file_text) - -# Create Dictionary. # Keep top 20 words from dictionary. +frequent_words = top_words(histogram_dict, 20) + # Print out the top 20 words. +print_results(frequent_words) From e47174e0f1139d265593d40739bd7707614ec143 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 28 May 2015 16:49:34 -0400 Subject: [PATCH 07/10] added ignored_words.txt now input file must be received from command line change print results to print a scaled histogram --- ignored_words.txt | 8 ++++++++ word_frequency.py | 22 +++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 ignored_words.txt diff --git a/ignored_words.txt b/ignored_words.txt new file mode 100644 index 0000000..e6474df --- /dev/null +++ b/ignored_words.txt @@ -0,0 +1,8 @@ +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be, +because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every, +for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is, +it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor, +not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should, +since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too, +twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with, +would,yet,you,your diff --git a/word_frequency.py b/word_frequency.py index 1da50a8..d59ac69 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -1,13 +1,14 @@ import re +import sys from collections import defaultdict # Functions # Takes a string, keeps characters and numbers, and lowercases them. -def reformat(text): +def reformat(text, regex=r'[^A-Za-z1-9 ]'): text = re.sub(r'[\n]', " ", text) - text = re.sub(r'[^A-Za-z1-9 ]', "", text) + text = re.sub(regex, "", text) text = re.sub(r'[ *]', " ", text) text = text.lower() return text @@ -18,8 +19,13 @@ def word_frequency(file_text): file_text = reformat(file_text) word_list = file_text.split() histogram = defaultdict(int) + with open("ignored_words.txt") as f: + ignored_words = f.read() + ignored_words = reformat(ignored_words, r'[^A-Za-z1-9,]') + ignored_list = ignored_words.split(",") for word in word_list: - histogram[word] += 1 + if word not in ignored_list: + histogram[word] += 1 return dict(histogram) @@ -34,11 +40,17 @@ def top_words(word_dictionary, number): # Takes a list of tuples and prints them. def print_results(word_list): + max_word_length = 0 + for word, x in word_list: + max_word_length = max([max_word_length, len(word)]) + scale = word_list[0][1]/50 for word, count in word_list: - print("{} {}".format(word, count)) + count_text = "#"*int(count//scale) + word_text = word + " "*(max_word_length - len(word)) + print("{} {}".format(word_text, count_text)) # Read file. -with open('sample.txt') as f: +with open(sys.argv[1]) as f: file_text = f.read() # Create dictionary. From efd15d37c1d1529f7280fe6f1ffebb805d6f4f47 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Fri, 29 May 2015 08:01:29 -0400 Subject: [PATCH 08/10] refactored top_words and print_results --- word_frequency.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/word_frequency.py b/word_frequency.py index d59ac69..630aef3 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -31,21 +31,17 @@ def word_frequency(file_text): # Takes a dictionary and returns a list of tuples of the top number of words. def top_words(word_dictionary, number): - word_list = [] - for key, value in word_dictionary.items(): - word_list.append((key, value)) + word_list = [x for x in word_dictionary.items()] word_list.sort(key=lambda tuple_: -tuple_[1]) return word_list[:number] # Takes a list of tuples and prints them. def print_results(word_list): - max_word_length = 0 - for word, x in word_list: - max_word_length = max([max_word_length, len(word)]) + max_word_length = max(len(pair[0]) for pair in word_list) scale = word_list[0][1]/50 for word, count in word_list: - count_text = "#"*int(count//scale) + count_text = "#"*int(round(count/scale)) word_text = word + " "*(max_word_length - len(word)) print("{} {}".format(word_text, count_text)) From 6f926bdf9bdfc9865ee1b3935c0f9dc70677d761 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Fri, 29 May 2015 09:10:32 -0400 Subject: [PATCH 09/10] made some variable names more descriptive --- word_frequency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/word_frequency.py b/word_frequency.py index 630aef3..4d674e8 100644 --- a/word_frequency.py +++ b/word_frequency.py @@ -31,8 +31,8 @@ def word_frequency(file_text): # Takes a dictionary and returns a list of tuples of the top number of words. def top_words(word_dictionary, number): - word_list = [x for x in word_dictionary.items()] - word_list.sort(key=lambda tuple_: -tuple_[1]) + word_list = [pair for pair in word_dictionary.items()] + word_list.sort(key=lambda pair: -pair[1]) return word_list[:number] From 0b4e82a9bbcbc81ee0ca37845d37370a36f3f264 Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Fri, 29 May 2015 09:23:14 -0400 Subject: [PATCH 10/10] updated .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9b42106..08bb6b1 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .direnv/ +__pycache__