From 7b4017be7cdb80cb6aea020f43abb9d765093c0f Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassalacqua@Daniels-MacBook-Pro.local>
Date: Thu, 28 May 2015 13:24:36 -0400
Subject: [PATCH 01/10] added word_frequency.py

---
 word_frequency.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 word_frequency.py

diff --git a/word_frequency.py b/word_frequency.py
new file mode 100644
index 0000000..b199df5
--- /dev/null
+++ b/word_frequency.py
@@ -0,0 +1 @@
+import re

From b6bfd1d953e8cb71462c14cf8a5c1c5bd6277ef1 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassalacqua@Daniels-MacBook-Pro.local>
Date: Thu, 28 May 2015 13:32:14 -0400
Subject: [PATCH 02/10] Added basic program flow in comments

---
 word_frequency.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/word_frequency.py b/word_frequency.py
index b199df5..8ba857b 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -1 +1,7 @@
 import re
+
+
+# Read file.
+# Create Dictionary.
+# Keep top 20 words from dictionary.
+# Print out the top 20 words.

From 6c8f5b15e33238fa6372600d87e924756e577bfc Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassalacqua@Daniels-MacBook-Pro.local>
Date: Thu, 28 May 2015 13:59:21 -0400
Subject: [PATCH 03/10] Added a function to reformat the input from a file

---
 word_frequency.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/word_frequency.py b/word_frequency.py
index 8ba857b..b1016b1 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -1,7 +1,22 @@
 import re
 
 
+# Functions
+
+
+# Takes a list of strings, joins them, and removes any punctuation.
+def reformat(text):
+    file_text = ' '.join(file_text)
+    # Remove whitespace.
+    file_text = re.sub(r'[^A-Za-z1-9 ]', "", file_text)
+    file_text = re.sub(r'[ *]', " ", file_text)
+
+
 # Read file.
+with open('sample.txt') as f:
+    file_text = f.readlines()
+
+file_text = reformat(file_text)
 # Create Dictionary.
 # Keep top 20 words from dictionary.
 # Print out the top 20 words.

From a02d84f1ab6649eaaa80c53e371cc52becf694d8 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Thu, 28 May 2015 14:18:09 -0400
Subject: [PATCH 04/10] added word_frequency function and modified reformat to
 take a string

---
 word_frequency.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/word_frequency.py b/word_frequency.py
index b1016b1..05d7ff6 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -1,22 +1,34 @@
-import re
+from collections import defaultdict
+from re import sub
 
 
 # Functions
 
 
-# Takes a list of strings, joins them, and removes any punctuation.
+# Takes a string, keeps characters and numbers, and lowercases them.
 def reformat(text):
-    file_text = ' '.join(file_text)
-    # Remove whitespace.
-    file_text = re.sub(r'[^A-Za-z1-9 ]', "", file_text)
-    file_text = re.sub(r'[ *]', " ", file_text)
+    text = re.sub(r'[\n]', " ", text)
+    text = re.sub(r'[^A-Za-z1-9 ]', "", text)
+    text = re.sub(r'[ *]', " ", text)
+    text.lower()
+    return text
+
+
+# Takes a string and returns a defaultdict(int) histogram of the words
+def word_frequency(file_text):
+    file_text = reformat(file_text)
+    word_list = file_text.split()
+    histogram = defaultdict()
+    for word in word_list:
+        histogram[word] += 1
+    return histogram
 
 
 # Read file.
 with open('sample.txt') as f:
-    file_text = f.readlines()
+    file_text = f.read()
+
 
-file_text = reformat(file_text)
 # Create Dictionary.
 # Keep top 20 words from dictionary.
 # Print out the top 20 words.

From 7452bbcba41d17be27ecf7029e7d7c9bb601cf91 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Thu, 28 May 2015 14:48:49 -0400
Subject: [PATCH 05/10] fixed issue with imports, and modified word_frequency
 to return a dict

---
 word_frequency.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/word_frequency.py b/word_frequency.py
index 05d7ff6..23345b7 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -1,10 +1,9 @@
+import re
 from collections import defaultdict
-from re import sub
 
 
 # Functions
 
-
 # Takes a string, keeps characters and numbers, and lowercases them.
 def reformat(text):
     text = re.sub(r'[\n]', " ", text)
@@ -18,16 +17,18 @@ def reformat(text):
 def word_frequency(file_text):
     file_text = reformat(file_text)
     word_list = file_text.split()
-    histogram = defaultdict()
+    histogram = defaultdict(int)
     for word in word_list:
         histogram[word] += 1
-    return histogram
+    return dict(histogram)
 
 
 # Read file.
 with open('sample.txt') as f:
     file_text = f.read()
 
+histogram_dict = word_frequency(file_text)
+
 
 # Create Dictionary.
 # Keep top 20 words from dictionary.

From 5442c5b2a3d3ef918dfdae0bd7257e54d910a12e Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Thu, 28 May 2015 15:34:38 -0400
Subject: [PATCH 06/10] added top_words and print_results functions; fixed
 reformat to keep the lowercased text

---
 word_frequency.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/word_frequency.py b/word_frequency.py
index 23345b7..1da50a8 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -9,11 +9,11 @@ def reformat(text):
     text = re.sub(r'[\n]', " ", text)
     text = re.sub(r'[^A-Za-z1-9 ]', "", text)
     text = re.sub(r'[ *]', " ", text)
-    text.lower()
+    text = text.lower()
     return text
 
 
-# Takes a string and returns a defaultdict(int) histogram of the words
+# Takes a string and returns a dict histogram of the words.
 def word_frequency(file_text):
     file_text = reformat(file_text)
     word_list = file_text.split()
@@ -23,13 +23,29 @@ def word_frequency(file_text):
     return dict(histogram)
 
 
+# Takes a dictionary and returns a list of tuples of the top number of words.
+def top_words(word_dictionary, number):
+    word_list = []
+    for key, value in word_dictionary.items():
+        word_list.append((key, value))
+    word_list.sort(key=lambda tuple_: -tuple_[1])
+    return word_list[:number]
+
+
+# Takes a list of tuples and prints them.
+def print_results(word_list):
+    for word, count in word_list:
+        print("{} {}".format(word, count))
+
 # Read file.
 with open('sample.txt') as f:
     file_text = f.read()
 
+# Create dictionary.
 histogram_dict = word_frequency(file_text)
 
-
-# Create Dictionary.
 # Keep top 20 words from dictionary.
+frequent_words = top_words(histogram_dict, 20)
+
 # Print out the top 20 words.
+print_results(frequent_words)

From e47174e0f1139d265593d40739bd7707614ec143 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Thu, 28 May 2015 16:49:34 -0400
Subject: [PATCH 07/10] added ignored_words.txt; input file must now be
 received from the command line; changed print_results to print a scaled
 histogram

---
 ignored_words.txt |  8 ++++++++
 word_frequency.py | 22 +++++++++++++++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 ignored_words.txt

diff --git a/ignored_words.txt b/ignored_words.txt
new file mode 100644
index 0000000..e6474df
--- /dev/null
+++ b/ignored_words.txt
@@ -0,0 +1,8 @@
+a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,
+because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,
+for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,
+it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,
+not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,
+since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,
+twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,
+would,yet,you,your
diff --git a/word_frequency.py b/word_frequency.py
index 1da50a8..d59ac69 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -1,13 +1,14 @@
 import re
+import sys
 from collections import defaultdict
 
 
 # Functions
 
 # Takes a string, keeps characters and numbers, and lowercases them.
-def reformat(text):
+def reformat(text, regex=r'[^A-Za-z1-9 ]'):
     text = re.sub(r'[\n]', " ", text)
-    text = re.sub(r'[^A-Za-z1-9 ]', "", text)
+    text = re.sub(regex, "", text)
     text = re.sub(r'[ *]', " ", text)
     text = text.lower()
     return text
@@ -18,8 +19,13 @@ def word_frequency(file_text):
     file_text = reformat(file_text)
     word_list = file_text.split()
     histogram = defaultdict(int)
+    with open("ignored_words.txt") as f:
+        ignored_words = f.read()
+    ignored_words = reformat(ignored_words, r'[^A-Za-z1-9,]')
+    ignored_list = ignored_words.split(",")
     for word in word_list:
-        histogram[word] += 1
+        if word not in ignored_list:
+            histogram[word] += 1
     return dict(histogram)
 
 
@@ -34,11 +40,17 @@ def top_words(word_dictionary, number):
 
 # Takes a list of tuples and prints them.
 def print_results(word_list):
+    max_word_length = 0
+    for word, x in word_list:
+        max_word_length = max([max_word_length, len(word)])
+    scale = word_list[0][1]/50
     for word, count in word_list:
-        print("{} {}".format(word, count))
+        count_text = "#"*int(count//scale)
+        word_text = word + " "*(max_word_length - len(word))
+        print("{} {}".format(word_text, count_text))
 
 # Read file.
-with open('sample.txt') as f:
+with open(sys.argv[1]) as f:
     file_text = f.read()
 
 # Create dictionary.

From efd15d37c1d1529f7280fe6f1ffebb805d6f4f47 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Fri, 29 May 2015 08:01:29 -0400
Subject: [PATCH 08/10] refactored top_words and print_results

---
 word_frequency.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/word_frequency.py b/word_frequency.py
index d59ac69..630aef3 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -31,21 +31,17 @@ def word_frequency(file_text):
 
 # Takes a dictionary and returns a list of tuples of the top number of words.
 def top_words(word_dictionary, number):
-    word_list = []
-    for key, value in word_dictionary.items():
-        word_list.append((key, value))
+    word_list = [x for x in word_dictionary.items()]
     word_list.sort(key=lambda tuple_: -tuple_[1])
     return word_list[:number]
 
 
 # Takes a list of tuples and prints them.
 def print_results(word_list):
-    max_word_length = 0
-    for word, x in word_list:
-        max_word_length = max([max_word_length, len(word)])
+    max_word_length = max(len(pair[0]) for pair in word_list)
     scale = word_list[0][1]/50
     for word, count in word_list:
-        count_text = "#"*int(count//scale)
+        count_text = "#"*int(round(count/scale))
         word_text = word + " "*(max_word_length - len(word))
         print("{} {}".format(word_text, count_text))
 

From 6f926bdf9bdfc9865ee1b3935c0f9dc70677d761 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Fri, 29 May 2015 09:10:32 -0400
Subject: [PATCH 09/10] made some variable names more descriptive

---
 word_frequency.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/word_frequency.py b/word_frequency.py
index 630aef3..4d674e8 100644
--- a/word_frequency.py
+++ b/word_frequency.py
@@ -31,8 +31,8 @@ def word_frequency(file_text):
 
 # Takes a dictionary and returns a list of tuples of the top number of words.
 def top_words(word_dictionary, number):
-    word_list = [x for x in word_dictionary.items()]
-    word_list.sort(key=lambda tuple_: -tuple_[1])
+    word_list = [pair for pair in word_dictionary.items()]
+    word_list.sort(key=lambda pair: -pair[1])
     return word_list[:number]
 
 

From 0b4e82a9bbcbc81ee0ca37845d37370a36f3f264 Mon Sep 17 00:00:00 2001
From: PJ Passalacqua <pjpassa@gmail.com>
Date: Fri, 29 May 2015 09:23:14 -0400
Subject: [PATCH 10/10] updated .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 9b42106..08bb6b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 .direnv/
+__pycache__