From 0a6cd9bde017bdc69f6c41fecb8774f98eb64635 Mon Sep 17 00:00:00 2001
From: Zachary <hello@zach.ie>
Date: Mon, 23 Jul 2018 20:45:35 +0100
Subject: [PATCH 1/2] Add lemmatization to get_raw_words util function

---
 .circleci/config.yml | 2 +-
 README.md            | 2 +-
 utils.py             | 9 +++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index f7a43e5..0340861 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -19,7 +19,7 @@ jobs:
           command: |
             sudo pip install pipenv --upgrade
             pipenv install --dev
-            pipenv run python -m nltk.downloader stopwords
+            pipenv run python -m nltk.downloader stopwords wordnet
 
       - save_cache: # save dependency cache
           key: deps-{{ .Branch }}-{{ checksum "Pipfile.lock" }}
diff --git a/README.md b/README.md
index 28826f6..5cf75bb 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Welcome to HQ Trivia Bot! Thanks for contributing. Here are the steps to get sta
  * Ensure you have Python 3 installed on your system.
  * Install Pipenv `sudo pip install pipenv`
  * Create Pipenv virtual environment `pipenv --three install --dev`
- * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords`
+ * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords wordnet`
 
 
 ### Run HQ Trivia Bot
diff --git a/utils.py b/utils.py
index 073eb87..8529a3c 100644
--- a/utils.py
+++ b/utils.py
@@ -2,7 +2,9 @@
 import re
 from enum import Enum
 from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
 
+Lemmatizer = WordNetLemmatizer()
 
 class Colours(Enum):
     """ console colours """
@@ -32,8 +34,11 @@ def get_significant_words(question_words):
     return list(filter(lambda word: word not in our_stopwords, question_words.split(' ')))
 
 
+
+
 def get_raw_words(data):
     """ Extract raw words from data """
-    data = re.sub(r'[^\w ]', '', data).replace(' and ', ' ').strip()
-    words = data.replace('  ', ' ').lower()
+    data = re.sub(r'[^\w ]', '', data).lower().replace(' and ', ' ')
+    words_list = data.replace('  ', ' ').strip().split(' ')
+    words = ' '.join([Lemmatizer.lemmatize(word) for word in words_list])
     return words

From 9a062ffffbbb8bd095c7f184790fe9dc65d7bc16 Mon Sep 17 00:00:00 2001
From: Zachary <hello@zach.ie>
Date: Mon, 23 Jul 2018 21:30:31 +0100
Subject: [PATCH 2/2] Rename Lemmatizer instance to WORDNET constant

---
 utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils.py b/utils.py
index 8529a3c..47885a1 100644
--- a/utils.py
+++ b/utils.py
@@ -4,7 +4,7 @@
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 
-Lemmatizer = WordNetLemmatizer()
+WORDNET = WordNetLemmatizer()
 
 class Colours(Enum):
     """ console colours """
@@ -40,5 +40,5 @@ def get_raw_words(data):
     """ Extract raw words from data """
     data = re.sub(r'[^\w ]', '', data).lower().replace(' and ', ' ')
     words_list = data.replace('  ', ' ').strip().split(' ')
-    words = ' '.join([Lemmatizer.lemmatize(word) for word in words_list])
+    words = ' '.join([WORDNET.lemmatize(word) for word in words_list])
     return words