HQDerek · zachd · Jul 23, 2018 · Jul 23, 2018
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -19,7 +19,7 @@ jobs:
           command: |
             sudo pip install pipenv --upgrade
             pipenv install --dev
-            pipenv run python -m nltk.downloader stopwords
+            pipenv run python -m nltk.downloader stopwords wordnet
 
       - save_cache: # save dependency cache
           key: deps-{{ .Branch }}-{{ checksum "Pipfile.lock" }}

diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ Welcome to HQ Trivia Bot! Thanks for contributing. Here are the steps to get sta
  * Ensure you have Python 3 installed on your system.
  * Install Pipenv `sudo pip install pipenv`
  * Create Pipenv virtual environment `pipenv --three install --dev`
- * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords`
+ * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords wordnet`
 
 
 ### Run HQ Trivia Bot

diff --git a/utils.py b/utils.py
@@ -2,7 +2,9 @@
 import re
 from enum import Enum
 from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
 
+WORDNET = WordNetLemmatizer()
 
 class Colours(Enum):
     """ console colours """
@@ -32,8 +34,11 @@ def get_significant_words(question_words):
     return list(filter(lambda word: word not in our_stopwords, question_words.split(' ')))
 
 
+
+
 def get_raw_words(data):
     """ Extract raw words from data """
-    data = re.sub(r'[^\w ]', '', data).replace(' and ', ' ').strip()
-    words = data.replace('  ', ' ').lower()
+    data = re.sub(r'[^\w ]', '', data).lower().replace(' and ', ' ')
+    words_list = data.replace('  ', ' ').strip().split(' ')
+    words = ' '.join([WORDNET.lemmatize(word) for word in words_list])
     return words