diff --git a/.circleci/config.yml b/.circleci/config.yml index f7a43e5..0340861 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,7 +19,7 @@ jobs: command: | sudo pip install pipenv --upgrade pipenv install --dev - pipenv run python -m nltk.downloader stopwords + pipenv run python -m nltk.downloader stopwords wordnet - save_cache: # save dependency cache key: deps-{{ .Branch }}-{{ checksum "Pipfile.lock" }} diff --git a/README.md b/README.md index 28826f6..5cf75bb 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Welcome to HQ Trivia Bot! Thanks for contributing. Here are the steps to get sta * Ensure you have Python 3 installed on your system. * Install Pipenv `sudo pip install pipenv` * Create Pipenv virtual environment `pipenv --three install --dev` - * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords` + * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords wordnet` ### Run HQ Trivia Bot diff --git a/utils.py b/utils.py index 073eb87..47885a1 100644 --- a/utils.py +++ b/utils.py @@ -2,7 +2,9 @@ import re from enum import Enum from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +WORDNET = WordNetLemmatizer() class Colours(Enum): """ console colours """ @@ -32,8 +34,11 @@ def get_significant_words(question_words): return list(filter(lambda word: word not in our_stopwords, question_words.split(' '))) + + def get_raw_words(data): """ Extract raw words from data """ - data = re.sub(r'[^\w ]', '', data).replace(' and ', ' ').strip() - words = data.replace(' ', ' ').lower() + data = re.sub(r'[^\w ]', '', data).lower().replace(' and ', ' ') + words_list = data.replace(' ', ' ').strip().split(' ') + words = ' '.join([WORDNET.lemmatize(word) for word in words_list]) return words