From 0a6cd9bde017bdc69f6c41fecb8774f98eb64635 Mon Sep 17 00:00:00 2001 From: Zachary Date: Mon, 23 Jul 2018 20:45:35 +0100 Subject: [PATCH 1/2] Add lemmatization to get_raw_words util function --- .circleci/config.yml | 2 +- README.md | 2 +- utils.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f7a43e5..0340861 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,7 +19,7 @@ jobs: command: | sudo pip install pipenv --upgrade pipenv install --dev - pipenv run python -m nltk.downloader stopwords + pipenv run python -m nltk.downloader stopwords wordnet - save_cache: # save dependency cache key: deps-{{ .Branch }}-{{ checksum "Pipfile.lock" }} diff --git a/README.md b/README.md index 28826f6..5cf75bb 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Welcome to HQ Trivia Bot! Thanks for contributing. Here are the steps to get sta * Ensure you have Python 3 installed on your system. * Install Pipenv `sudo pip install pipenv` * Create Pipenv virtual environment `pipenv --three install --dev` - * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords` + * Install NLTK corpora `pipenv run python3 -m nltk.downloader stopwords wordnet` ### Run HQ Trivia Bot diff --git a/utils.py b/utils.py index 073eb87..8529a3c 100644 --- a/utils.py +++ b/utils.py @@ -2,7 +2,9 @@ import re from enum import Enum from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +Lemmatizer = WordNetLemmatizer() class Colours(Enum): """ console colours """ @@ -32,8 +34,11 @@ def get_significant_words(question_words): return list(filter(lambda word: word not in our_stopwords, question_words.split(' '))) + + def get_raw_words(data): """ Extract raw words from data """ - data = re.sub(r'[^\w ]', '', data).replace(' and ', ' ').strip() - words = data.replace(' ', ' ').lower() + data = re.sub(r'[^\w ]', '', data).lower().replace(' and ', ' ') + words_list = data.replace(' ', ' ').strip().split(' ') + words = ' '.join([Lemmatizer.lemmatize(word) for word in words_list]) return words From 9a062ffffbbb8bd095c7f184790fe9dc65d7bc16 Mon Sep 17 00:00:00 2001 From: Zachary Date: Mon, 23 Jul 2018 21:30:31 +0100 Subject: [PATCH 2/2] Rename lemmatizer as constant --- utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.py b/utils.py index 8529a3c..47885a1 100644 --- a/utils.py +++ b/utils.py @@ -4,7 +4,7 @@ from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer -Lemmatizer = WordNetLemmatizer() +WORDNET = WordNetLemmatizer() class Colours(Enum): """ console colours """ @@ -40,5 +40,5 @@ def get_raw_words(data): """ Extract raw words from data """ data = re.sub(r'[^\w ]', '', data).lower().replace(' and ', ' ') words_list = data.replace(' ', ' ').strip().split(' ') - words = ' '.join([Lemmatizer.lemmatize(word) for word in words_list]) + words = ' '.join([WORDNET.lemmatize(word) for word in words_list]) return words