diff --git a/pinecone_text/sparse/bm25_tokenizer.py b/pinecone_text/sparse/bm25_tokenizer.py index 9513176..298deff 100644 --- a/pinecone_text/sparse/bm25_tokenizer.py +++ b/pinecone_text/sparse/bm25_tokenizer.py @@ -34,9 +34,9 @@ def __init__( @staticmethod def nltk_setup() -> None: try: - nltk.data.find("tokenizers/punkt") + nltk.data.find("tokenizers/punkt_tab") except LookupError: - nltk.download("punkt") + nltk.download("punkt_tab") try: nltk.data.find("corpora/stopwords") diff --git a/tests/unit/test_bm25_tokenizer.py b/tests/unit/test_bm25_tokenizer.py index 5ed1115..95984d6 100644 --- a/tests/unit/test_bm25_tokenizer.py +++ b/tests/unit/test_bm25_tokenizer.py @@ -152,7 +152,7 @@ def test_nltk_download(self): language="english", ) - nltk.find("tokenizers/punkt") + nltk.find("tokenizers/punkt_tab") nltk.find("corpora/stopwords") assert tokenizer("The quick brown fox jumps over the lazy dog") == [