From 2e2470656a303136cddfbf93a71c0712724ef7e7 Mon Sep 17 00:00:00 2001
From: Emiel Steerneman
Date: Mon, 26 Aug 2024 14:05:55 +0200
Subject: [PATCH] Replace deprecated 'punkt' with 'punkt_tab'

---
 pinecone_text/sparse/bm25_tokenizer.py | 4 ++--
 tests/unit/test_bm25_tokenizer.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pinecone_text/sparse/bm25_tokenizer.py b/pinecone_text/sparse/bm25_tokenizer.py
index 9513176..298deff 100644
--- a/pinecone_text/sparse/bm25_tokenizer.py
+++ b/pinecone_text/sparse/bm25_tokenizer.py
@@ -34,9 +34,9 @@ def __init__(
     @staticmethod
     def nltk_setup() -> None:
         try:
-            nltk.data.find("tokenizers/punkt")
+            nltk.data.find("tokenizers/punkt_tab")
         except LookupError:
-            nltk.download("punkt")
+            nltk.download("punkt_tab")
 
         try:
             nltk.data.find("corpora/stopwords")
diff --git a/tests/unit/test_bm25_tokenizer.py b/tests/unit/test_bm25_tokenizer.py
index 5ed1115..95984d6 100644
--- a/tests/unit/test_bm25_tokenizer.py
+++ b/tests/unit/test_bm25_tokenizer.py
@@ -152,7 +152,7 @@ def test_nltk_download(self):
             language="english",
         )
 
-        nltk.find("tokenizers/punkt")
+        nltk.find("tokenizers/punkt_tab")
         nltk.find("corpora/stopwords")
 
         assert tokenizer("The quick brown fox jumps over the lazy dog") == [