diff --git a/helper-scripts/first-run.py b/helper-scripts/first-run.py
index dea0fd487..a565a591e 100644
--- a/helper-scripts/first-run.py
+++ b/helper-scripts/first-run.py
@@ -40,7 +40,7 @@
 # Now check for presence of required NLTK packages
 import nltk
-nltk_downloads = ("wordnet", "punkt", "omw-1.4")
+nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4")
 for package in nltk_downloads:
 	# if it already exists, .download() will just NOP
 	try:
diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py
index 25071afe4..55c26c044 100644
--- a/helper-scripts/migrate.py
+++ b/helper-scripts/migrate.py
@@ -69,9 +69,9 @@ def check_for_nltk():
 	# NLTK
 	import nltk
 	try:
-		nltk.data.find('tokenizers/punkt')
+		nltk.data.find('tokenizers/punkt_tab')
 	except LookupError:
-		nltk.download('punkt', quiet=True)
+		nltk.download('punkt_tab', quiet=True)
 	try:
 		nltk.data.find('corpora/wordnet')
 	except LookupError:
diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index a104306f1..17c350c86 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -357,7 +357,6 @@ def dummy_function(x, *args, **kwargs):
 				# for russian we use a special purpose splitter with better
 				# performance
 				sentence_method = razdel.sentenize
-			elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if
-					'pickle' in lang]:
+			elif language not in os.listdir(nltk.data.find('tokenizers/punkt_tab')):
 				self.dataset.update_status(
 					f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")