From a269f96ed0cf296400fc1d5b4252d0a6765dda52 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Tue, 8 Oct 2024 12:31:22 +0200
Subject: [PATCH] use punkt_tab instead of punkt due to pickle issue: https://github.com/nltk/nltk/issues/3293

punkt_tab keeps each language's model in its own directory
(tokenizers/punkt_tab/<language>/) instead of a <language>.pickle file.
The availability check in tokenise.py therefore lists those directories
rather than filtering on 'pickle', which would match nothing and make
every language look unavailable for sentence tokenisation.

---
NOTE(review): leading whitespace on the hunk lines below was reconstructed
and may not match the target files byte-for-byte; confirm the indentation
(tabs vs. spaces, depth) before applying, or apply with --ignore-whitespace
and re-check the result.

 helper-scripts/first-run.py          | 2 +-
 helper-scripts/migrate.py            | 4 ++--
 processors/text-analysis/tokenise.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/helper-scripts/first-run.py b/helper-scripts/first-run.py
index dea0fd487..a565a591e 100644
--- a/helper-scripts/first-run.py
+++ b/helper-scripts/first-run.py
@@ -40,7 +40,7 @@
 # Now check for presence of required NLTK packages
 import nltk

-nltk_downloads = ("wordnet", "punkt", "omw-1.4")
+nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4")
 for package in nltk_downloads:
     # if it already exists, .download() will just NOP
     try:
diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py
index 25071afe4..55c26c044 100644
--- a/helper-scripts/migrate.py
+++ b/helper-scripts/migrate.py
@@ -69,9 +69,9 @@ def check_for_nltk():
     # NLTK
     import nltk
     try:
-        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('tokenizers/punkt_tab')
     except LookupError:
-        nltk.download('punkt', quiet=True)
+        nltk.download('punkt_tab', quiet=True)
     try:
         nltk.data.find('corpora/wordnet')
     except LookupError:
diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index a104306f1..17c350c86 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -357,7 +357,7 @@ def dummy_function(x, *args, **kwargs):
                 # for russian we use a special purpose splitter with better
                 # performance
                 sentence_method = razdel.sentenize
-            elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if
-                                  'pickle' in lang]:
+            elif language not in [lang for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if
+                                  lang != 'README']:
                 self.dataset.update_status(
                     f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")