From a269f96ed0cf296400fc1d5b4252d0a6765dda52 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 8 Oct 2024 12:31:22 +0200
Subject: [PATCH] use punkt_tab instead of punkt

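Recent NLTK releases no longer load the pickled `punkt` models and look up
the `punkt_tab` resource instead, so download and locate `punkt_tab`
everywhere `punkt` was used.
See https://github.com/nltk/nltk/issues/3293

NOTE(review): `punkt_tab` appears to ship one directory per language rather
than `<language>.pickle` files, so the `'pickle' in lang` filter kept in
tokenise.py may now match nothing; verify before merging.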
---
 helper-scripts/first-run.py          | 2 +-
 helper-scripts/migrate.py            | 4 ++--
 processors/text-analysis/tokenise.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/helper-scripts/first-run.py b/helper-scripts/first-run.py
index dea0fd487..a565a591e 100644
--- a/helper-scripts/first-run.py
+++ b/helper-scripts/first-run.py
@@ -40,7 +40,7 @@
 
 # Now check for presence of required NLTK packages
 import nltk
-nltk_downloads = ("wordnet", "punkt", "omw-1.4")
+nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4")
 for package in nltk_downloads:
     # if it already exists, .download() will just NOP
     try:
diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py
index 25071afe4..55c26c044 100644
--- a/helper-scripts/migrate.py
+++ b/helper-scripts/migrate.py
@@ -69,9 +69,9 @@ def check_for_nltk():
 	# NLTK
 	import nltk
 	try:
-		nltk.data.find('tokenizers/punkt')
+		nltk.data.find('tokenizers/punkt_tab')
 	except LookupError:
-		nltk.download('punkt', quiet=True)
+		nltk.download('punkt_tab', quiet=True)
 	try:
 		nltk.data.find('corpora/wordnet')
 	except LookupError:
diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index a104306f1..17c350c86 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -357,7 +357,7 @@ def dummy_function(x, *args, **kwargs):
 				# for russian we use a special purpose splitter with better
 				# performance
 				sentence_method = razdel.sentenize
-			elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if
+			elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if
 								'pickle' in lang]:
 				self.dataset.update_status(
 					f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")