Use punkt_tab instead of punkt

This is due to the pickle issue reported in nltk/nltk#3293.
digitalmethodsinitiative · Oct 8, 2024 · a269f96 · a269f96
1 parent e4c0099
commit a269f96
Show file tree

Hide file tree

Showing 3 changed files with 4 additions and 4 deletions.
diff --git a/helper-scripts/first-run.py b/helper-scripts/first-run.py
@@ -40,7 +40,7 @@
 
 # Now check for presence of required NLTK packages
 import nltk
-nltk_downloads = ("wordnet", "punkt", "omw-1.4")
+nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4")
 for package in nltk_downloads:
     # if it already exists, .download() will just NOP
     try:

diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py
@@ -69,9 +69,9 @@ def check_for_nltk():
 	# NLTK
 	import nltk
 	try:
-		nltk.data.find('tokenizers/punkt')
+		nltk.data.find('tokenizers/punkt_tab')
 	except LookupError:
-		nltk.download('punkt', quiet=True)
+		nltk.download('punkt_tab', quiet=True)
 	try:
 		nltk.data.find('corpora/wordnet')
 	except LookupError:

diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
@@ -357,7 +357,7 @@ def dummy_function(x, *args, **kwargs):
 				# for russian we use a special purpose splitter with better
 				# performance
 				sentence_method = razdel.sentenize
-			elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if
+			elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if
 								'pickle' in lang]:
 				self.dataset.update_status(
 					f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")