Skip to content

Commit

Permalink
use punkt_tab instead of punkt
Browse files (browse the repository at this point in the history)
due to pickle issue: nltk/nltk#3293
  • Loading branch information
dale-wahl committed Oct 8, 2024
1 parent e4c0099 commit a269f96
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion helper-scripts/first-run.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

# Now check for presence of required NLTK packages
import nltk
nltk_downloads = ("wordnet", "punkt", "omw-1.4")
nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4")
for package in nltk_downloads:
# if the package is already installed, .download() is a no-op
try:
Expand Down
4 changes: 2 additions & 2 deletions helper-scripts/migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def check_for_nltk():
# NLTK
import nltk
try:
nltk.data.find('tokenizers/punkt')
nltk.data.find('tokenizers/punkt_tab')
except LookupError:
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
try:
nltk.data.find('corpora/wordnet')
except LookupError:
Expand Down
2 changes: 1 addition & 1 deletion processors/text-analysis/tokenise.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def dummy_function(x, *args, **kwargs):
# for Russian we use a special-purpose splitter with better
# performance
sentence_method = razdel.sentenize
elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if
elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if
'pickle' in lang]:
self.dataset.update_status(
f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")
Expand Down

0 comments on commit a269f96

Please sign in to comment.