From 1d749c3cf83b130ba70bdb09174f382d6711a14b Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Wed, 21 Aug 2024 12:52:54 +0200
Subject: [PATCH] Set UTF-8 encoding when opening stop words (fixes Windows bug)

---
 processors/text-analysis/tokenise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index ad9197033..fb1b89cbd 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -239,7 +239,7 @@ def process(self):
 		numbers = re.compile(r"\b[0-9]+\b")
 
 		# load general stopwords dictionary
-		with config.get("PATH_ROOT").joinpath("common/assets/stopwords-iso.json").open() as infile:
+		with open(config.get("PATH_ROOT").joinpath("common/assets/stopwords-iso.json"), encoding="utf-8") as infile:
 			stopwords_iso = json.load(infile)
 
 		# Twitter tokenizer if indicated
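
Note (not part of the patch): a minimal, self-contained sketch of why the explicit encoding argument matters. When open() or Path.open() is called without an encoding, Python falls back to the locale's preferred encoding, which on many Windows installs is cp1252, so reading the UTF-8 stopwords-iso.json can raise UnicodeDecodeError or mangle non-ASCII characters. The relative path and the final print below are illustrative only; in 4CAT the file is resolved through config.get("PATH_ROOT").

import json
from pathlib import Path

# Hypothetical location for this sketch; 4CAT resolves it via config.get("PATH_ROOT").
stopwords_path = Path("common/assets/stopwords-iso.json")

# Before the patch: no encoding argument, so Python uses the platform's
# locale encoding (often cp1252 on Windows), which can fail on UTF-8 input.
# with stopwords_path.open() as infile:
#     stopwords_iso = json.load(infile)

# After the patch: the encoding is explicit, so the file is decoded the
# same way on every platform regardless of locale settings.
with open(stopwords_path, encoding="utf-8") as infile:
    stopwords_iso = json.load(infile)

print(f"{len(stopwords_iso)} languages loaded")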