From 1d749c3cf83b130ba70bdb09174f382d6711a14b Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Wed, 21 Aug 2024 12:52:54 +0200
Subject: [PATCH] Set UTF-8 encoding when opening stop words (fixes Windows bug)

---
 processors/text-analysis/tokenise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index ad9197033..fb1b89cbd 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -239,7 +239,7 @@ def process(self):
 		numbers = re.compile(r"\b[0-9]+\b")
 
 		# load general stopwords dictionary
-		with config.get("PATH_ROOT").joinpath("common/assets/stopwords-iso.json").open() as infile:
+		with open(config.get("PATH_ROOT").joinpath("common/assets/stopwords-iso.json"), encoding="utf-8") as infile:
 			stopwords_iso = json.load(infile)
 
 		# Twitter tokenizer if indicated
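
Note (not part of the patch): a minimal, self-contained sketch of why the explicit encoding argument matters. When open() or Path.open() is called without an encoding, Python falls back to the locale's preferred encoding, which on many Windows installs is cp1252, so reading the UTF-8 stopwords-iso.json can raise UnicodeDecodeError or mangle non-ASCII characters. The relative path and the final print below are illustrative only; in 4CAT the file is resolved through config.get("PATH_ROOT").

import json
from pathlib import Path

# Hypothetical location for this sketch; 4CAT resolves it via config.get("PATH_ROOT").
stopwords_path = Path("common/assets/stopwords-iso.json")

# Before the patch: no encoding argument, so Python uses the platform's
# locale encoding (often cp1252 on Windows), which can fail on UTF-8 input.
# with stopwords_path.open() as infile:
#     stopwords_iso = json.load(infile)

# After the patch: the encoding is explicit, so the file is decoded the
# same way on every platform regardless of locale settings.
with open(stopwords_path, encoding="utf-8") as infile:
    stopwords_iso = json.load(infile)

print(f"{len(stopwords_iso)} languages loaded")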