From 48c20c2ebac382bd41b92da4481ff7d832dc1538 Mon Sep 17 00:00:00 2001
From: Desktop Sal
Date: Wed, 11 Sep 2024 13:54:23 +0200
Subject: [PATCH] Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies

---
 processors/text-analysis/get_entities.py  | 172 ---------------
 processors/text-analysis/get_nouns.py     | 196 ------------------
 .../text-analysis/linguistic_extractor.py | 168 ---------------
 setup.py                                  |   4 +-
 4 files changed, 1 insertion(+), 539 deletions(-)
 delete mode 100644 processors/text-analysis/get_entities.py
 delete mode 100644 processors/text-analysis/get_nouns.py
 delete mode 100644 processors/text-analysis/linguistic_extractor.py

diff --git a/processors/text-analysis/get_entities.py b/processors/text-analysis/get_entities.py
deleted file mode 100644
index e639c7672..000000000
--- a/processors/text-analysis/get_entities.py
+++ /dev/null
@@ -1,172 +0,0 @@
-"""
-Extract named entities from SpaCy NLP docs.
-
-"""
-import pickle
-import spacy
-
-from collections import Counter
-from spacy.tokens import DocBin
-from common.lib.helpers import UserInput
-from backend.lib.processor import BasicProcessor
-from common.lib.exceptions import ProcessorInterruptedException
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class ExtractNouns(BasicProcessor):  # TEMPORARILY DISABLED
-    """
-    Extract named entities from SpaCy docs and rank them by frequency
-    """
-    type = "get-entities"  # job type ID
-    category = "Text analysis"  # category
-    title = "Extract named entities"  # title displayed in UI
-    description = "Retrieve named entities detected by SpaCy, ranked on frequency. Be sure to have selected " \
-                  "\"Named Entity Recognition\" in the previous module."  # description displayed in UI
-    extension = "csv"  # extension of result file, used internally and in UI
-
-    followups = ["wordcloud"]
-
-    options = {
-        "entities": {
-            "type": UserInput.OPTION_MULTI,
-            "default": [],
-            "options": {
-                "PERSON": "PERSON: People, including fictional.",
-                "NORP": "NORP: Nationalities or religious or political groups.",
-                "FAC": "FAC: Buildings, airports, highways, bridges, etc.",
-                "ORG": "ORG: Companies, agencies, institutions, etc.",
-                "GPE": "GPE: Countries, cities, states.",
-                "LOC": "LOC: Non-GPE locations, mountain ranges, bodies of water.",
-                "PRODUCT": "PRODUCT: Objects, vehicles, foods, etc. (Not services.)",
-                "EVENT": "EVENT: Named hurricanes, battles, wars, sports events, etc.",
-                "WORK_OF_ART": "WORK_OF_ART: Titles of books, songs, etc.",
-                "LAW": "LAW: Named documents made into laws.",
-                "LANGUAGE": "LANGUAGE: Any named language.",
-                "DATE": "DATE: Absolute or relative dates or periods.",
-                "TIME": "TIME: Times smaller than a day.",
-                "PERCENT": "PERCENT: Percentage, including “%”.",
-                "MONEY": "MONEY: Monetary values, including unit.",
-                "QUANTITY": "QUANTITY: Measurements, as of weight or distance.",
-                "ORDINAL": "ORDINAL: “first”, “second”, etc.",
-                "CARDINAL": "CARDINAL: Numerals that do not fall under another type."
-            },
-            "help": "What types of entities to extract (select at least one)",
-            "tooltip": "The above list is derived from the SpaCy documentation (see references)."
-        }
-    }
-
-    references = [
-        "[SpaCy named entities list](https://spacy.io/api/annotation#named-entities)"
-    ]
-
-    @classmethod
-    def is_compatible_with(cls, module=None, user=None):
-        """
-        Allow processor on linguistic feature data
-
-        :param module: Module to determine compatibility with
-        """
-
-        return module.type == "linguistic-features"
-
-    def process(self):
-        """
-        Opens the SpaCy output and gets ze entities.
-
-        """
-
-        # Validate whether the user enabled the right parameters.
-        if "ner" not in self.source_dataset.parameters["enable"]:
-            self.dataset.update_status("Enable \"Named entity recognition\" in previous module")
-            self.dataset.finish(0)
-            return
-
-        else:
-            # Extract the SpaCy docs first
-            self.dataset.update_status("Unzipping SpaCy docs")
-
-            # Store all the entities in this list
-            li_entities = []
-            nlp = spacy.load("en_core_web_sm")  # Load model
-
-            for doc_file in self.iterate_archive_contents(self.source_file):
-                with doc_file.open("rb") as pickle_file:
-                    # Load DocBin
-                    file = pickle.load(pickle_file)
-                    doc_bin = DocBin().from_bytes(file)
-                    docs = list(doc_bin.get_docs(nlp.vocab))
-
-                for doc in docs:
-                    post_entities = []
-
-                    # stop processing if worker has been asked to stop
-                    if self.interrupted:
-                        raise ProcessorInterruptedException("Interrupted while processing documents")
-
-                    for ent in doc.ents:
-                        if ent.label_ in self.parameters["entities"]:
-                            post_entities.append((ent.text, ent.label_))  # Add a tuple
-
-                    li_entities.append(post_entities)
-
-            results = []
-
-            if li_entities:
-
-                # Also add the data to the original file, if indicated.
-                if self.parameters.get("overwrite"):
-                    self.add_field_to_parent(field_name='named_entities',
-                                             # Format like "Apple:ORG, Gates:PERSON, ..." and add to the row
-                                             new_data=[", ".join([":".join(pair) for pair in post_entities]) for post_entities in li_entities],
-                                             which_parent=self.dataset.top_parent(),
-                                             update_existing=True)
-
-                all_entities = []
-                # Convert to lower and filter out one-letter words. Join the words with the entities so we can group easily.
-                for post_ents in li_entities:
-                    for pair in post_ents:
-                        if pair and len(pair[0]) > 1:
-                            pair = pair[0].lower() + " |#| " + pair[1]
-                            all_entities.append(pair)
-
-                # Group and rank
-                count_entities = Counter(all_entities).most_common()
-                # Unsplit and list the count.
-                results = [{"word": tpl[0].split(" |#| ")[0], "entity": tpl[0].split(" |#| ")[1], "count": tpl[1]} for
-                           tpl in count_entities]
-
-        # done!
-        if results:
-            self.dataset.update_status("Finished")
-            self.write_csv_items_and_finish(results)
-        else:
-            self.dataset.update_status("Finished, but no entities were extracted.")
-            self.dataset.finish(0)
-
-    @classmethod
-    def get_options(cls, parent_dataset=None, user=None):
-        """
-        Get processor options
-
-        The feature of this processor that overwrites the parent dataset can
-        only work properly on csv datasets so check the extension before
-        showing it.
-
-        :param user:
-        :param parent_dataset:  Dataset to get options for
-        :return dict:
-        """
-        options = cls.options
-        if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]:
-            options["overwrite"] = {
-                "type": UserInput.OPTION_TOGGLE,
-                "default": False,
-                "help": "Add extracted named entities to source csv",
-                "tooltip": "Will add a column (\"named_entities\") with the found entities in the post row."
-            }
-
-        return options
diff --git a/processors/text-analysis/get_nouns.py b/processors/text-analysis/get_nouns.py
deleted file mode 100644
index cad8653eb..000000000
--- a/processors/text-analysis/get_nouns.py
+++ /dev/null
@@ -1,196 +0,0 @@
-"""
-Extract nouns from SpaCy NLP docs.
-
-"""
-import pickle
-import spacy
-
-from collections import Counter
-from spacy.tokens import DocBin
-from common.lib.helpers import UserInput
-from backend.lib.processor import BasicProcessor
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class ExtractNouns(BasicProcessor):
-    """
-    Extract nouns from SpaCy docs and rank them by frequency
-    """
-    type = "extract-nouns"  # job type ID
-    category = "Text analysis"  # category
-    title = "Extract nouns"  # title displayed in UI
-    description = "Retrieve nouns detected by SpaCy's part-of-speech tagging, and rank by frequency. " \
-                  "Make sure to have selected \"Part of Speech\" in the previous " \
-                  "module, as well as \"Dependency parsing\" if you want to extract compound nouns or noun chunks."  # description displayed in UI
-    extension = "csv"  # extension of result file, used internally and in UI
-
-    references = ["[Information on noun chunks](https://spacy.io/usage/linguistic-features#noun-chunks)"]
-
-    followups = ["wordcloud"]
-
-    options = {
-        "type": {
-            "type": UserInput.OPTION_CHOICE,
-            "default": "nouns",
-            "options": {
-                "nouns": "Single-word nouns",
-                "nouns_and_compounds": "Nouns and compound nouns",
-                "noun_chunks": "Noun chunks"
-            },
-            "help": "What to extract: 1) separate words tagged as nouns, 2) nouns and compound nouns " \
-                    "(nouns consisting of multiple words, e.g. \"United States\") using a custom parser, or 3) noun chunks: " \
-                    "nouns plus the words describing them, e.g. \"the old grandpa\". See the references for more info."
-        }
-    }
-
-    @classmethod
-    def is_compatible_with(cls, module=None, user=None):
-        """
-        Allow processor on linguistic feature data
-
-        :param module: Module to determine compatibility with
-        """
-        return module.type == "linguistic-features"
-
-    def process(self):
-        """
-        Opens the SpaCy output and gets ze nouns.
-
-        """
-        noun_type = self.parameters["type"]
-
-        # Validate whether the user enabled the right parameters.
-        # Check part of speech tagging
-        if "tagger" not in self.source_dataset.parameters["enable"]:
-            self.dataset.update_status("Enable \"Part-of-speech tagging\" in previous module")
-            self.dataset.finish(0)
-
-        # Check dependency parsing if compound nouns or noun chunks are selected
-        elif (noun_type == "nouns_and_compounds" or noun_type == "noun_chunks") and "parser" not in \
-                self.source_dataset.parameters["enable"]:
-            self.dataset.update_status(
-                "Enable \"Part-of-speech tagging\" and \"Dependency parsing\" for compound nouns/noun chunks in previous module")
-            self.dataset.finish(0)
-
-        # Valid parameters
-        else:
-
-            # Extract the SpaCy docs first
-            self.dataset.update_status("Unzipping SpaCy docs")
-            self.dataset.update_status("Extracting nouns")
-
-            # Store all the nouns in this list
-            li_nouns = []
-            nlp = spacy.load("en_core_web_sm")  # Load model
-
-            for doc_file in self.iterate_archive_contents(self.source_file):
-                with doc_file.open("rb") as pickle_file:
-                    # Load DocBin
-                    file = pickle.load(pickle_file)
-                    doc_bin = DocBin().from_bytes(file)
-                    docs = list(doc_bin.get_docs(nlp.vocab))
-
-                # Simply add each word if its POS is "NOUN"
-                if noun_type == "nouns":
-                    for doc in docs:
-                        post_nouns = []
-                        post_nouns += [token.text for token in doc if token.pos_ == "NOUN"]
-                        li_nouns.append(post_nouns)
-
-                # Use SpaCy's noun chunk detection
-                elif noun_type == "noun_chunks":
-
-                    for doc in docs:
-
-                        # Note: this is a workaround for now.
-                        # Serialization of the SpaCy docs does not
-                        # work well with dependency parsing after
-                        # loading. Quick fix: parse again.
-
-                        new_doc = nlp(doc.text)
-                        post_nouns = []
-                        for chunk in new_doc.noun_chunks:
-                            post_nouns.append(chunk.text)
-
-                        li_nouns.append(post_nouns)
-
-                # Use a custom script to get single nouns and compound nouns
-                elif noun_type == "nouns_and_compounds":
-                    for doc in docs:
-                        post_nouns = []
-                        noun = ""
-
-                        for i, token in enumerate(doc):
-
-                            # Check for common nouns (general, e.g. "people")
-                            # and proper nouns (specific, e.g. "London")
-                            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
-                                # Check if the token is part of a noun chunk
-                                if token.dep_ == "compound":  # Check for a compound relation
-                                    noun = token.text
-                                else:
-                                    if noun:
-                                        noun += " " + token.text
-                                        post_nouns.append(noun)
-                                        noun = ""
-                                    else:
-                                        post_nouns.append(token.text)
-                        li_nouns.append(post_nouns)
-
-            results = []
-
-            if li_nouns:
-
-                # Also add the data to the original file, if indicated.
-                if self.parameters.get("overwrite"):
-                    self.add_field_to_parent(field_name=noun_type,
-                                             # Format like "apple, gates, ..." and add to the row
-                                             new_data=[", ".join([post_noun.lower() for post_noun in li_noun if len(post_noun) > 1]) for li_noun in li_nouns],
-                                             which_parent=self.dataset.top_parent())
-
-                # convert to lower and filter out one-letter words
-                all_nouns = []
-                for post_n in li_nouns:
-                    all_nouns += [str(cap_noun).lower() for cap_noun in post_n if len(cap_noun) > 1]
-
-                # Group and rank
-                count_nouns = Counter(all_nouns).most_common()
-                results = [{"word": tpl[0], "count": tpl[1]} for tpl in count_nouns]
-
-            # done!
-            if results:
-                self.dataset.update_status("Finished")
-                self.write_csv_items_and_finish(results)
-            else:
-                self.dataset.update_status("Finished, but no nouns were extracted.")
-                self.dataset.finish(0)
-
-    @classmethod
-    def get_options(cls, parent_dataset=None, user=None):
-        """
-        Get processor options
-
-        The feature of this processor that overwrites the parent dataset can
-        only work properly on csv datasets so check the extension before
-        showing it.
-
-        :param user:
-        :param parent_dataset:  Dataset to get options for
-        :return dict:
-        """
-        options = cls.options
-        if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]:
-            options["overwrite"] = {
-                "type": UserInput.OPTION_TOGGLE,
-                "default": False,
-                "help": "Add extracted nouns to source csv",
-                "tooltip": "Will add a column (\"nouns\", \"nouns_and_compounds\", or \"noun_chunks\") with the found "
-                           "nouns in the post row."
-            }
-
-        return options
diff --git a/processors/text-analysis/linguistic_extractor.py b/processors/text-analysis/linguistic_extractor.py
deleted file mode 100644
index 92357853a..000000000
--- a/processors/text-analysis/linguistic_extractor.py
+++ /dev/null
@@ -1,168 +0,0 @@
-"""
-Extract linguistic features from text using SpaCy.
-
-"""
-import zipfile
-import pickle
-import re
-
-import spacy
-from spacy.tokens import DocBin
-from spacy.tokenizer import Tokenizer
-from spacy.util import compile_prefix_regex, compile_suffix_regex
-
-from common.lib.helpers import UserInput
-from common.lib.exceptions import ProcessorInterruptedException
-from backend.lib.processor import BasicProcessor
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen", "Stijn Peeters"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class LinguisticFeatures(BasicProcessor):
-    """
-    Annotate text with linguistic features using SpaCy
-    """
-    type = "linguistic-features"  # job type ID
-    category = "Text analysis"  # category
-    title = "Annotate text features with SpaCy"  # title displayed in UI
-    description = "Annotate your text with a variety of linguistic features using the SpaCy library, " \
-                  "including part-of-speech tagging, dependency parsing, and named entity recognition. " \
-                  "Subsequent processors can extract the words labelled by SpaCy (e.g. as a noun or name). " \
-                  "Produces a Doc file using the en_core_web_sm model. Currently only available for datasets " \
-                  "with fewer than 100,000 items. "  # description displayed in UI
-    extension = "zip"  # extension of result file, used internally and in UI
-
-    followups = ["get-entities", "extract-nouns"]
-
-    references = [
-        "[SpaCy Linguistic Features - Documentation](https://spacy.io/usage/linguistic-features/)"
-    ]
-
-    options = {
-        "enable": {
-            "type": UserInput.OPTION_MULTI,
-            "default": [],
-            "options": {
-                "tagger": "Part-of-speech tagging: Tag the grammatical function of words, like nouns and verbs",
-                "parser": "Dependency parsing: Extract how words in a sentence relate to each other",
-                "ner": "Named entity recognition: Annotate what kind of objects appear in a sentence (e.g. Apple -> Organisation)"
-            },
-            "help": "What linguistic features to extract. Without any of these selected, it simply saves the SpaCy docs (tokenised sentences) as a serialized file. See references for more information."
-        }
-    }
-
-    @classmethod
-    def is_compatible_with(cls, module=None, user=None):
-        """
-        Allow CSV and NDJSON datasets
-        """
-        return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson")
-
-    def process(self):
-        """
-        Reads the source text and annotates it with the selected linguistic features.
-        """
-
-        # prepare staging area
-        staging_area = self.dataset.get_staging_area()
-
-        self.dataset.update_status("Preparing data")
-
-        # go through all archived token sets and vectorise them
-        results = []
-
-        # Load the spacy goods
-        nlp = spacy.load("en_core_web_sm")
-        nlp.tokenizer = self.custom_tokenizer(nlp)  # Keep words with a dash in between
-
-        # Disable what has _not_ been selected
-        options = ["parser", "tagger", "ner"]
-        enable = self.parameters.get("enable", False)
-
-        if not enable:
-            self.dataset.update_status("Select at least one of the options.")
-            self.dataset.finish(0)
-            return
-
-        disable = [option for option in options if option not in enable]
-
-        # Get all ze text first so we can process it in batches
-        posts = []
-        for post in self.source_dataset.iterate_items(self):
-            if post.get("body", ""):
-                if len(post["body"]) > 1000000:
-                    body = post["body"][:1000000]
-                else:
-                    body = post["body"]
-                posts.append(body)
-            else:
-                self.dataset.log('Warning: Post %s has no body from which to extract entities' % post.get('id'))
-                posts.append("")
-
-        # Process the text in batches
-        if len(posts) < 100000:
-            self.dataset.update_status("Extracting linguistic features")
-        else:
-            self.dataset.update_status(
-                "Extracting linguistic features is currently only available for datasets with fewer than 100,000 items.")
-            self.dataset.finish(0)
-            return
-
-        # Make sure only the needed information is extracted.
-        attrs = []
-        if "tagger" not in disable:
-            attrs.append("POS")
-        if "parser" not in disable:
-            attrs.append("DEP")
-        if "ner" not in disable:
-            attrs.append("ENT_IOB")
-            attrs.append("ENT_TYPE")
-            attrs.append("ENT_ID")
-            attrs.append("ENT_KB_ID")
-
-        # DocBin for quick saving
-        doc_bin = DocBin(attrs=attrs)
-
-        # Start the processing!
-        try:
-            for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
-                doc_bin.add(doc)
-
-                # It's quite a heavy process, so make sure it can be interrupted
-                if self.interrupted:
-                    raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file")
-
-                if i % 1000 == 0:
-                    self.dataset.update_status("Done with post %s out of %s" % (i, len(posts)))
-        except MemoryError:
-            self.dataset.update_status("Out of memory. The dataset may be too large to process. Try again with a smaller dataset.", is_final=True)
-            return
-
-        self.dataset.update_status("Serializing results - this will take a while")
-
-        # Then serialize the NLP docs and the vocab
-        doc_bytes = doc_bin.to_bytes()
-
-        # Dump ze data in a temporary folder
-        with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
-            pickle.dump(doc_bytes, outputfile)
-
-        # create zip of archive and delete temporary files and folder
-        self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA)
-
-    def custom_tokenizer(self, nlp):
-        """
-        Custom tokeniser that does not split on dashes.
-        Useful for names (e.g. Hennis-Plasschaert).
-        """
-        infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
-        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
-        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
-
-        return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
-                         suffix_search=suffix_re.search,
-                         infix_finditer=infix_re.finditer,
-                         token_match=None)
diff --git a/setup.py b/setup.py
index e62f292ba..17079a887 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,6 @@
     "scikit-learn",
    "scipy==1.10.1",
    "shapely",
-   "spacy==3.7.2",
    "svgwrite~=1.4.0",
    "tailer",
    "Telethon~=1.36.0",
@@ -64,8 +63,7 @@
     "imagedominantcolor @ git+https://github.com/dale-wahl/imagedominantcolor.git@pillow10",
    "videohash @ git+https://github.com/dale-wahl/videohash@main",
    "vk_api",
-   "yt-dlp",
-   "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz#egg=en_core_web_sm"
+   "yt-dlp"
 ]

 # Some packages don't run on Windows
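Note for users of the removed processors: named entity extraction can still be reproduced outside 4CAT with spaCy directly. The snippet below is a minimal sketch, not part of this patch; it assumes spacy and the en_core_web_sm model are installed separately (both are dropped from setup.py above), and the input text and label set are made-up examples.

# Standalone sketch of the entity extraction the removed get-entities processor provided.
# Assumes: pip install spacy && python -m spacy download en_core_web_sm
from collections import Counter

import spacy

nlp = spacy.load("en_core_web_sm")

texts = ["Tim Cook presented Apple's new products in Cupertino."]  # example input
wanted = {"PERSON", "ORG", "GPE"}  # entity labels to keep

entity_counts = Counter()
for doc in nlp.pipe(texts):
    for ent in doc.ents:
        if ent.label_ in wanted:
            # lower-case the surface form so variants group together, as the processor did
            entity_counts[(ent.text.lower(), ent.label_)] += 1

for (word, label), count in entity_counts.most_common():
    print(word, label, count)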