From 48c20c2ebac382bd41b92da4481ff7d832dc1538 Mon Sep 17 00:00:00 2001
From: Desktop Sal
Date: Wed, 11 Sep 2024 13:54:23 +0200
Subject: [PATCH] Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies

---
 processors/text-analysis/get_entities.py  | 172 ---------------
 processors/text-analysis/get_nouns.py     | 196 ------------------
 .../text-analysis/linguistic_extractor.py | 168 ---------------
 setup.py                                  |   4 +-
 4 files changed, 1 insertion(+), 539 deletions(-)
 delete mode 100644 processors/text-analysis/get_entities.py
 delete mode 100644 processors/text-analysis/get_nouns.py
 delete mode 100644 processors/text-analysis/linguistic_extractor.py

diff --git a/processors/text-analysis/get_entities.py b/processors/text-analysis/get_entities.py
deleted file mode 100644
index e639c7672..000000000
--- a/processors/text-analysis/get_entities.py
+++ /dev/null
@@ -1,172 +0,0 @@
-"""
-Extract named entities from SpaCy NLP docs.
-
-"""
-import pickle
-import spacy
-
-from collections import Counter
-from spacy.tokens import DocBin
-from common.lib.helpers import UserInput
-from backend.lib.processor import BasicProcessor
-from common.lib.exceptions import ProcessorInterruptedException
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class ExtractNouns(BasicProcessor):  # TEMPORARILY DISABLED
-    """
-    Extract named entities from SpaCy docs and rank them by frequency
-    """
-    type = "get-entities"  # job type ID
-    category = "Text analysis"  # category
-    title = "Extract named entities"  # title displayed in UI
-    description = "Retrieve named entities detected by SpaCy, ranked on frequency. Be sure to have selected " \
-                  "\"Named Entity Recognition\" in the previous module."  # description displayed in UI
-    extension = "csv"  # extension of result file, used internally and in UI
-
-    followups = ["wordcloud"]
-
-    options = {
-        "entities": {
-            "type": UserInput.OPTION_MULTI,
-            "default": [],
-            "options": {
-                "PERSON": "PERSON: People, including fictional.",
-                "NORP": "NORP: Nationalities or religious or political groups.",
-                "FAC": "FAC: Buildings, airports, highways, bridges, etc.",
-                "ORG": "ORG: Companies, agencies, institutions, etc.",
-                "GPE": "GPE: Countries, cities, states.",
-                "LOC": "LOC: Non-GPE locations, mountain ranges, bodies of water.",
-                "PRODUCT": "PRODUCT: Objects, vehicles, foods, etc. (Not services.)",
-                "EVENT": "EVENT: Named hurricanes, battles, wars, sports events, etc.",
-                "WORK_OF_ART": "WORK_OF_ART: Titles of books, songs, etc.",
-                "LAW": "LAW: Named documents made into laws.",
-                "LANGUAGE": "LANGUAGE: Any named language.",
-                "DATE": "DATE: Absolute or relative dates or periods.",
-                "TIME": "TIME: Times smaller than a day.",
-                "PERCENT": "PERCENT: Percentage, including “%”.",
-                "MONEY": "MONEY: Monetary values, including unit.",
-                "QUANTITY": "QUANTITY: Measurements, as of weight or distance.",
-                "ORDINAL": "ORDINAL: “first”, “second”, etc.",
-                "CARDINAL": "CARDINAL: Numerals that do not fall under another type."
-            },
-            "help": "What types of entities to extract (select at least one)",
-            "tooltip": "The above list is derived from the SpaCy documentation (see references)."
-        }
-    }
-
-    references = [
-        "[SpaCy named entities list](https://spacy.io/api/annotation#named-entities)"
-    ]
-
-    @classmethod
-    def is_compatible_with(cls, module=None, user=None):
-        """
-        Allow processor on linguistic feature data
-
-        :param module: Module to determine compatibility with
-        """
-
-        return module.type == "linguistic-features"
-
-    def process(self):
-        """
-        Opens the SpaCy output and gets ze entities.
-
-        """
-
-        # Validate whether the user enabled the right parameters.
-        if "ner" not in self.source_dataset.parameters["enable"]:
-            self.dataset.update_status("Enable \"Named entity recognition\" in previous module")
-            self.dataset.finish(0)
-            return
-
-        else:
-            # Extract the SpaCy docs first
-            self.dataset.update_status("Unzipping SpaCy docs")
-
-            # Store all the entities in this list
-            li_entities = []
-            nlp = spacy.load("en_core_web_sm")  # Load model
-
-            for doc_file in self.iterate_archive_contents(self.source_file):
-                with doc_file.open("rb") as pickle_file:
-                    # Load DocBin
-                    file = pickle.load(pickle_file)
-                    doc_bin = DocBin().from_bytes(file)
-                    docs = list(doc_bin.get_docs(nlp.vocab))
-
-                for doc in docs:
-                    post_entities = []
-
-                    # stop processing if worker has been asked to stop
-                    if self.interrupted:
-                        raise ProcessorInterruptedException("Interrupted while processing documents")
-
-                    for ent in doc.ents:
-                        if ent.label_ in self.parameters["entities"]:
-                            post_entities.append((ent.text, ent.label_))  # Add a tuple
-
-                    li_entities.append(post_entities)
-
-            results = []
-
-            if li_entities:
-
-                # Also add the data to the original file, if indicated.
-                if self.parameters.get("overwrite"):
-                    self.add_field_to_parent(field_name='named_entities',
-                                             # Format like "Apple:ORG, Gates:PERSON, ..." and add to the row
-                                             new_data=[", ".join([":".join(pair) for pair in post_entities]) for post_entities in li_entities],
-                                             which_parent=self.dataset.top_parent(),
-                                             update_existing=True)
-
-                all_entities = []
-                # Convert to lower and filter out one-letter words. Join the words with the entities so we can group easily.
-                for post_ents in li_entities:
-                    for pair in post_ents:
-                        if pair and len(pair[0]) > 1:
-                            pair = pair[0].lower() + " |#| " + pair[1]
-                            all_entities.append(pair)
-
-                # Group and rank
-                count_entities = Counter(all_entities).most_common()
-                # Unsplit and list the count.
-                results = [{"word": tpl[0].split(" |#| ")[0], "entity": tpl[0].split(" |#| ")[1], "count": tpl[1]} for
-                           tpl in count_entities]
-
-        # done!
-        if results:
-            self.dataset.update_status("Finished")
-            self.write_csv_items_and_finish(results)
-        else:
-            self.dataset.update_status("Finished, but no entities were extracted.")
-            self.dataset.finish(0)
-
-    @classmethod
-    def get_options(cls, parent_dataset=None, user=None):
-        """
-        Get processor options
-
-        The feature of this processor that overwrites the parent dataset can
-        only work properly on csv datasets so check the extension before
-        showing it.
-
-        :param user:
-        :param parent_dataset:  Dataset to get options for
-        :return dict:
-        """
-        options = cls.options
-        if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]:
-            options["overwrite"] = {
-                "type": UserInput.OPTION_TOGGLE,
-                "default": False,
-                "help": "Add extracted named entities to source csv",
-                "tooltip": "Will add a column (\"named_entities\") with the found entities in the post row."
-            }
-
-        return options
diff --git a/processors/text-analysis/get_nouns.py b/processors/text-analysis/get_nouns.py
deleted file mode 100644
index cad8653eb..000000000
--- a/processors/text-analysis/get_nouns.py
+++ /dev/null
@@ -1,196 +0,0 @@
-"""
-Extract nouns from SpaCy NLP docs.
-
-"""
-import pickle
-import spacy
-
-from collections import Counter
-from spacy.tokens import DocBin
-from common.lib.helpers import UserInput
-from backend.lib.processor import BasicProcessor
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class ExtractNouns(BasicProcessor):
-    """
-    Extract nouns from SpaCy docs and rank them by frequency
-    """
-    type = "extract-nouns"  # job type ID
-    category = "Text analysis"  # category
-    title = "Extract nouns"  # title displayed in UI
-    description = "Retrieve nouns detected by SpaCy's part-of-speech tagging, and rank by frequency. " \
-                  "Make sure to have selected \"Part of Speech\" in the previous " \
-                  "module, as well as \"Dependency parsing\" if you want to extract compound nouns or noun chunks."  # description displayed in UI
-    extension = "csv"  # extension of result file, used internally and in UI
-
-    references = ["[Information on noun chunks](https://spacy.io/usage/linguistic-features#noun-chunks)"]
-
-    followups = ["wordcloud"]
-
-    options = {
-        "type": {
-            "type": UserInput.OPTION_CHOICE,
-            "default": "nouns",
-            "options": {
-                "nouns": "Single-word nouns",
-                "nouns_and_compounds": "Nouns and compound nouns",
-                "noun_chunks": "Noun chunks"
-            },
-            "help": "What to extract: 1) separate words tagged as nouns, 2) nouns and compound nouns " \
-                    "(nouns consisting of multiple words, e.g. \"United States\") using a custom parser, or 3) noun chunks: " \
-                    "nouns plus the words describing them, e.g. \"the old grandpa\". See the references for more info."
-        }
-    }
-
-    @classmethod
-    def is_compatible_with(cls, module=None, user=None):
-        """
-        Allow processor on linguistic feature data
-
-        :param module: Module to determine compatibility with
-        """
-        return module.type == "linguistic-features"
-
-    def process(self):
-        """
-        Opens the SpaCy output and gets ze nouns.
-
-        """
-        noun_type = self.parameters["type"]
-
-        # Validate whether the user enabled the right parameters.
-        # Check part of speech tagging
-        if "tagger" not in self.source_dataset.parameters["enable"]:
-            self.dataset.update_status("Enable \"Part-of-speech tagging\" in previous module")
-            self.dataset.finish(0)
-
-        # Check dependency parsing if compound nouns or noun chunks are selected
-        elif (noun_type == "nouns_and_compounds" or noun_type == "noun_chunks") and "parser" not in \
-                self.source_dataset.parameters["enable"]:
-            self.dataset.update_status(
-                "Enable \"Part-of-speech tagging\" and \"Dependency parsing\" for compound nouns/noun chunks in previous module")
-            self.dataset.finish(0)
-
-        # Valid parameters
-        else:
-
-            # Extract the SpaCy docs first
-            self.dataset.update_status("Unzipping SpaCy docs")
-            self.dataset.update_status("Extracting nouns")
-
-            # Store all the nouns in this list
-            li_nouns = []
-            nlp = spacy.load("en_core_web_sm")  # Load model
-
-            for doc_file in self.iterate_archive_contents(self.source_file):
-                with doc_file.open("rb") as pickle_file:
-                    # Load DocBin
-                    file = pickle.load(pickle_file)
-                    doc_bin = DocBin().from_bytes(file)
-                    docs = list(doc_bin.get_docs(nlp.vocab))
-
-                # Simply add each word if its POS is "NOUN"
-                if noun_type == "nouns":
-                    for doc in docs:
-                        post_nouns = []
-                        post_nouns += [token.text for token in doc if token.pos_ == "NOUN"]
-                        li_nouns.append(post_nouns)
-
-                # Use SpaCy's noun chunk detection
-                elif noun_type == "noun_chunks":
-
-                    for doc in docs:
-
-                        # Note: this is a workaround for now.
-                        # Serialization of the SpaCy docs does not
-                        # work well with dependency parsing after
-                        # loading. Quick fix: parse again.
-
-                        new_doc = nlp(doc.text)
-                        post_nouns = []
-                        for chunk in new_doc.noun_chunks:
-                            post_nouns.append(chunk.text)
-
-                        li_nouns.append(post_nouns)
-
-                # Use a custom script to get single nouns and compound nouns
-                elif noun_type == "nouns_and_compounds":
-                    for doc in docs:
-                        post_nouns = []
-                        noun = ""
-
-                        for i, token in enumerate(doc):
-
-                            # Check for common nouns (general, e.g. "people")
-                            # and proper nouns (specific, e.g. "London")
-                            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
-                                # Check if the token is part of a noun chunk
-                                if token.dep_ == "compound":  # Check for a compound relation
-                                    noun = token.text
-                                else:
-                                    if noun:
-                                        noun += " " + token.text
-                                        post_nouns.append(noun)
-                                        noun = ""
-                                    else:
-                                        post_nouns.append(token.text)
-                        li_nouns.append(post_nouns)
-
-            results = []
-
-            if li_nouns:
-
-                # Also add the data to the original file, if indicated.
-                if self.parameters.get("overwrite"):
-                    self.add_field_to_parent(field_name=noun_type,
-                                             # Format like "apple, gates, ..." and add to the row
-                                             new_data=[", ".join([post_noun.lower() for post_noun in li_noun if len(post_noun) > 1]) for li_noun in li_nouns],
-                                             which_parent=self.dataset.top_parent())
-
-                # convert to lower and filter out one-letter words
-                all_nouns = []
-                for post_n in li_nouns:
-                    all_nouns += [str(cap_noun).lower() for cap_noun in post_n if len(cap_noun) > 1]
-
-                # Group and rank
-                count_nouns = Counter(all_nouns).most_common()
-                results = [{"word": tpl[0], "count": tpl[1]} for tpl in count_nouns]
-
-            # done!
-            if results:
-                self.dataset.update_status("Finished")
-                self.write_csv_items_and_finish(results)
-            else:
-                self.dataset.update_status("Finished, but no nouns were extracted.")
-                self.dataset.finish(0)
-
-    @classmethod
-    def get_options(cls, parent_dataset=None, user=None):
-        """
-        Get processor options
-
-        The feature of this processor that overwrites the parent dataset can
-        only work properly on csv datasets so check the extension before
-        showing it.
-
-        :param user:
-        :param parent_dataset:  Dataset to get options for
-        :return dict:
-        """
-        options = cls.options
-        if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]:
-            options["overwrite"] = {
-                "type": UserInput.OPTION_TOGGLE,
-                "default": False,
-                "help": "Add extracted nouns to source csv",
-                "tooltip": "Will add a column (\"nouns\", \"nouns_and_compounds\", or \"noun_chunks\") with the found "
-                           "nouns in the post row."
-            }
-
-        return options
diff --git a/processors/text-analysis/linguistic_extractor.py b/processors/text-analysis/linguistic_extractor.py
deleted file mode 100644
index 92357853a..000000000
--- a/processors/text-analysis/linguistic_extractor.py
+++ /dev/null
@@ -1,168 +0,0 @@
-"""
-Extract linguistic features from text using SpaCy.
-
-"""
-import zipfile
-import pickle
-import re
-
-import spacy
-from spacy.tokens import DocBin
-from spacy.tokenizer import Tokenizer
-from spacy.util import compile_prefix_regex, compile_suffix_regex
-
-from common.lib.helpers import UserInput
-from common.lib.exceptions import ProcessorInterruptedException
-from backend.lib.processor import BasicProcessor
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen", "Stijn Peeters"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class LinguisticFeatures(BasicProcessor):
-    """
-    Annotate text with linguistic features using SpaCy
-    """
-    type = "linguistic-features"  # job type ID
-    category = "Text analysis"  # category
-    title = "Annotate text features with SpaCy"  # title displayed in UI
-    description = "Annotate your text with a variety of linguistic features using the SpaCy library, " \
-                  "including part-of-speech tagging, dependency parsing, and named entity recognition. " \
-                  "Subsequent processors can extract the words labelled by SpaCy (e.g. as a noun or name). " \
-                  "Produces a Doc file using the en_core_web_sm model. Currently only available for datasets " \
-                  "with fewer than 100,000 items. "  # description displayed in UI
-    extension = "zip"  # extension of result file, used internally and in UI
-
-    followups = ["get-entities", "extract-nouns"]
-
-    references = [
-        "[SpaCy Linguistic Features - Documentation](https://spacy.io/usage/linguistic-features/)"
-    ]
-
-    options = {
-        "enable": {
-            "type": UserInput.OPTION_MULTI,
-            "default": [],
-            "options": {
-                "tagger": "Part-of-speech tagging: Tag the grammatical function of words, like nouns and verbs",
-                "parser": "Dependency parsing: Extract how words in a sentence relate to each other",
-                "ner": "Named entity recognition: Annotate what kind of objects appear in a sentence (e.g. Apple -> Organisation)"
-            },
-            "help": "What linguistic features to extract. Without any of these selected, it simply saves the SpaCy docs (tokenised sentences) as a serialized file. See references for more information."
-        }
-    }
-
-    @classmethod
-    def is_compatible_with(cls, module=None, user=None):
-        """
-        Allow CSV and NDJSON datasets
-        """
-        return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson")
-
-    def process(self):
-        """
-        Reads the source text and annotates it with the selected linguistic features.
-        """
-
-        # prepare staging area
-        staging_area = self.dataset.get_staging_area()
-
-        self.dataset.update_status("Preparing data")
-
-        # go through all archived token sets and vectorise them
-        results = []
-
-        # Load the spacy goods
-        nlp = spacy.load("en_core_web_sm")
-        nlp.tokenizer = self.custom_tokenizer(nlp)  # Keep words with a dash in between
-
-        # Disable what has _not_ been selected
-        options = ["parser", "tagger", "ner"]
-        enable = self.parameters.get("enable", False)
-
-        if not enable:
-            self.dataset.update_status("Select at least one of the options.")
-            self.dataset.finish(0)
-            return
-
-        disable = [option for option in options if option not in enable]
-
-        # Get all ze text first so we can process it in batches
-        posts = []
-        for post in self.source_dataset.iterate_items(self):
-            if post.get("body", ""):
-                if len(post["body"]) > 1000000:
-                    body = post["body"][:1000000]
-                else:
-                    body = post["body"]
-                posts.append(body)
-            else:
-                self.dataset.log('Warning: Post %s has no body from which to extract entities' % post.get('id'))
-                posts.append("")
-
-        # Process the text in batches
-        if len(posts) < 100000:
-            self.dataset.update_status("Extracting linguistic features")
-        else:
-            self.dataset.update_status(
-                "Extracting linguistic features is currently only available for datasets with fewer than 100,000 items.")
-            self.dataset.finish(0)
-            return
-
-        # Make sure only the needed information is extracted.
-        attrs = []
-        if "tagger" not in disable:
-            attrs.append("POS")
-        if "parser" not in disable:
-            attrs.append("DEP")
-        if "ner" not in disable:
-            attrs.append("ENT_IOB")
-            attrs.append("ENT_TYPE")
-            attrs.append("ENT_ID")
-            attrs.append("ENT_KB_ID")
-
-        # DocBin for quick saving
-        doc_bin = DocBin(attrs=attrs)
-
-        # Start the processing!
-        try:
-            for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
-                doc_bin.add(doc)
-
-                # It's quite a heavy process, so make sure it can be interrupted
-                if self.interrupted:
-                    raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file")
-
-                if i % 1000 == 0:
-                    self.dataset.update_status("Done with post %s out of %s" % (i, len(posts)))
-        except MemoryError:
-            self.dataset.update_status("Out of memory. The dataset may be too large to process. Try again with a smaller dataset.", is_final=True)
-            return
-
-        self.dataset.update_status("Serializing results - this will take a while")
-
-        # Then serialize the NLP docs and the vocab
-        doc_bytes = doc_bin.to_bytes()
-
-        # Dump ze data in a temporary folder
-        with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
-            pickle.dump(doc_bytes, outputfile)
-
-        # create zip of archive and delete temporary files and folder
-        self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA)
-
-    def custom_tokenizer(self, nlp):
-        """
-        Custom tokeniser that does not split on dashes.
-        Useful for names (e.g. Hennis-Plasschaert).
-        """
-        infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
-        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
-        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
-
-        return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
-                         suffix_search=suffix_re.search,
-                         infix_finditer=infix_re.finditer,
-                         token_match=None)
diff --git a/setup.py b/setup.py
index e62f292ba..17079a887 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,6 @@
     "scikit-learn",
    "scipy==1.10.1",
    "shapely",
-   "spacy==3.7.2",
    "svgwrite~=1.4.0",
    "tailer",
    "Telethon~=1.36.0",
@@ -64,8 +63,7 @@
     "imagedominantcolor @ git+https://github.com/dale-wahl/imagedominantcolor.git@pillow10",
    "videohash @ git+https://github.com/dale-wahl/videohash@main",
    "vk_api",
-   "yt-dlp",
-   "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz#egg=en_core_web_sm"
+   "yt-dlp"
 ]

 # Some packages don't run on Windows
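Note for users of the removed processors: named entity extraction can still be reproduced outside 4CAT with spaCy directly. The snippet below is a minimal sketch, not part of this patch; it assumes spacy and the en_core_web_sm model are installed separately (both are dropped from setup.py above), and the input text and label set are made-up examples.

# Standalone sketch of the entity extraction the removed get-entities processor provided.
# Assumes: pip install spacy && python -m spacy download en_core_web_sm
from collections import Counter

import spacy

nlp = spacy.load("en_core_web_sm")

texts = ["Tim Cook presented Apple's new products in Cupertino."]  # example input
wanted = {"PERSON", "ORG", "GPE"}  # entity labels to keep

entity_counts = Counter()
for doc in nlp.pipe(texts):
    for ent in doc.ents:
        if ent.label_ in wanted:
            # lower-case the surface form so variants group together, as the processor did
            entity_counts[(ent.text.lower(), ent.label_)] += 1

for (word, label), count in entity_counts.most_common():
    print(word, label, count)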