Skip to content

Commit

Permalink
New 'Bipartite image-item network' processor
Browse files Browse the repository at this point in the history
  • Loading branch information
stijn-uva committed Oct 23, 2024
1 parent 53821d1 commit f98addc
Showing 1 changed file with 146 additions and 0 deletions.
146 changes: 146 additions & 0 deletions processors/networks/image-network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""
Make a bipartite Image-Item network
"""
import json

from backend.lib.processor import BasicProcessor

import networkx as nx

# Module authorship metadata
# NOTE(review): the e-mail value below looks like an address-obfuscation
# placeholder rather than a real address - confirm and restore the original
__author__ = "Stijn Peeters"
__credits__ = ["Stijn Peeters"]
__maintainer__ = "Stijn Peeters"
__email__ = "[email protected]"

from common.lib.exceptions import ProcessorInterruptedException
from common.lib.user_input import UserInput


class ImageGrapher(BasicProcessor):
    """
    Image network

    Creates a bipartite network of images and some attribute of the dataset the
    images were sourced from. One node type represents values of the chosen
    dataset field, the other represents images; an edge connects a field value
    to each image linked to an item with that value.
    """
    type = "image-bipartite-network"  # job type ID
    category = "Networks"
    title = "Bipartite image-item network"  # title displayed in UI
    description = ("Create a GEXF network file with a bipartite network of "
                   "images and some data field (e.g. author) of the dataset "
                   "the images were sourced from. Suitable for use with Gephi's "
                   "'Image Preview' plugin.")
    extension = "gexf"  # extension of result file, used internally and in UI

    options = {}

    @staticmethod
    def _find_root_dataset(dataset):
        """
        Find the dataset in the genealogy that items should be read from

        Walks the genealogy of the given dataset in reverse order and returns
        the first dataset that reports any columns.

        :param dataset:  Dataset to inspect the genealogy of
        :return:  The first dataset with columns, or `None` if there is none
        """
        for ancestor in reversed(dataset.get_genealogy()):
            if ancestor.get_columns():
                return ancestor

        return None

    @classmethod
    def get_options(cls, parent_dataset=None, user=None):
        """
        Get processor options

        The dataset field to use is a free text field by default. If a parent
        dataset is given and a dataset with columns can be found in its
        genealogy, the field becomes a choice between those columns instead.

        :param parent_dataset:  Dataset the processor would be run on
        :param user:  Unused by this processor
        :return dict:  Processor options
        """
        options = {
            "column": {
                "help": "Dataset field",
                "type": UserInput.OPTION_TEXT,
                "default": "id"
            },
            "image-value": {
                "help": "Image node label",
                "type": UserInput.OPTION_CHOICE,
                "options": {
                    "filename": "Image file name",
                    "url": "Image URL"
                },
                "tooltip": "The image node label will have this value. Depending on the network visualisation software "
                           "you use, one or the other is required to display the images as nodes."
            }
        }

        # there may be no dataset with columns in the genealogy; in that case
        # keep the free text field rather than failing on a missing dataset
        root_dataset = cls._find_root_dataset(parent_dataset) if parent_dataset else None
        columns = root_dataset.get_columns() if root_dataset else None

        if columns:
            options["column"] = {
                "help": "Dataset field",
                "type": UserInput.OPTION_CHOICE,
                "options": {column: column for column in columns}
            }

        return options

    @classmethod
    def is_compatible_with(cls, module=None, user=None):
        """
        Allow processor to run on images downloaded from a dataset

        :param module: Module to determine compatibility with
        :param user:  Unused by this processor
        :return bool:  Whether the module type starts with `image-downloader`
        """
        return module.type.startswith("image-downloader")

    def process(self):
        """
        Build the network and write it to the result file as GEXF

        Reads the `.metadata.json` file from the source archive to find out
        which images belong to which item IDs, then iterates the items of the
        root dataset and links the value of the chosen field to every image
        of each item. Edges carry a `frequency` attribute counting how often
        the combination was seen. Finishes the dataset with an error if no
        usable metadata or no suitable parent dataset is found.
        """
        column = self.parameters.get("column")
        # anything other than "url" labels image nodes with the file name
        label_with_url = self.parameters.get("image-value") == "url"

        metadata = None
        for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]):
            with file.open() as infile:
                try:
                    metadata = json.load(infile)
                except json.JSONDecodeError:
                    # unreadable metadata is treated the same as missing
                    # metadata, which is reported below
                    pass

        if not metadata or not isinstance(metadata, dict):
            return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only "
                                                  "be run on sets of images sourced from another 4CAT dataset.")

        # map item IDs to *all* images they are linked to; an item can occur
        # in the metadata of more than one image
        item_images = {}
        for url, details in metadata.items():
            image_label = url if label_with_url else details.get("filename")
            if not image_label:
                # nothing to label the image node with
                continue

            for item_id in details.get("post_ids") or []:
                images = item_images.setdefault(item_id, [])
                if image_label not in images:
                    images.append(image_label)

        root_dataset = self._find_root_dataset(self.dataset)
        if not root_dataset:
            return self.dataset.finish_with_error("No suitable parent dataset found - this processor can only "
                                                  "be run on sets of images sourced from another 4CAT dataset.")

        network = nx.DiGraph()
        processed = 0
        for item in root_dataset.iterate_items():
            self.dataset.update_progress(processed / root_dataset.num_rows)
            processed += 1
            if processed % 100 == 0:
                self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)")

            if self.interrupted:
                raise ProcessorInterruptedException()

            item_id = item.get("id")
            if item_id not in item_images:
                continue

            from_node_label = item.get(column)
            # prefix node IDs so field values and image labels cannot clash
            from_node = f"{column}-{from_node_label}"
            if from_node not in network.nodes:
                network.add_node(from_node, label=from_node_label, category=column)

            for to_node_label in item_images[item_id]:
                to_node = f"image-{to_node_label}"
                if to_node not in network.nodes:
                    network.add_node(to_node, label=to_node_label, category="image", image=to_node_label)

                if not network.has_edge(from_node, to_node):
                    network.add_edge(from_node, to_node, frequency=0)

                network.edges[from_node, to_node]["frequency"] += 1

        self.dataset.update_status("Writing network file")
        nx.write_gexf(network, self.dataset.get_results_path())
        self.dataset.finish(len(network.nodes))

0 comments on commit f98addc

Please sign in to comment.