From 4205610ffcfb80c3c7e0f35150f5ec1f784a1af4 Mon Sep 17 00:00:00 2001 From: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Wed, 5 Feb 2025 13:22:45 +0100 Subject: [PATCH] Pinterest data source (#478) * Pinterest data source * Upd8s * Squashed commit of the following: commit 513a58986cc518bb42c59eb4fb947e2f83eb3c43 Author: Dale Wahl Date: Mon Jan 27 17:01:52 2025 +0100 bsky: ensure interrupt commit 0badee73d919aece92134fc942540c83294861f0 Author: Dale Wahl Date: Mon Jan 27 15:19:42 2025 +0100 bsky: no progress bar if no max_posts commit 115a3c155f1c5edbce3d6c62c1fbbf66fa6a51ba Author: Dale Wahl Date: Mon Jan 27 14:18:12 2025 +0100 bsky datasource commit 836a23531e3cbbbc5cd5d8e2a195b83bba0211ae Author: Dale Wahl Date: Thu Jan 23 11:47:51 2025 +0100 post_topic_matrix: rename column when tokenizer created multiple documents per post commit 977d887ccf99d38c4971876755538d54af92e577 Author: Dale Wahl Date: Thu Jan 23 11:37:52 2025 +0100 rank_attribute: convert to str to lower() commit a1cdd4ce49c70f38e99ff71b747c55e3819ad56b Author: Dale Wahl Date: Wed Jan 22 09:40:29 2025 +0100 fix: allow None for columns.default; remove debug log statement fix occasional error that appears particularly on new processors with no expected default, i.e.: TypeError: argument of type 'NoneType' is not iterable * Handle both JSON and HTML-sourced Pinterest data * Squashed commit of the following: commit dd2ab725901e4f967d512d896e7c4e18de6905e8 Author: Stijn Peeters Date: Tue Feb 4 18:16:59 2025 +0100 Highlight missing fields in CSV preview commit 204ab8a157ae7cbd306430ecc9b8b91f0484dcfe Author: Stijn Peeters Date: Tue Feb 4 18:16:47 2025 +0100 Add a 'missing fields' key to mapped dataset items commit 11457e0dd19426117d082c8f576afd5a2542410e Author: Stijn Peeters Date: Tue Feb 4 18:16:26 2025 +0100 Add a 'missing_fields' column to mapped objects commit a3e4f77e0f420e4335def18482e3c82606cc1047 Author: Stijn Peeters Date: Tue Feb 4 18:15:06 2025 +0100 Prevent tooltips from 
falling (partially) outside the viewport commit 16be136c8c394570990f592860b42f263a058907 Author: Dale Wahl Date: Tue Feb 4 17:18:27 2025 +0100 docker build action possible fix commit b3303398cc2ab4e911870ff6bc8a5c741739b879 Author: Stijn Peeters Date: Tue Feb 4 16:48:38 2025 +0100 Parse Markdown in dataset status commit 79cb297d53519106203a5583bdac37c14ec6114f Author: Stijn Peeters Date: Tue Feb 4 15:43:27 2025 +0100 Indicate whether like amount is hidden for Instagram posts commit 3c62f37f139df2e397de411421c39e6ac944c695 Author: Stijn Peeters Date: Tue Feb 4 15:43:00 2025 +0100 Do not consider missing geotags in Instagram posts 'missing' fields commit 6d3f9d4a0e252b139c9c3285a0fe785a8d36a923 Author: Dale Wahl Date: Tue Feb 4 13:45:47 2025 +0100 consolidate_urls: better logging/status, better url split commit a84c63b2d593164cb2d2902a78fa398dcb0e82c4 Author: Dale Wahl Date: Tue Feb 4 13:45:05 2025 +0100 revert 0983a3671399b23db67ccb1cc9dedf6a5364c469 commit bf7fe147cb1ea4b5b5326119eefa773ad073f502 Author: Dale Wahl Date: Tue Feb 4 12:36:26 2025 +0100 consolidate_urls: hide unused settings based on requirements commit ccaf114346f7837f8acff6bf43a984f6b4b3da77 Author: Dale Wahl Date: Tue Feb 4 12:27:57 2025 +0100 consolidate_urls: validate URL before parsing commit 638413a02ae3406b89e6b45a0b3e8ff8fb14d6fd Author: Dale Wahl Date: Tue Feb 4 11:08:17 2025 +0100 check results exist then delete; error message include dataset key when unable to delete sometimes log files are left behind because FileNotFoundError was raised on the results_path commit 0983a3671399b23db67ccb1cc9dedf6a5364c469 Author: Dale Wahl Date: Tue Feb 4 10:28:51 2025 +0100 possibly address github action build fail issue commit 855d34e9f69a802fe5833ef977ed8f9938ebddd0 Author: Stijn Peeters Date: Mon Feb 3 18:44:14 2025 +0100 Fix Gephi Lite link commit 4e5752de5e5d4ad99983631c254c868a745f274f Author: Stijn Peeters Date: Mon Feb 3 17:57:16 2025 +0100 Nicer numbers in network processor statuses commit 
1e0a24cf5938bfcf4bf6621a58144fb4c23ca9ac Author: Stijn Peeters Date: Mon Feb 3 17:43:20 2025 +0100 Never assume fields are non-null in Telegram data... commit 66d60e918e050bb85086408ea3f03863cdc3a186 Author: Stijn Peeters Date: Mon Feb 3 17:40:09 2025 +0100 Fix forward username mapping in some cases for Telegram commit 8034d1c12f06ddc8b5f098b72a37f74e7bef1c7c Merge: 59a15465 9bccdf14 Author: Stijn Peeters Date: Mon Feb 3 12:05:48 2025 +0100 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 59a15465a7d31a051a4918ac703674e14ada1b0c Author: Stijn Peeters Date: Mon Feb 3 12:05:44 2025 +0100 Add progress indicators to 'Count values' and 'Thread metadata' processors commit 9bccdf144b8e07b92f6de29d9ac08aeacef3f089 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Jan 31 13:05:29 2025 +0100 Update docker_latest.yml 6.13.0? commit e826283b0d3717ddaaca2209ba8c184fd3dd0251 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Jan 31 13:01:18 2025 +0100 Same but different commit 54d10cb5aa283e91b2a76b96cd1fbb0d0a8341b5 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Jan 31 12:59:41 2025 +0100 Update GitHub action to use latest docker commit 2600e55f4bb3297814bfb842ddeb0f8b2366a437 Author: Dale Wahl Date: Fri Jan 31 12:34:59 2025 +0100 python 3.11 for Docker Have been using this all winter and have had no issues. Enjoying the better error messages too. 
commit 513a58986cc518bb42c59eb4fb947e2f83eb3c43 Author: Dale Wahl Date: Mon Jan 27 17:01:52 2025 +0100 bsky: ensure interrupt commit 0badee73d919aece92134fc942540c83294861f0 Author: Dale Wahl Date: Mon Jan 27 15:19:42 2025 +0100 bsky: no progress bar if no max_posts commit 115a3c155f1c5edbce3d6c62c1fbbf66fa6a51ba Author: Dale Wahl Date: Mon Jan 27 14:18:12 2025 +0100 bsky datasource commit 836a23531e3cbbbc5cd5d8e2a195b83bba0211ae Author: Dale Wahl Date: Thu Jan 23 11:47:51 2025 +0100 post_topic_matrix: rename column when tokenizer created multiple documents per post commit 977d887ccf99d38c4971876755538d54af92e577 Author: Dale Wahl Date: Thu Jan 23 11:37:52 2025 +0100 rank_attribute: convert to str to lower() commit a1cdd4ce49c70f38e99ff71b747c55e3819ad56b Author: Dale Wahl Date: Wed Jan 22 09:40:29 2025 +0100 fix: allow None for columns.default; remove debug log statement fix occasional error that appears particularly on new processors with no expected default, i.e.: TypeError: argument of type 'NoneType' is not iterable --- datasources/pinterest/DESCRIPTION.md | 13 ++ datasources/pinterest/__init__.py | 12 ++ datasources/pinterest/search_pinterest.py | 142 ++++++++++++++++++++++ webtool/lib/template_filters.py | 19 +++ 4 files changed, 186 insertions(+) create mode 100644 datasources/pinterest/DESCRIPTION.md create mode 100644 datasources/pinterest/__init__.py create mode 100644 datasources/pinterest/search_pinterest.py diff --git a/datasources/pinterest/DESCRIPTION.md b/datasources/pinterest/DESCRIPTION.md new file mode 100644 index 000000000..3feddfaca --- /dev/null +++ b/datasources/pinterest/DESCRIPTION.md @@ -0,0 +1,13 @@ +The Pinterest data source can be used to manipulate data collected from [Pinterest](https://pinterest.com/) with +[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; 4CAT cannot collect data on its own. 
After collecting +data with Zeeschuimer it can be uploaded to 4CAT for further processing and analysis. See the Zeeschuimer documentation +for more information on how to collect data with it. + +Data is collected as it is formatted internally by Pinterest's website. Posts are stored as (large) JSON objects; it +will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure +is relatively straightforward and contains some data not included in the CSV exports. + +## Missing data + +Pinterest does not always include all metadata in its JSON objects; on some pages, the time the post was made is missing +from a post, for example. 4CAT will warn you about this when importing data. \ No newline at end of file diff --git a/datasources/pinterest/__init__.py b/datasources/pinterest/__init__.py new file mode 100644 index 000000000..a79da189b --- /dev/null +++ b/datasources/pinterest/__init__.py @@ -0,0 +1,12 @@ +""" +Initialize Pinterest data source +""" + +# An init_datasource function is expected to be available to initialize this +# data source. A default function that does this is available from the +# backend helpers library. +from common.lib.helpers import init_datasource + +# Internal identifier for this data source +DATASOURCE = "pinterest" +NAME = "Pinterest" \ No newline at end of file diff --git a/datasources/pinterest/search_pinterest.py b/datasources/pinterest/search_pinterest.py new file mode 100644 index 000000000..cd1b5e836 --- /dev/null +++ b/datasources/pinterest/search_pinterest.py @@ -0,0 +1,142 @@ +""" +Import scraped Pinterest data + +It's prohibitively difficult to scrape data from Pinterest within 4CAT itself due +to its aggressive rate limiting. Instead, import data collected elsewhere. 
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField


class SearchPinterest(Search):
    """
    Import scraped Pinterest data

    Pinterest data cannot be collected by this class itself (see
    `get_items`); it only maps previously captured objects to 4CAT items.
    """
    type = "pinterest-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Pinterest data"  # title displayed in UI
    description = "Import Pinterest data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Pinterest

        :param query:  Search query (unused)
        :raises NotImplementedError:  Always; data can only be imported
        """
        raise NotImplementedError("Pinterest datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map Pinterest object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects.

        :param dict post:  Pinterest object; an object whose `_zs-origin` key
        equals `"html"` is mapped as HTML-sourced, anything else as JSON-sourced
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            return SearchPinterest.map_item_from_html(post)

        return SearchPinterest.map_item_from_json(post)

    @staticmethod
    def map_item_from_json(post):
        """
        Map Pinterest object to 4CAT item

        Pretty simple, except posts sometimes don't have timestamps :| but at
        least these objects are more complete than the HTML data usually

        Several fields are looked up under both a camelCase and a snake_case
        key (e.g. `createdAt`/`created_at`, `pinCount`/`pin_count`), since
        both spellings are handled by this mapper.

        :param dict post: Pinterest object
        :return MappedItem: Mapped item
        :raises KeyError:  If a required key (e.g. `pinner`, `board`,
        `description`, `title`) is absent from the object
        """
        # fall back to the camelCase key when the snake_case one is absent
        # *or* null
        created_at = post.get("created_at") or post.get("createdAt")
        try:
            # there are often no timestamps :'(
            timestamp = datetime.strptime(created_at, "%a, %d %b %Y %H:%M:%S %z")
        except (ValueError, TypeError):
            # TypeError covers a missing (None) value, ValueError an
            # unexpected date format
            unix_timestamp = str_timestamp = MissingMappedField("")
        else:
            unix_timestamp = int(timestamp.timestamp())
            str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")

        # only touch post["id"] when there is no entityId; passing it as the
        # default of dict.get() would evaluate it eagerly and raise a KeyError
        # for objects that only have an entityId
        post_id = post["entityId"] if "entityId" in post else post["id"]

        if "imageSpec_orig" in post:
            image_url = post["imageSpec_orig"]["url"]
        else:
            image_url = post["images"]["orig"]["url"]

        pinner = post["pinner"]
        board = post["board"]

        return MappedItem({
            "id": post_id,
            "author": pinner["username"],
            "author_fullname": pinner.get("fullName", pinner.get("full_name", "")),
            "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else pinner["username"],
            # description and title may be null rather than an empty string
            "body": (post["description"] or "").strip(),
            "subject": (post["title"] or "").strip(),
            "ai_description": post.get("auto_alt_text", ""),
            "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
            "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
            "board": board["name"],
            "board_pins": board.get("pinCount", board.get("pin_count")),
            "board_url": f"https://www.pinterest.com{board['url']}",
            "timestamp": str_timestamp,
            "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
            "url": f"https://www.pinterest.com/pin/{post_id}",
            # these are not always available (shame)
            # "is_repin": "yes" if post["isRepin"] else "no",
            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
            "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
            "image_url": image_url,
            "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
            "unix_timestamp": unix_timestamp
        })

    @staticmethod
    def map_item_from_html(post):
        """
        Map Pinterest object to 4CAT item

        These are from the HTML and have even less data than JSON objects...
        but enough to be useful in some cases. Only the `id`, `body`, `title`,
        `tags` and `image` keys are read; every other output field is marked
        as missing.

        :param dict post: Pinterest object
        :return MappedItem: Mapped item
        :raises KeyError:  If one of the keys listed above is absent
        """
        return MappedItem({
            "id": int(post["id"]),
            "author": MissingMappedField(""),
            "author_fullname": MissingMappedField(""),
            "author_original": MissingMappedField(""),
            "body": post["body"].strip(),
            "subject": post["title"].strip(),
            "ai_description": MissingMappedField(""),
            "pinner_original": MissingMappedField(""),
            "pinner_via": MissingMappedField(""),
            "board": MissingMappedField(""),
            "board_pins": MissingMappedField(""),
            "board_url": MissingMappedField(""),
            "timestamp": MissingMappedField(""),  # there are no timestamps :(
            "idea_tags": ",".join(post["tags"]),
            "url": f"https://www.pinterest.com/pin/{post['id']}",
            # these are not always available (shame)
            # "is_repin": "yes" if post["isRepin"] else "no",
            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
            "is_video": MissingMappedField(""),
            "image_url": post["image"],
            "dominant_colour": MissingMappedField(""),
            "unix_timestamp": MissingMappedField("")
        })
@app.template_filter("add_colour")
def _jinja2_add_colours(data):
    """
    Add colour preview to hexadecimal colour values.

    Cute little preview for #FF0099-like strings. Used (at time of writing) for
    Pinterest data, which has a "dominant colour" field.

    Only works on strings that are *just* the value, to avoid messing up HTML
    etc. Anything else (non-strings, or strings with any content besides a
    3- or 6-digit hexadecimal colour) is returned unchanged.

    :param str data: String
    :return str: HTML
    """
    # fullmatch() rather than match(): match() only anchors at the start, so
    # a value like "#FF0099 <b>..." would otherwise be treated as a colour
    # even though it is not *just* the colour value
    if not isinstance(data, str) or not re.fullmatch(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", data):
        return data

    # NOTE(review): the docstring promises HTML, but this literal contains
    # only the value itself - confirm the preview markup was not lost
    return f' {data}'