From 34b8409abbc4055416bf57a6a29b50d66d92fab4 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 17 Feb 2025 19:02:17 +0100 Subject: [PATCH] Xiaohongshu/RedNote data source --- datasources/xiaohongshu/DESCRIPTION.md | 14 ++ datasources/xiaohongshu/__init__.py | 12 ++ datasources/xiaohongshu/search_rednote.py | 160 ++++++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 datasources/xiaohongshu/DESCRIPTION.md create mode 100644 datasources/xiaohongshu/__init__.py create mode 100644 datasources/xiaohongshu/search_rednote.py diff --git a/datasources/xiaohongshu/DESCRIPTION.md b/datasources/xiaohongshu/DESCRIPTION.md new file mode 100644 index 00000000..b525fa8e --- /dev/null +++ b/datasources/xiaohongshu/DESCRIPTION.md @@ -0,0 +1,14 @@ +The RedNote data source can be used to manipulate data collected from [RedNote](https://www.xiaohongshu.com/) - also +known as Xiaohongshu or Little Red Book - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; 4CAT +cannot collect data on its own. After collecting data with Zeeschuimer it can be uploaded to 4CAT for further processing +and analysis. See the Zeeschuimer documentation for more information on how to collect data with it. + +Data is collected as it is formatted internally by RedNote's website. Posts are stored as (large) JSON objects; it +will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure +is relatively straightforward and contains some data not included in the CSV exports. + +Note that depending on the page data is captured from, some metadata may not be available. For example, when capturing +data from the 'Explore' page, the description and time of posting of a post are not available. These are however +available when capturing from a post's page. After importing a dataset to 4CAT, the dataset status will summarise what +information is and is not available. 
A `missing_fields` column additionally contains the names of columns with missing
+data for each imported item.
\ No newline at end of file
diff --git a/datasources/xiaohongshu/__init__.py b/datasources/xiaohongshu/__init__.py
new file mode 100644
index 00000000..074468d4
--- /dev/null
+++ b/datasources/xiaohongshu/__init__.py
@@ -0,0 +1,12 @@
+"""
+Initialize RedNote data source
+"""
+
+# An init_datasource function is expected to be available to initialize this
+# data source. A default function that does this is available from the
+# backend helpers library.
+from common.lib.helpers import init_datasource
+
+# Internal identifier for this data source
+DATASOURCE = "xiaohongshu"
+NAME = "Xiaohongshu/RedNote"
\ No newline at end of file
diff --git a/datasources/xiaohongshu/search_rednote.py b/datasources/xiaohongshu/search_rednote.py
new file mode 100644
index 00000000..b1e0969d
--- /dev/null
+++ b/datasources/xiaohongshu/search_rednote.py
@@ -0,0 +1,178 @@
+"""
+Import scraped RedNote data
+
+It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
+to its aggressive rate limiting. Instead, import data collected elsewhere.
+"""
+from datetime import datetime
+
+from backend.lib.search import Search
+from common.lib.item_mapping import MappedItem, MissingMappedField
+
+
+class SearchRedNote(Search):
+    """
+    Import scraped RedNote/Xiaohongshu/XSH data
+    """
+    type = "xiaohongshu-search"  # job ID
+    category = "Search"  # category
+    title = "Import scraped RedNote data"  # title displayed in UI
+    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
+    extension = "ndjson"  # extension of result file, used internally and in UI
+    is_from_zeeschuimer = True
+
+    # not available as a processor for existing datasets
+    accepts = [None]
+    references = [
+        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
+        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
+    ]
+
+    def get_items(self, query):
+        """
+        Run custom search
+
+        Not available for RedNote
+
+        :param query:  Search parameters (unused)
+        :raises NotImplementedError:  Always; data can only be imported
+        """
+        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")
+
+    @staticmethod
+    def map_item(post):
+        """
+        Map XSH object to 4CAT item
+
+        Depending on whether the object was captured from JSON or HTML, treat it
+        differently. A lot of data is missing from HTML objects.
+
+        :param dict post:  Imported item
+        :return MappedItem:  Mapped item
+        """
+        if post.get("_zs-origin") == "html":
+            return SearchRedNote.map_item_from_html(post)
+
+        if "note" in post:
+            return SearchRedNote.map_item_from_json_embedded(post)
+
+        if post.get("type") == "video":
+            # these objects have the note's fields at the top level; expose
+            # them under `note_card` on a shallow copy, so the object that
+            # was passed in is not altered
+            post = {**post, "note_card": post}
+
+        return SearchRedNote.map_item_from_json_api_explore(post)
+
+    @staticmethod
+    def map_item_from_json_api_explore(post):
+        """
+        Map API-sourced XSH object to 4CAT item
+
+        Most straightforward - JSON objects from the XSH web API, which do
+        however not always contain the same fields.
+
+        :param dict post:  Item with the note's data under `note_card`
+        :return MappedItem:  Mapped item
+        """
+        item = post["note_card"]
+        # the ID is stored as either `id` or `note_id`; use whichever is
+        # there for both the ID and the permalink
+        item_id = post["id"] if "id" in post else post["note_id"]
+
+        # read the first image instead of popping it, so the item is left
+        # intact and mapping it again gives the same result
+        images = item.get("image_list")
+        image = images[0]["url_default"] if images else item["cover"]["url_default"]
+
+        # permalinks need this token to work, else you get a 404 not found
+        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
+
+        # milliseconds since the epoch, if present
+        timestamp = item.get("time", None)
+        return MappedItem({
+            "id": item_id,
+            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
+            "title": item.get("display_title", ""),
+            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
+            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
+            "author": item["user"]["nickname"],
+            "author_avatar_url": item["user"]["avatar"],
+            "image_url": image,
+            # only available when loading an individual post page, so skip
+            # "tags": ",".join(t["name"] for t in item["tag_list"]),
+            "likes": item["interact_info"]["liked_count"],
+            # "collects": item["interact_info"]["collected_count"],
+            # "comments": item["interact_info"]["comment_count"],
+            # "shares": item["interact_info"]["share_count"],
+            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
+        })
+
+    @staticmethod
+    def map_item_from_json_embedded(item):
+        """
+        Map JSON object from an XHS HTML page
+
+        JSON objects from the HTML are formatted slightly differently, mostly
+        in that they use camelCase instead of underscores, but we can also
+        make a few more assumptions about the data
+
+        :param dict item:  Item with the note's data under `note`
+        :return MappedItem:  Mapped item
+        """
+        note = item["note"]
+        # read the first image instead of popping it, so the item is left
+        # intact and mapping it again gives the same result
+        image = note["imageList"][0]["urlDefault"]
+        # permalinks need this token to work, else you get a 404 not found
+        xsec_bit = f"?xsec_token={note['xsecToken']}"
+        # NOTE(review): `time` and `interactInfo` are read from the outer
+        # object while the other fields come from `note` - confirm that this
+        # matches the captured data
+        timestamp = item.get("time", None)
+
+        return MappedItem({
+            "id": item["id"],
+            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
+            "title": note.get("title", ""),
+            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
+            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
+            "author": note["user"]["nickname"],
+            "author_avatar_url": note["user"]["avatar"],
+            "image_url": image,
+            # only available when loading an individual post page, so skip
+            # "tags": ",".join(t["name"] for t in item["tag_list"]),
+            "likes": item["interactInfo"]["likedCount"],
+            # "collects": item["interact_info"]["collected_count"],
+            # "comments": item["interact_info"]["comment_count"],
+            # "shares": item["interact_info"]["share_count"],
+            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
+        })
+
+    @staticmethod
+    def map_item_from_html(item):
+        """
+        Map pre-mapped item
+
+        These have been mapped by Zeeschuimer from the page HTML and contain
+        less data than JSON objects (but enough to be useful in some cases).
+
+        :param dict item:
+        :return MappedItem:
+        """
+        return MappedItem({
+            "id": item["id"],
+            "url": f"https://www.xiaohongshu.com{item['url']}",
+            "title": item["title"],
+            "body": MissingMappedField(""),
+            "timestamp": MissingMappedField(""),
+            "author": item["author_name"],
+            "author_avatar_url": item["author_avatar_url"],
+            "image_url": item["thumbnail_url"],
+            # "tags": MissingMappedField(""),
+            "likes": item["likes"],
+            # "collects": MissingMappedField(""),
+            # "comments": MissingMappedField(""),
+            # "shares": MissingMappedField(""),
+            "unix_timestamp": MissingMappedField(""),
+        })