From 34b8409abbc4055416bf57a6a29b50d66d92fab4 Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Mon, 17 Feb 2025 19:02:17 +0100
Subject: [PATCH] Xiaohongshu/RedNote data source

---
 datasources/xiaohongshu/DESCRIPTION.md    |  14 ++
 datasources/xiaohongshu/__init__.py       |  12 ++
 datasources/xiaohongshu/search_rednote.py | 160 ++++++++++++++++++++++
 3 files changed, 186 insertions(+)
 create mode 100644 datasources/xiaohongshu/DESCRIPTION.md
 create mode 100644 datasources/xiaohongshu/__init__.py
 create mode 100644 datasources/xiaohongshu/search_rednote.py

diff --git a/datasources/xiaohongshu/DESCRIPTION.md b/datasources/xiaohongshu/DESCRIPTION.md
new file mode 100644
index 00000000..b525fa8e
--- /dev/null
+++ b/datasources/xiaohongshu/DESCRIPTION.md
@@ -0,0 +1,14 @@
+The RedNote data source can be used to manipulate data collected from [RedNote](https://www.xiaohongshu.com/) - also 
+known as Xiaohongshu or Little Red Book - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; 4CAT 
+cannot collect data on its own. After collecting data with Zeeschuimer it can be uploaded to 4CAT for further processing 
+and analysis. See the Zeeschuimer documentation for more information on how to collect data with it.
+
+Data is collected as it is formatted internally by RedNote's website. Posts are stored as (large) JSON objects; it 
+will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure
+is relatively straightforward and contains some data not included in the CSV exports.
+
+Note that depending on the page data is captured from, some metadata may not be available. For example, when capturing 
+data from the 'Explore' page, the description and time of posting of a post are not available. These are however 
+available when capturing from a post's page. After importing a dataset to 4CAT, the dataset status will summarise what
+information is and is not available. A `missing_fields` column additionally contains the names of columns with missing 
+data for each imported item.
\ No newline at end of file
diff --git a/datasources/xiaohongshu/__init__.py b/datasources/xiaohongshu/__init__.py
new file mode 100644
index 00000000..074468d4
--- /dev/null
+++ b/datasources/xiaohongshu/__init__.py
@@ -0,0 +1,12 @@
+"""
+Initialize RedNote data source
+"""
+
+# An init_datasource function is expected to be available to initialize this
+# data source. A default function that does this is available from the
+# backend helpers library.
+from common.lib.helpers import init_datasource
+
+# Internal identifier for this data source
+DATASOURCE = "xiaohongshu"
+NAME = "Xiaohongshu/RedNote"  # presumably the human-readable name shown in the interface - confirm
\ No newline at end of file
diff --git a/datasources/xiaohongshu/search_rednote.py b/datasources/xiaohongshu/search_rednote.py
new file mode 100644
index 00000000..b1e0969d
--- /dev/null
+++ b/datasources/xiaohongshu/search_rednote.py
@@ -0,0 +1,160 @@
+"""
+Import scraped RedNote data
+
+It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
+to its aggressive rate limiting. Instead, import data collected elsewhere.
+"""
+from datetime import datetime
+
+from backend.lib.search import Search
+from common.lib.item_mapping import MappedItem, MissingMappedField
+
+
+class SearchRedNote(Search):
+    """
+    Import scraped RedNote/Xiaohongshu/XHS data
+    """
+    type = "xiaohongshu-search"  # job ID
+    category = "Search"  # category
+    title = "Import scraped RedNote data"  # title displayed in UI
+    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
+    extension = "ndjson"  # extension of result file, used internally and in UI
+    is_from_zeeschuimer = True
+
+    # not available as a processor for existing datasets
+    accepts = [None]
+    references = [
+        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
+        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"  # NOTE(review): TikTok worksheet - confirm it is meant to be listed for RedNote
+    ]
+
+    def get_items(self, query):
+        """
+        Reject direct searches - RedNote data can only be imported, so this
+        always raises a NotImplementedError.
+        """
+        message = "RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere"
+        raise NotImplementedError(message)
+
+
+    @staticmethod
+    def map_item(post):
+        """
+        Map XHS object to 4CAT item
+
+        Picks a mapper based on how the object was captured: pre-mapped from
+        HTML, embedded in a page as JSON, or returned by the web API.
+
+        :param dict post:  Captured object (a `note_card` key may be added in place)
+        :return MappedItem:  Result of the selected mapper
+        """
+        if post.get("_zs-origin") == "html":
+            return SearchRedNote.map_item_from_html(post)
+
+        if "note" in post:
+            return SearchRedNote.map_item_from_json_embedded(post)
+
+        if post.get("type") == "video":
+            post["note_card"] = post.copy()
+        return SearchRedNote.map_item_from_json_api_explore(post)
+
+    @staticmethod
+    def map_item_from_json_api_explore(post):
+        """
+        Map API-sourced XHS object to 4CAT item
+
+        Most straightforward - JSON objects from the XHS web API, which do
+        however not always contain the same fields.
+
+        :param dict post:  Object with a `note_card` dict; it is not modified
+        :return MappedItem:  Mapped item; an absent body or time is marked missing
+        """
+        item = post["note_card"]
+        item_id = post.get("id", post.get("note_id"))
+
+        # read (rather than pop) the first image, so the caller's data stays intact
+        image = item["image_list"][0]["url_default"] if item.get("image_list") else item["cover"]["url_default"]
+        # permalinks need this token to work, else you get a 404 not found
+        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
+
+        timestamp = item.get("time", None)
+        return MappedItem({
+            "id": item_id,
+            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
+            "title": item.get("display_title", ""),
+            "body": item["desc"] if "desc" in item else MissingMappedField(""),
+            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
+            "author": item["user"]["nickname"],
+            "author_avatar_url": item["user"]["avatar"],
+            "image_url": image,
+            # only available when loading an individual post page, so skip
+            # "tags": ",".join(t["name"] for t in item["tag_list"]),
+            "likes": item["interact_info"]["liked_count"],
+            # "collects": item["interact_info"]["collected_count"],
+            # "comments": item["interact_info"]["comment_count"],
+            # "shares": item["interact_info"]["share_count"],
+            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
+        })
+
+    @staticmethod
+    def map_item_from_json_embedded(item):
+        """
+        Map JSON object from an XHS HTML page
+
+        JSON objects from the HTML are formatted slightly differently, mostly
+        in that they use camelCase instead of underscores, but we can also
+        make a few more assumptions about the data
+
+        :param dict item:  Object with a `note` dict; it is not modified
+        :return MappedItem:  Mapped item; an absent body or time is marked missing
+        """
+        note = item["note"]
+        image = note["imageList"][0]["urlDefault"]  # index, don't pop: keep the caller's list intact
+        # permalinks need this token to work, else you get a 404 not found
+        xsec_bit = f"?xsec_token={note['xsecToken']}"
+        timestamp = item.get("time", None)  # NOTE(review): read from the outer object, not `note` - confirm
+
+        return MappedItem({
+            "id": item["id"],
+            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
+            "title": note.get("title", ""),
+            "body": note["desc"] if "desc" in note else MissingMappedField(""),
+            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
+            "author": note["user"]["nickname"],
+            "author_avatar_url": note["user"]["avatar"],
+            "image_url": image,
+            # only available when loading an individual post page, so skip
+            # "tags": ",".join(t["name"] for t in item["tag_list"]),
+            "likes": item["interactInfo"]["likedCount"],
+            # "collects": item["interact_info"]["collected_count"],
+            # "comments": item["interact_info"]["comment_count"],
+            # "shares": item["interact_info"]["share_count"],
+            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
+        })
+
+    @staticmethod
+    def map_item_from_html(item):
+        """
+        Map item pre-mapped by Zeeschuimer from the page HTML
+
+        Contains less data than JSON objects; unavailable fields are marked missing.
+
+        :param dict item:  Item as pre-mapped by Zeeschuimer
+        :return MappedItem:  Mapped item with body and timestamps marked missing
+        """
+        return MappedItem({
+            "id": item["id"],
+            "url": f"https://www.xiaohongshu.com{item['url']}",
+            "title": item["title"],
+            "body": MissingMappedField(""),
+            "timestamp": MissingMappedField(""),
+            "author": item["author_name"],
+            "author_avatar_url": item["author_avatar_url"],
+            "image_url": item["thumbnail_url"],
+            # "tags": MissingMappedField(""),
+            "likes": item["likes"],
+            # "collects": MissingMappedField(""),
+            # "comments": MissingMappedField(""),
+            # "shares": MissingMappedField(""),
+            "unix_timestamp": MissingMappedField(""),
+        })