From 5d5a0e30bb111a4096c22dc7929e79ca1a9d1f9c Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Wed, 23 Oct 2024 19:04:34 +0200
Subject: [PATCH] Catch rate limits in Telegram media downloads

---
 .../visualisation/download-telegram-images.py | 35 ++++++++++++++-----
 .../visualisation/download-telegram-videos.py | 12 ++++++-
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py
index 99ff5199b..9f0d38eec 100644
--- a/processors/visualisation/download-telegram-images.py
+++ b/processors/visualisation/download-telegram-images.py
@@ -7,13 +7,14 @@
 
 from pathlib import Path
 
+import telethon.errors
 from telethon import TelegramClient
 from telethon.errors import TimedOutError
 
 from common.config_manager import config
 from backend.lib.processor import BasicProcessor
 from common.lib.exceptions import ProcessorInterruptedException
-from common.lib.helpers import UserInput
+from common.lib.helpers import UserInput, timify_long
 from common.lib.dataset import DataSet
 from processors.visualisation.download_images import ImageDownloader
 
@@ -194,6 +195,13 @@ async def get_images(self):
                     if self.interrupted:
                         raise ProcessorInterruptedException("Interrupted while downloading images")
 
+                    if not message:
+                        # message no longer exists
+                        self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it "
+                                         f"may have been deleted)")
+                        self.flawless = False
+                        continue
+
                     success = False
                     try:
                         # it's actually unclear if images are always jpegs, but this
@@ -215,14 +223,23 @@ async def get_images(self):
                         msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}"
                         self.dataset.log(f"Could not download image for message {msg_id} ({e})")
                         self.flawless = False
-
-                    media_done += 1
-                    self.metadata[filename] = {
-                        "filename": filename,
-                        "success": success,
-                        "from_dataset": self.source_dataset.key,
-                        "post_ids": [msg_id]
-                    }
+                    finally:
+                        media_done += 1
+                        self.metadata[filename] = {
+                            "filename": filename,
+                            "success": success,
+                            "from_dataset": self.source_dataset.key,
+                            "post_ids": [msg_id]
+                        }
+
+            except telethon.errors.FloodError as e:
+                later = "later"
+                if hasattr(e, "seconds"):
+                    later = f"in {timify_long(e.seconds)}"
+                self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); "
+                                           f"halting download process. Try again {later}.", is_final=True)
+                self.flawless = False
+                break
                     
             except ValueError as e:
                 self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})")
diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py
index ef6d44231..b441ff9d4 100644
--- a/processors/visualisation/download-telegram-videos.py
+++ b/processors/visualisation/download-telegram-videos.py
@@ -8,12 +8,13 @@
 from pathlib import Path
 
 from telethon import TelegramClient
+from telethon.errors import FloodError
 
 from common.config_manager import config
 from backend.lib.processor import BasicProcessor
 from common.lib.exceptions import ProcessorInterruptedException
 from processors.visualisation.download_videos import VideoDownloaderPlus
-from common.lib.helpers import UserInput
+from common.lib.helpers import UserInput, timify_long
 from common.lib.dataset import DataSet
 
 __author__ = "Stijn Peeters"
@@ -210,6 +211,15 @@ async def get_videos(self):
                         "from_dataset": self.source_dataset.key,
                         "post_ids": [msg_id]
                     }
+
+            except FloodError as e:  # rate-limited by Telegram: stop instead of retrying
+                later = "later"  # fallback wording when the error carries no wait time
+                if hasattr(e, "seconds"):
+                    later = f"in {timify_long(e.seconds)}"
+                self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} video(s); "
+                                           f"halting download process. Try again {later}.", is_final=True)
+                self.flawless = False
+                break
                     
             except ValueError as e:
                 self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})")