From 5d5a0e30bb111a4096c22dc7929e79ca1a9d1f9c Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 19:04:34 +0200 Subject: [PATCH] Catch rate limits in Telegram media downloads --- .../visualisation/download-telegram-images.py | 35 ++++++++++++++----- .../visualisation/download-telegram-videos.py | 12 ++++++- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 99ff5199b..9f0d38eec 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -7,13 +7,14 @@ from pathlib import Path +import telethon.errors from telethon import TelegramClient from telethon.errors import TimedOutError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet from processors.visualisation.download_images import ImageDownloader @@ -194,6 +195,13 @@ async def get_images(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while downloading images") + if not message: + # message no longer exists + self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " + f"may have been deleted)") + self.flawless = False + continue + success = False try: # it's actually unclear if images are always jpegs, but this @@ -215,14 +223,23 @@ async def get_images(self): msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False - - media_done += 1 - self.metadata[filename] = { - "filename": filename, - "success": success, - "from_dataset": self.source_dataset.key, - "post_ids": [msg_id] - } + finally: + media_done += 1 + self.metadata[filename] = { + "filename": filename, + "success": success, + "from_dataset": self.source_dataset.key, + "post_ids": [msg_id] + } + + except telethon.errors.FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index ef6d44231..b441ff9d4 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,12 +8,13 @@ from pathlib import Path from telethon import TelegramClient +from telethon.errors import FloodError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from processors.visualisation.download_videos import VideoDownloaderPlus -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet __author__ = "Stijn Peeters" @@ -210,6 +211,15 @@ async def get_videos(self): "from_dataset": self.source_dataset.key, "post_ids": [msg_id] } + + except FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})")