Skip to content

Commit

Permalink
vid download improvements/fixes: stop on any 4 consecutive errors; error message for filesize too large, fix counting yt_dlp downloads, wait time on additional error types
Browse files Browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Dec 19, 2024
1 parent d0be5c6 commit d1d9347
Showing 1 changed file with 35 additions and 40 deletions.
75 changes: 35 additions & 40 deletions processors/visualisation/download_videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ class VideoDownloaderPlus(BasicProcessor):
"columns": {
"type": UserInput.OPTION_TEXT,
"help": "Column to get video links from",
"default": "video_url",
"inline": True,
"tooltip": "If the column contains a single URL, use that URL; else, try to find image URLs in the "
"column's content"
Expand Down Expand Up @@ -145,7 +144,7 @@ class VideoDownloaderPlus(BasicProcessor):
def __init__(self, logger, job, queue=None, manager=None, modules=None):
super().__init__(logger, job, queue, manager, modules)
self.max_videos_per_url = 1
self.videos_downloaded_from_url = 0
self.videos_downloaded_from_url = None
self.downloaded_videos = 0
self.total_possible_videos = 5
self.url_files = None
Expand All @@ -167,7 +166,7 @@ def get_options(cls, parent_dataset=None, user=None):
options["amount"]["tooltip"] = "Use 0 to download all videos"
else:
options['amount']['max'] = max_number_videos
options['amount']['help'] = f"No. of videos (max {max_number_videos:,}"
options['amount']['help'] = f"No. of videos (max {max_number_videos:,})"

# And update the max size and help from config
max_video_size = int(config.get('video-downloader.max-size', 100, user=user))
Expand All @@ -190,19 +189,9 @@ def get_options(cls, parent_dataset=None, user=None):
options["columns"]["options"] = {v: v for v in columns}

# Figure out default column
if "video_url" in columns:
options["columns"]["default"] = "video_url"
elif "media_urls" in columns:
options["columns"]["default"] = "media_urls"
elif "video" in "".join(columns):
# Grab first video column
options["columns"]["default"] = sorted(columns, key=lambda k: "video" in k).pop()
elif "url" in columns or "urls" in columns:
# Grab first url column
options["columns"]["default"] = sorted(columns, key=lambda k: "url" in k).pop()
else:
# Give up
options["columns"]["default"] = "body"
priority = ["video_url", "video_link", "media_url", "media_link", "final_url", "url", "link", "video", "body"]
columns.sort(key=lambda col: next((i for i, p in enumerate(priority) if p in col.lower()), len(priority)))
options["columns"]["default"] = [columns.pop(0)]

# these two options are likely to be unwanted on instances with many
# users, so they are behind an admin config options
Expand Down Expand Up @@ -325,7 +314,7 @@ def dmi_match_filter(vid_info, *, incomplete):
self.downloaded_videos = 0
failed_downloads = 0
copied_videos = 0
consecutive_timeouts = 0
consecutive_errors = 0
self.total_possible_videos = min(len(urls), amount) if amount != 0 else len(urls)
yt_dlp_archive_map = {}
for url in urls:
Expand All @@ -342,7 +331,7 @@ def dmi_match_filter(vid_info, *, incomplete):
continue
except FailedToCopy as e:
# Unable to copy
self.dataset.log(str(e)+"; attempting to download again")
self.dataset.log(f"{str(e)}; attempting to download again")
elif previous_vid_metadata.get("retry", True) is False:
urls[url] = previous_vid_metadata
self.dataset.log(f"Skipping; previously identified url as not a video: {url}")
Expand All @@ -359,13 +348,13 @@ def dmi_match_filter(vid_info, *, incomplete):
raise ProcessorInterruptedException("Interrupted while downloading videos.")

# Check for repeated timeouts
if consecutive_timeouts > 5:
if consecutive_errors >= 5:
if use_yt_dlp:
message = "Downloaded %i videos. Timed out %i consecutive times; try " \
"deselecting the non-direct videos setting" % (self.downloaded_videos, consecutive_timeouts)
message = "Downloaded %i videos. Errors %i consecutive times; try " \
"deselecting the non-direct videos setting" % (self.downloaded_videos, consecutive_errors)
else:
message = "Downloaded %i videos. Timed out %i consecutive times; check logs to ensure " \
"video URLs are working links and you are not being blocked." % (self.downloaded_videos, consecutive_timeouts)
message = "Downloaded %i videos. Errors %i consecutive times; check logs to ensure " \
"video URLs are working links and you are not being blocked." % (self.downloaded_videos, consecutive_errors)
self.dataset.update_status(message, is_final=True)
if self.downloaded_videos == 0:
self.dataset.finish(0)
Expand All @@ -382,7 +371,7 @@ def dmi_match_filter(vid_info, *, incomplete):
self.dataset.log(message)
continue

self.videos_downloaded_from_url = 0
self.videos_downloaded_from_url = set()
# First we'll try to see if we can directly download the URL
try:
filename = self.download_video_with_requests(url, results_path, max_video_size)
Expand All @@ -393,16 +382,16 @@ def dmi_match_filter(vid_info, *, incomplete):
"success": True
}]
success = True
self.videos_downloaded_from_url = 1
self.videos_downloaded_from_url.add(filename)
except (requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects, FilesizeException, FailedDownload, NotAVideo) as e:
# FilesizeException raised when file size is too large or unknown filesize (and that is disabled in 4CAT settings)
# FailedDownload raised when response other than 200 received
# NotAVideo raised due to specific Content-Type known to not be a video (and not a webpage/html that could lead to a video via YT-DLP)
self.dataset.log(str(e))
self.dataset.log(f"Request Error: {str(e)}")
urls[url]["error"] = str(e)
if type(e) is requests.exceptions.Timeout:
# TODO: retry timeouts?
consecutive_timeouts += 1
consecutive_errors += 1
if type(e) in [requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ConnectionError, FilesizeException, FailedDownload]:
failed_downloads += 1
if type(e) in [NotAVideo]:
Expand All @@ -420,14 +409,15 @@ def dmi_match_filter(vid_info, *, incomplete):
self.url_files = []
self.last_dl_status = {}
self.last_post_process_status = {}
self.dataset.update_status("Downloading %i/%i via yt-dlp: %s" % (
self.downloaded_videos + 1, self.total_possible_videos, url))
self.dataset.update_status("Downloading %i/%i via yt-dlp: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url))
try:
info = ydl.extract_info(url)
except MaxVideosDownloaded:
self.dataset.log("Max videos for URL reached.")
# Raised when already downloaded max number of videos per URL as defined in self.max_videos_per_url
pass
except ExistingVideoReached:
self.dataset.log("Already downloaded video associated with: %s" % url)
# Video already downloaded; grab from archive
# TODO: with multiple videos per URL, this may not capture the desired video and would instead repeat the first video; Need more feedback from yt-dlp!
with yt_dlp.YoutubeDL({"socket_timeout": 30}) as ydl2:
Expand All @@ -439,33 +429,36 @@ def dmi_match_filter(vid_info, *, incomplete):
message = f"Video identified, but unable to identify which video from {url}"
self.dataset.log(message)
self.log.warning(message)
if self.videos_downloaded_from_url == 0:
if len(self.videos_downloaded_from_url) == 0:
# No videos downloaded for this URL
urls[url]['error'] = message
continue
except (DownloadError, LiveVideoException) as e:
# LiveVideoException raised when a video is known to be live
if "Requested format is not available" in str(e):
message = f"No format available for video (filesize less than {max_size}" + " and unknown sizes not allowed)" if not allow_unknown_sizes else ")"
self.dataset.log(f"Format Error: {str(e)}")
message = f"No format available for video (filesize less than {max_size}" + (" and unknown sizes not allowed)" if not allow_unknown_sizes else ")")
elif "Unable to download webpage: The read operation timed out" in str(e):
# Certain sites fail repeatedly (22-12-8 TikTok has this issue)
consecutive_timeouts += 1
message = 'DownloadError: %s' % str(e)
elif "Sign in to confirm you’re not a bot." in str(e):
# Youtube blocks YT-DLP; need to sign in
message = 'Sign in required: %s' % str(e)
elif "HTTP Error 429: Too Many Requests" in str(e):
# Oh no, https://github.com/yt-dlp/yt-dlp/wiki/FAQ#http-error-429-too-many-requests-or-402-payment-required
consecutive_timeouts += 1
message = 'Too Many Requests: %s' % str(e)
# We can try waiting...
# TODO: Should we check to see if we are hitting the same domain repeatedly?
time.sleep(10 * consecutive_timeouts)
# TODO: Add url back to end?
else:
message = 'DownloadError: %s' % str(e)
time.sleep(10 * consecutive_errors)
consecutive_errors += 1
urls[url]['error'] = message
failed_downloads += 1
self.dataset.log(message)
continue
except Exception as e:
self.dataset.log(f"YT-DLP raised unexpected error: {str(e)}")
# Catch all other issues w/ yt-dlp
message = "YT-DLP raised unexpected error: %s" % str(e)
urls[url]['error'] = message
Expand All @@ -486,16 +479,16 @@ def dmi_match_filter(vid_info, *, incomplete):

else:
# No YT-DLP; move on
self.dataset.log(str(e))
self.dataset.log(f"Unknown Request Error: {str(e)}")
urls[url]["error"] = str(e)
continue

urls[url]["success"] = success
if success:
consecutive_timeouts = 0
consecutive_errors = 0

# Update status
self.downloaded_videos += self.videos_downloaded_from_url
self.downloaded_videos += len(self.videos_downloaded_from_url)
self.dataset.update_status(f"Downloaded {self.downloaded_videos}/{self.total_possible_videos} videos" +
(f"; videos copied from {copied_videos} previous downloads" if copied_videos > 0 else "") +
(f"; {failed_downloads} URLs failed." if failed_downloads > 0 else ""))
Expand Down Expand Up @@ -525,7 +518,7 @@ def yt_dlp_monitor(self, d):
self.last_dl_status = d

# Check if Max Video Downloads already reached
if self.videos_downloaded_from_url != 0 and self.videos_downloaded_from_url >= self.max_videos_per_url:
if len(self.videos_downloaded_from_url) != 0 and len(self.videos_downloaded_from_url) >= self.max_videos_per_url:
# DO NOT RAISE ON 0! (22-12-8 max_videos_per_url should no longer ever be 0)
raise MaxVideosDownloaded('Max videos for URL reached.')

Expand All @@ -539,7 +532,7 @@ def yt_dlp_post_monitor(self, d):
"""
self.last_post_process_status = d
if d['status'] == 'finished': # "downloading", "error", or "finished"
self.videos_downloaded_from_url += 1
self.videos_downloaded_from_url.add(d.get('info_dict',{}).get('_filename', {}))
self.url_files.append({
"filename": Path(d.get('info_dict').get('_filename')).name,
"metadata": d.get('info_dict'),
Expand Down Expand Up @@ -702,6 +695,8 @@ def copy_previous_video(self, previous_vid_metadata, staging_area, previous_down
files = previous_vid_metadata.get('files')
elif "filename" in previous_vid_metadata:
files = [{"filename": previous_vid_metadata.get("filename"), "success": True}]
else:
raise FailedToCopy(f"Unable to read video metadata")

if not files:
raise FailedToCopy(f"No file found in metadata")
Expand Down

0 comments on commit d1d9347

Please sign in to comment.