diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index d1d7bd67..e74df85e 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -75,7 +75,6 @@ class VideoDownloaderPlus(BasicProcessor): "columns": { "type": UserInput.OPTION_TEXT, "help": "Column to get video links from", - "default": "video_url", "inline": True, "tooltip": "If the column contains a single URL, use that URL; else, try to find image URLs in the " "column's content" @@ -145,7 +144,7 @@ class VideoDownloaderPlus(BasicProcessor): def __init__(self, logger, job, queue=None, manager=None, modules=None): super().__init__(logger, job, queue, manager, modules) self.max_videos_per_url = 1 - self.videos_downloaded_from_url = 0 + self.videos_downloaded_from_url = None self.downloaded_videos = 0 self.total_possible_videos = 5 self.url_files = None @@ -167,7 +166,7 @@ def get_options(cls, parent_dataset=None, user=None): options["amount"]["tooltip"] = "Use 0 to download all videos" else: options['amount']['max'] = max_number_videos - options['amount']['help'] = f"No. of videos (max {max_number_videos:,}" + options['amount']['help'] = f"No. 
of videos (max {max_number_videos:,})" # And update the max size and help from config max_video_size = int(config.get('video-downloader.max-size', 100, user=user)) @@ -190,19 +189,9 @@ def get_options(cls, parent_dataset=None, user=None): options["columns"]["options"] = {v: v for v in columns} # Figure out default column - if "video_url" in columns: - options["columns"]["default"] = "video_url" - elif "media_urls" in columns: - options["columns"]["default"] = "media_urls" - elif "video" in "".join(columns): - # Grab first video column - options["columns"]["default"] = sorted(columns, key=lambda k: "video" in k).pop() - elif "url" in columns or "urls" in columns: - # Grab first url column - options["columns"]["default"] = sorted(columns, key=lambda k: "url" in k).pop() - else: - # Give up - options["columns"]["default"] = "body" + priority = ["video_url", "video_link", "media_url", "media_link", "final_url", "url", "link", "video", "body"] + columns.sort(key=lambda col: next((i for i, p in enumerate(priority) if p in col.lower()), len(priority))) + options["columns"]["default"] = [columns.pop(0)] # these two options are likely to be unwanted on instances with many # users, so they are behind an admin config options @@ -325,7 +314,7 @@ def dmi_match_filter(vid_info, *, incomplete): self.downloaded_videos = 0 failed_downloads = 0 copied_videos = 0 - consecutive_timeouts = 0 + consecutive_errors = 0 self.total_possible_videos = min(len(urls), amount) if amount != 0 else len(urls) yt_dlp_archive_map = {} for url in urls: @@ -342,7 +331,7 @@ def dmi_match_filter(vid_info, *, incomplete): continue except FailedToCopy as e: # Unable to copy - self.dataset.log(str(e)+"; attempting to download again") + self.dataset.log(f"{str(e)}; attempting to download again") elif previous_vid_metadata.get("retry", True) is False: urls[url] = previous_vid_metadata self.dataset.log(f"Skipping; previously identified url as not a video: {url}") @@ -359,13 +348,13 @@ def 
dmi_match_filter(vid_info, *, incomplete): raise ProcessorInterruptedException("Interrupted while downloading videos.") # Check for repeated timeouts - if consecutive_timeouts > 5: + if consecutive_errors >= 5: if use_yt_dlp: - message = "Downloaded %i videos. Timed out %i consecutive times; try " \ - "deselecting the non-direct videos setting" % (self.downloaded_videos, consecutive_timeouts) + message = "Downloaded %i videos. Errors %i consecutive times; try " \ + "deselecting the non-direct videos setting" % (self.downloaded_videos, consecutive_errors) else: - message = "Downloaded %i videos. Timed out %i consecutive times; check logs to ensure " \ - "video URLs are working links and you are not being blocked." % (self.downloaded_videos, consecutive_timeouts) + message = "Downloaded %i videos. Errors %i consecutive times; check logs to ensure " \ + "video URLs are working links and you are not being blocked." % (self.downloaded_videos, consecutive_errors) self.dataset.update_status(message, is_final=True) if self.downloaded_videos == 0: self.dataset.finish(0) @@ -382,7 +371,7 @@ def dmi_match_filter(vid_info, *, incomplete): self.dataset.log(message) continue - self.videos_downloaded_from_url = 0 + self.videos_downloaded_from_url = set() # First we'll try to see if we can directly download the URL try: filename = self.download_video_with_requests(url, results_path, max_video_size) @@ -393,16 +382,16 @@ def dmi_match_filter(vid_info, *, incomplete): "success": True }] success = True - self.videos_downloaded_from_url = 1 + self.videos_downloaded_from_url.add(filename) except (requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects, FilesizeException, FailedDownload, NotAVideo) as e: # FilesizeException raised when file size is too large or unknown filesize (and that is disabled in 4CAT settings) # FailedDownload raised when response other than 200 received # NotAVideo raised due to 
specific Content-Type known to not be a video (and not a webpage/html that could lead to a video via YT-DLP) - self.dataset.log(str(e)) + self.dataset.log(f"Request Error: {str(e)}") urls[url]["error"] = str(e) if type(e) is requests.exceptions.Timeout: # TODO: retry timeouts? - consecutive_timeouts += 1 + consecutive_errors += 1 if type(e) in [requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ConnectionError, FilesizeException, FailedDownload]: failed_downloads += 1 if type(e) in [NotAVideo]: @@ -420,14 +409,15 @@ def dmi_match_filter(vid_info, *, incomplete): self.url_files = [] self.last_dl_status = {} self.last_post_process_status = {} - self.dataset.update_status("Downloading %i/%i via yt-dlp: %s" % ( - self.downloaded_videos + 1, self.total_possible_videos, url)) + self.dataset.update_status("Downloading %i/%i via yt-dlp: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) try: info = ydl.extract_info(url) except MaxVideosDownloaded: + self.dataset.log("Max videos for URL reached.") # Raised when already downloaded max number of videos per URL as defined in self.max_videos_per_url pass except ExistingVideoReached: + self.dataset.log("Already downloaded video associated with: %s" % url) # Video already downloaded; grab from archive # TODO: with multiple videos per URL, this may not capture the desired video and would instead repeat the first video; Need more feedback from yt-dlp! 
with yt_dlp.YoutubeDL({"socket_timeout": 30}) as ydl2: @@ -439,33 +429,36 @@ def dmi_match_filter(vid_info, *, incomplete): message = f"Video identified, but unable to identify which video from {url}" self.dataset.log(message) self.log.warning(message) - if self.videos_downloaded_from_url == 0: + if len(self.videos_downloaded_from_url) == 0: # No videos downloaded for this URL urls[url]['error'] = message continue except (DownloadError, LiveVideoException) as e: # LiveVideoException raised when a video is known to be live if "Requested format is not available" in str(e): - message = f"No format available for video (filesize less than {max_size}" + " and unknown sizes not allowed)" if not allow_unknown_sizes else ")" + self.dataset.log(f"Format Error: {str(e)}") + message = f"No format available for video (filesize less than {max_size}" + (" and unknown sizes not allowed)" if not allow_unknown_sizes else ")") elif "Unable to download webpage: The read operation timed out" in str(e): # Certain sites fail repeatedly (22-12-8 TikTok has this issue) - consecutive_timeouts += 1 message = 'DownloadError: %s' % str(e) + elif "Sign in to confirm you’re not a bot." in str(e): + # Youtube blocks YT-DLP; need to sign in + message = 'Sign in required: %s' % str(e) elif "HTTP Error 429: Too Many Requests" in str(e): # Oh no, https://github.com/yt-dlp/yt-dlp/wiki/FAQ#http-error-429-too-many-requests-or-402-payment-required - consecutive_timeouts += 1 message = 'Too Many Requests: %s' % str(e) - # We can try waiting... # TODO: Should we check to see if we are hitting the same domain repeatedly? - time.sleep(10 * consecutive_timeouts) # TODO: Add url back to end? 
else: message = 'DownloadError: %s' % str(e) + time.sleep(10 * consecutive_errors) + consecutive_errors += 1 urls[url]['error'] = message failed_downloads += 1 self.dataset.log(message) continue except Exception as e: + self.dataset.log(f"YT-DLP raised unexpected error: {str(e)}") # Catch all other issues w/ yt-dlp message = "YT-DLP raised unexpected error: %s" % str(e) urls[url]['error'] = message @@ -486,16 +479,16 @@ def dmi_match_filter(vid_info, *, incomplete): else: # No YT-DLP; move on - self.dataset.log(str(e)) + self.dataset.log(f"Unknown Request Error: {str(e)}") urls[url]["error"] = str(e) continue urls[url]["success"] = success if success: - consecutive_timeouts = 0 + consecutive_errors = 0 # Update status - self.downloaded_videos += self.videos_downloaded_from_url + self.downloaded_videos += len(self.videos_downloaded_from_url) self.dataset.update_status(f"Downloaded {self.downloaded_videos}/{self.total_possible_videos} videos" + (f"; videos copied from {copied_videos} previous downloads" if copied_videos > 0 else "") + (f"; {failed_downloads} URLs failed." if failed_downloads > 0 else "")) @@ -525,7 +518,7 @@ def yt_dlp_monitor(self, d): self.last_dl_status = d # Check if Max Video Downloads already reached - if self.videos_downloaded_from_url != 0 and self.videos_downloaded_from_url >= self.max_videos_per_url: + if len(self.videos_downloaded_from_url) != 0 and len(self.videos_downloaded_from_url) >= self.max_videos_per_url: # DO NOT RAISE ON 0! 
(22-12-8 max_videos_per_url should no longer ever be 0) raise MaxVideosDownloaded('Max videos for URL reached.') @@ -539,7 +532,7 @@ def yt_dlp_post_monitor(self, d): """ self.last_post_process_status = d if d['status'] == 'finished': # "downloading", "error", or "finished" - self.videos_downloaded_from_url += 1 + self.videos_downloaded_from_url.add(d.get('info_dict',{}).get('_filename')) self.url_files.append({ "filename": Path(d.get('info_dict').get('_filename')).name, "metadata": d.get('info_dict'), @@ -702,6 +695,8 @@ def copy_previous_video(self, previous_vid_metadata, staging_area, previous_down files = previous_vid_metadata.get('files') elif "filename" in previous_vid_metadata: files = [{"filename": previous_vid_metadata.get("filename"), "success": True}] + else: + raise FailedToCopy("Unable to read video metadata") if not files: raise FailedToCopy(f"No file found in metadata")