Skip to content

Commit

Permalink
vid download improvements/fixes: stop on any 4 consecutive errors; error message for filesize too large, fix counting yt_dlp downloads, wait time on additional error types
Browse files Browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Dec 19, 2024
1 parent d0be5c6 commit d1d9347
Showing 1 changed file with 35 additions and 40 deletions.
75 changes: 35 additions & 40 deletions processors/visualisation/download_videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ class VideoDownloaderPlus(BasicProcessor):
"columns": {
"type": UserInput.OPTION_TEXT,
"help": "Column to get video links from",
"default": "video_url",
"inline": True,
"tooltip": "If the column contains a single URL, use that URL; else, try to find image URLs in the "
"column's content"
Expand Down Expand Up @@ -145,7 +144,7 @@ class VideoDownloaderPlus(BasicProcessor):
def __init__(self, logger, job, queue=None, manager=None, modules=None):
super().__init__(logger, job, queue, manager, modules)
self.max_videos_per_url = 1
self.videos_downloaded_from_url = 0
self.videos_downloaded_from_url = None
self.downloaded_videos = 0
self.total_possible_videos = 5
self.url_files = None
Expand All @@ -167,7 +166,7 @@ def get_options(cls, parent_dataset=None, user=None):
options["amount"]["tooltip"] = "Use 0 to download all videos"
else:
options['amount']['max'] = max_number_videos
options['amount']['help'] = f"No. of videos (max {max_number_videos:,}"
options['amount']['help'] = f"No. of videos (max {max_number_videos:,})"

# And update the max size and help from config
max_video_size = int(config.get('video-downloader.max-size', 100, user=user))
Expand All @@ -190,19 +189,9 @@ def get_options(cls, parent_dataset=None, user=None):
options["columns"]["options"] = {v: v for v in columns}

# Figure out default column
if "video_url" in columns:
options["columns"]["default"] = "video_url"
elif "media_urls" in columns:
options["columns"]["default"] = "media_urls"
elif "video" in "".join(columns):
# Grab first video column
options["columns"]["default"] = sorted(columns, key=lambda k: "video" in k).pop()
elif "url" in columns or "urls" in columns:
# Grab first url column
options["columns"]["default"] = sorted(columns, key=lambda k: "url" in k).pop()
else:
# Give up
options["columns"]["default"] = "body"
priority = ["video_url", "video_link", "media_url", "media_link", "final_url", "url", "link", "video", "body"]
columns.sort(key=lambda col: next((i for i, p in enumerate(priority) if p in col.lower()), len(priority)))
options["columns"]["default"] = [columns.pop(0)]

# these two options are likely to be unwanted on instances with many
# users, so they are behind an admin config options
Expand Down Expand Up @@ -325,7 +314,7 @@ def dmi_match_filter(vid_info, *, incomplete):
self.downloaded_videos = 0
failed_downloads = 0
copied_videos = 0
consecutive_timeouts = 0
consecutive_errors = 0
self.total_possible_videos = min(len(urls), amount) if amount != 0 else len(urls)
yt_dlp_archive_map = {}
for url in urls:
Expand All @@ -342,7 +331,7 @@ def dmi_match_filter(vid_info, *, incomplete):
continue
except FailedToCopy as e:
# Unable to copy
self.dataset.log(str(e)+"; attempting to download again")
self.dataset.log(f"{str(e)}; attempting to download again")
elif previous_vid_metadata.get("retry", True) is False:
urls[url] = previous_vid_metadata
self.dataset.log(f"Skipping; previously identified url as not a video: {url}")
Expand All @@ -359,13 +348,13 @@ def dmi_match_filter(vid_info, *, incomplete):
raise ProcessorInterruptedException("Interrupted while downloading videos.")

# Check for repeated timeouts
if consecutive_timeouts > 5:
if consecutive_errors >= 5:
if use_yt_dlp:
message = "Downloaded %i videos. Timed out %i consecutive times; try " \
"deselecting the non-direct videos setting" % (self.downloaded_videos, consecutive_timeouts)
message = "Downloaded %i videos. Errors %i consecutive times; try " \
"deselecting the non-direct videos setting" % (self.downloaded_videos, consecutive_errors)
else:
message = "Downloaded %i videos. Timed out %i consecutive times; check logs to ensure " \
"video URLs are working links and you are not being blocked." % (self.downloaded_videos, consecutive_timeouts)
message = "Downloaded %i videos. Errors %i consecutive times; check logs to ensure " \
"video URLs are working links and you are not being blocked." % (self.downloaded_videos, consecutive_errors)
self.dataset.update_status(message, is_final=True)
if self.downloaded_videos == 0:
self.dataset.finish(0)
Expand All @@ -382,7 +371,7 @@ def dmi_match_filter(vid_info, *, incomplete):
self.dataset.log(message)
continue

self.videos_downloaded_from_url = 0
self.videos_downloaded_from_url = set()
# First we'll try to see if we can directly download the URL
try:
filename = self.download_video_with_requests(url, results_path, max_video_size)
Expand All @@ -393,16 +382,16 @@ def dmi_match_filter(vid_info, *, incomplete):
"success": True
}]
success = True
self.videos_downloaded_from_url = 1
self.videos_downloaded_from_url.add(filename)
except (requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects, FilesizeException, FailedDownload, NotAVideo) as e:
# FilesizeException raised when file size is too large or unknown filesize (and that is disabled in 4CAT settings)
# FailedDownload raised when response other than 200 received
# NotAVideo raised due to specific Content-Type known to not be a video (and not a webpage/html that could lead to a video via YT-DLP)
self.dataset.log(str(e))
self.dataset.log(f"Request Error: {str(e)}")
urls[url]["error"] = str(e)
if type(e) is requests.exceptions.Timeout:
# TODO: retry timeouts?
consecutive_timeouts += 1
consecutive_errors += 1
if type(e) in [requests.exceptions.Timeout, requests.exceptions.SSLError, requests.exceptions.ConnectionError, FilesizeException, FailedDownload]:
failed_downloads += 1
if type(e) in [NotAVideo]:
Expand All @@ -420,14 +409,15 @@ def dmi_match_filter(vid_info, *, incomplete):
self.url_files = []
self.last_dl_status = {}
self.last_post_process_status = {}
self.dataset.update_status("Downloading %i/%i via yt-dlp: %s" % (
self.downloaded_videos + 1, self.total_possible_videos, url))
self.dataset.update_status("Downloading %i/%i via yt-dlp: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url))
try:
info = ydl.extract_info(url)
except MaxVideosDownloaded:
self.dataset.log("Max videos for URL reached.")
# Raised when already downloaded max number of videos per URL as defined in self.max_videos_per_url
pass
except ExistingVideoReached:
self.dataset.log("Already downloaded video associated with: %s" % url)
# Video already downloaded; grab from archive
# TODO: with multiple videos per URL, this may not capture the desired video and would instead repeat the first video; Need more feedback from yt-dlp!
with yt_dlp.YoutubeDL({"socket_timeout": 30}) as ydl2:
Expand All @@ -439,33 +429,36 @@ def dmi_match_filter(vid_info, *, incomplete):
message = f"Video identified, but unable to identify which video from {url}"
self.dataset.log(message)
self.log.warning(message)
if self.videos_downloaded_from_url == 0:
if len(self.videos_downloaded_from_url) == 0:
# No videos downloaded for this URL
urls[url]['error'] = message
continue
except (DownloadError, LiveVideoException) as e:
# LiveVideoException raised when a video is known to be live
if "Requested format is not available" in str(e):
message = f"No format available for video (filesize less than {max_size}" + " and unknown sizes not allowed)" if not allow_unknown_sizes else ")"
self.dataset.log(f"Format Error: {str(e)}")
message = f"No format available for video (filesize less than {max_size}" + (" and unknown sizes not allowed)" if not allow_unknown_sizes else ")")
elif "Unable to download webpage: The read operation timed out" in str(e):
# Certain sites fail repeatedly (22-12-8 TikTok has this issue)
consecutive_timeouts += 1
message = 'DownloadError: %s' % str(e)
elif "Sign in to confirm you’re not a bot." in str(e):
# Youtube blocks YT-DLP; need to sign in
message = 'Sign in required: %s' % str(e)
elif "HTTP Error 429: Too Many Requests" in str(e):
# Oh no, https://github.com/yt-dlp/yt-dlp/wiki/FAQ#http-error-429-too-many-requests-or-402-payment-required
consecutive_timeouts += 1
message = 'Too Many Requests: %s' % str(e)
# We can try waiting...
# TODO: Should we check to see if we are hitting the same domain repeatedly?
time.sleep(10 * consecutive_timeouts)
# TODO: Add url back to end?
else:
message = 'DownloadError: %s' % str(e)
time.sleep(10 * consecutive_errors)
consecutive_errors += 1
urls[url]['error'] = message
failed_downloads += 1
self.dataset.log(message)
continue
except Exception as e:
self.dataset.log(f"YT-DLP raised unexpected error: {str(e)}")
# Catch all other issues w/ yt-dlp
message = "YT-DLP raised unexpected error: %s" % str(e)
urls[url]['error'] = message
Expand All @@ -486,16 +479,16 @@ def dmi_match_filter(vid_info, *, incomplete):

else:
# No YT-DLP; move on
self.dataset.log(str(e))
self.dataset.log(f"Unknown Request Error: {str(e)}")
urls[url]["error"] = str(e)
continue

urls[url]["success"] = success
if success:
consecutive_timeouts = 0
consecutive_errors = 0

# Update status
self.downloaded_videos += self.videos_downloaded_from_url
self.downloaded_videos += len(self.videos_downloaded_from_url)
self.dataset.update_status(f"Downloaded {self.downloaded_videos}/{self.total_possible_videos} videos" +
(f"; videos copied from {copied_videos} previous downloads" if copied_videos > 0 else "") +
(f"; {failed_downloads} URLs failed." if failed_downloads > 0 else ""))
Expand Down Expand Up @@ -525,7 +518,7 @@ def yt_dlp_monitor(self, d):
self.last_dl_status = d

# Check if Max Video Downloads already reached
if self.videos_downloaded_from_url != 0 and self.videos_downloaded_from_url >= self.max_videos_per_url:
if len(self.videos_downloaded_from_url) != 0 and len(self.videos_downloaded_from_url) >= self.max_videos_per_url:
# DO NOT RAISE ON 0! (22-12-8 max_videos_per_url should no longer ever be 0)
raise MaxVideosDownloaded('Max videos for URL reached.')

Expand All @@ -539,7 +532,7 @@ def yt_dlp_post_monitor(self, d):
"""
self.last_post_process_status = d
if d['status'] == 'finished': # "downloading", "error", or "finished"
self.videos_downloaded_from_url += 1
self.videos_downloaded_from_url.add(d.get('info_dict',{}).get('_filename', {}))
self.url_files.append({
"filename": Path(d.get('info_dict').get('_filename')).name,
"metadata": d.get('info_dict'),
Expand Down Expand Up @@ -702,6 +695,8 @@ def copy_previous_video(self, previous_vid_metadata, staging_area, previous_down
files = previous_vid_metadata.get('files')
elif "filename" in previous_vid_metadata:
files = [{"filename": previous_vid_metadata.get("filename"), "success": True}]
else:
raise FailedToCopy(f"Unable to read video metadata")

if not files:
raise FailedToCopy(f"No file found in metadata")
Expand Down

0 comments on commit d1d9347

Please sign in to comment.