Skip to content

Commit 63df0ce

Browse files
committed
Add --skip-assets-on flag to skip release asset downloads (#135)
Allow users to skip downloading release assets for specific repositories while still backing up release metadata. Useful for starred repos with large assets (e.g. syncthing, with 27GB+ of assets).

Usage: --skip-assets-on repo1 repo2 owner/repo3

Features:
- Space-separated repos (consistent with --exclude)
- Case-insensitive matching
- Supports both repo name and owner/repo format
1 parent eb5779a commit 63df0ce

File tree

3 files changed

+403
-34
lines changed

3 files changed

+403
-34
lines changed

README.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ CLI Help output::
5050
[--keychain-name OSX_KEYCHAIN_ITEM_NAME]
5151
[--keychain-account OSX_KEYCHAIN_ITEM_ACCOUNT]
5252
[--releases] [--latest-releases NUMBER_OF_LATEST_RELEASES]
53-
[--skip-prerelease] [--assets] [--attachments]
54-
[--exclude [REPOSITORY [REPOSITORY ...]]
53+
[--skip-prerelease] [--assets] [--skip-assets-on [REPO ...]]
54+
[--attachments] [--exclude [REPOSITORY [REPOSITORY ...]]]
5555
[--throttle-limit THROTTLE_LIMIT] [--throttle-pause THROTTLE_PAUSE]
5656
USER
5757

@@ -133,6 +133,9 @@ CLI Help output::
133133
--skip-prerelease skip prerelease and draft versions; only applies if including releases
134134
--assets include assets alongside release information; only
135135
applies if including releases
136+
--skip-assets-on [REPO ...]
137+
skip asset downloads for these repositories (e.g.
138+
--skip-assets-on repo1 owner/repo2)
136139
--attachments download user-attachments from issues and pull requests
137140
to issues/attachments/{issue_number}/ and
138141
pulls/attachments/{pull_number}/ directories

github_backup/github_backup.py

Lines changed: 78 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,12 @@ def parse_args(args=None):
440440
dest="include_assets",
441441
help="include assets alongside release information; only applies if including releases",
442442
)
443+
parser.add_argument(
444+
"--skip-assets-on",
445+
dest="skip_assets_on",
446+
nargs="*",
447+
help="skip asset downloads for these repositories",
448+
)
443449
parser.add_argument(
444450
"--attachments",
445451
action="store_true",
@@ -561,7 +567,7 @@ def get_github_host(args):
561567

562568

563569
def read_file_contents(file_uri):
564-
return open(file_uri[len(FILE_URI_PREFIX):], "rt").readline().strip()
570+
return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip()
565571

566572

567573
def get_github_repo_url(args, repository):
@@ -631,7 +637,7 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False):
631637
pass
632638
raise RepositoryUnavailableError(
633639
"Repository unavailable due to legal reasons (HTTP 451)",
634-
dmca_url=dmca_url
640+
dmca_url=dmca_url,
635641
)
636642

637643
# Check if we got correct data
@@ -709,7 +715,7 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False):
709715
# Parse Link header: <https://api.github.com/...?per_page=100&after=cursor>; rel="next"
710716
for link in link_header.split(","):
711717
if 'rel="next"' in link:
712-
next_url = link[link.find("<") + 1:link.find(">")]
718+
next_url = link[link.find("<") + 1 : link.find(">")]
713719
break
714720
if not next_url:
715721
break
@@ -763,9 +769,7 @@ def _get_response(request, auth, template):
763769
return r, errors
764770

765771

766-
def _construct_request(
767-
per_page, query_args, template, auth, as_app=None, fine=False
768-
):
772+
def _construct_request(per_page, query_args, template, auth, as_app=None, fine=False):
769773
# If template is already a full URL with query params (from Link header), use it directly
770774
if "?" in template and template.startswith("http"):
771775
request_url = template
@@ -1480,9 +1484,11 @@ def download_attachments(
14801484
manifest = {
14811485
"issue_number": number,
14821486
"issue_type": item_type,
1483-
"repository": f"{args.user}/{args.repository}"
1484-
if hasattr(args, "repository") and args.repository
1485-
else args.user,
1487+
"repository": (
1488+
f"{args.user}/{args.repository}"
1489+
if hasattr(args, "repository") and args.repository
1490+
else args.user
1491+
),
14861492
"manifest_updated_at": datetime.now(timezone.utc).isoformat(),
14871493
"attachments": attachment_metadata_list,
14881494
}
@@ -1538,9 +1544,7 @@ def retrieve_repositories(args, authenticated_user):
15381544
else:
15391545
repo_path = "{0}/{1}".format(args.user, args.repository)
15401546
single_request = True
1541-
template = "https://{0}/repos/{1}".format(
1542-
get_github_api_host(args), repo_path
1543-
)
1547+
template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path)
15441548

15451549
repos = retrieve_data(args, template, single_request=single_request)
15461550

@@ -1565,7 +1569,10 @@ def retrieve_repositories(args, authenticated_user):
15651569
repos.extend(gists)
15661570

15671571
if args.include_starred_gists:
1568-
if not authenticated_user.get("login") or args.user.lower() != authenticated_user["login"].lower():
1572+
if (
1573+
not authenticated_user.get("login")
1574+
or args.user.lower() != authenticated_user["login"].lower()
1575+
):
15691576
logger.warning(
15701577
"Cannot retrieve starred gists for '%s'. GitHub only allows access to the authenticated user's starred gists.",
15711578
args.user,
@@ -1673,9 +1680,11 @@ def backup_repositories(args, output_directory, repositories):
16731680

16741681
include_gists = args.include_gists or args.include_starred_gists
16751682
include_starred = args.all_starred and repository.get("is_starred")
1676-
if (args.include_repository or args.include_everything) or (
1677-
include_gists and repository.get("is_gist")
1678-
) or include_starred:
1683+
if (
1684+
(args.include_repository or args.include_everything)
1685+
or (include_gists and repository.get("is_gist"))
1686+
or include_starred
1687+
):
16791688
repo_name = (
16801689
repository.get("name")
16811690
if not repository.get("is_gist")
@@ -1735,7 +1744,9 @@ def backup_repositories(args, output_directory, repositories):
17351744
include_assets=args.include_assets or args.include_everything,
17361745
)
17371746
except RepositoryUnavailableError as e:
1738-
logger.warning(f"Repository {repository['full_name']} is unavailable (HTTP 451)")
1747+
logger.warning(
1748+
f"Repository {repository['full_name']} is unavailable (HTTP 451)"
1749+
)
17391750
if e.dmca_url:
17401751
logger.warning(f"DMCA notice: {e.dmca_url}")
17411752
logger.info(f"Skipping remaining resources for {repository['full_name']}")
@@ -1795,7 +1806,11 @@ def backup_issues(args, repo_cwd, repository, repos_template):
17951806
modified = os.path.getmtime(issue_file)
17961807
modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
17971808
if modified > issue["updated_at"]:
1798-
logger.info("Skipping issue {0} because it wasn't modified since last backup".format(number))
1809+
logger.info(
1810+
"Skipping issue {0} because it wasn't modified since last backup".format(
1811+
number
1812+
)
1813+
)
17991814
continue
18001815

18011816
if args.include_issue_comments or args.include_everything:
@@ -1811,7 +1826,9 @@ def backup_issues(args, repo_cwd, repository, repos_template):
18111826

18121827
with codecs.open(issue_file + ".temp", "w", encoding="utf-8") as f:
18131828
json_dump(issue, f)
1814-
os.rename(issue_file + ".temp", issue_file) # Unlike json_dump, this is atomic
1829+
os.rename(
1830+
issue_file + ".temp", issue_file
1831+
) # Unlike json_dump, this is atomic
18151832

18161833

18171834
def backup_pulls(args, repo_cwd, repository, repos_template):
@@ -1869,7 +1886,11 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
18691886
modified = os.path.getmtime(pull_file)
18701887
modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
18711888
if modified > pull["updated_at"]:
1872-
logger.info("Skipping pull request {0} because it wasn't modified since last backup".format(number))
1889+
logger.info(
1890+
"Skipping pull request {0} because it wasn't modified since last backup".format(
1891+
number
1892+
)
1893+
)
18731894
continue
18741895
if args.include_pull_comments or args.include_everything:
18751896
template = comments_regular_template.format(number)
@@ -1886,7 +1907,9 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
18861907

18871908
with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f:
18881909
json_dump(pull, f)
1889-
os.rename(pull_file + ".temp", pull_file) # Unlike json_dump, this is atomic
1910+
os.rename(
1911+
pull_file + ".temp", pull_file
1912+
) # Unlike json_dump, this is atomic
18901913

18911914

18921915
def backup_milestones(args, repo_cwd, repository, repos_template):
@@ -1919,9 +1942,11 @@ def backup_milestones(args, repo_cwd, repository, repos_template):
19191942
elif written_count == 0:
19201943
logger.info("{0} milestones unchanged, skipped write".format(total))
19211944
else:
1922-
logger.info("Saved {0} of {1} milestones to disk ({2} unchanged)".format(
1923-
written_count, total, total - written_count
1924-
))
1945+
logger.info(
1946+
"Saved {0} of {1} milestones to disk ({2} unchanged)".format(
1947+
written_count, total, total - written_count
1948+
)
1949+
)
19251950

19261951

19271952
def backup_labels(args, repo_cwd, repository, repos_template):
@@ -1975,6 +2000,20 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
19752000
)
19762001
releases = releases[: args.number_of_latest_releases]
19772002

2003+
# Check if this repo should skip asset downloads (case-insensitive)
2004+
skip_assets = False
2005+
if include_assets:
2006+
repo_name = repository.get("name", "").lower()
2007+
repo_full_name = repository.get("full_name", "").lower()
2008+
skip_repos = [r.lower() for r in (args.skip_assets_on or [])]
2009+
skip_assets = repo_name in skip_repos or repo_full_name in skip_repos
2010+
if skip_assets:
2011+
logger.info(
2012+
"Skipping assets for {0} ({1} releases) due to --skip-assets-on".format(
2013+
repository.get("name"), len(releases)
2014+
)
2015+
)
2016+
19782017
# for each release, store it
19792018
written_count = 0
19802019
for release in releases:
@@ -1986,7 +2025,7 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
19862025
if json_dump_if_changed(release, output_filepath):
19872026
written_count += 1
19882027

1989-
if include_assets:
2028+
if include_assets and not skip_assets:
19902029
assets = retrieve_data(args, release["assets_url"])
19912030
if len(assets) > 0:
19922031
# give release asset files somewhere to live & download them (not including source archives)
@@ -2008,9 +2047,11 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
20082047
elif written_count == 0:
20092048
logger.info("{0} releases unchanged, skipped write".format(total))
20102049
else:
2011-
logger.info("Saved {0} of {1} releases to disk ({2} unchanged)".format(
2012-
written_count, total, total - written_count
2013-
))
2050+
logger.info(
2051+
"Saved {0} of {1} releases to disk ({2} unchanged)".format(
2052+
written_count, total, total - written_count
2053+
)
2054+
)
20142055

20152056

20162057
def fetch_repository(
@@ -2024,9 +2065,12 @@ def fetch_repository(
20242065
):
20252066
if bare_clone:
20262067
if os.path.exists(local_dir):
2027-
clone_exists = subprocess.check_output(
2028-
["git", "rev-parse", "--is-bare-repository"], cwd=local_dir
2029-
) == b"true\n"
2068+
clone_exists = (
2069+
subprocess.check_output(
2070+
["git", "rev-parse", "--is-bare-repository"], cwd=local_dir
2071+
)
2072+
== b"true\n"
2073+
)
20302074
else:
20312075
clone_exists = False
20322076
else:
@@ -2047,7 +2091,9 @@ def fetch_repository(
20472091
)
20482092
else:
20492093
logger.info(
2050-
"Skipping {0} (repository not accessible - may be empty, private, or credentials invalid)".format(name)
2094+
"Skipping {0} (repository not accessible - may be empty, private, or credentials invalid)".format(
2095+
name
2096+
)
20512097
)
20522098
return
20532099

0 commit comments

Comments
 (0)