
Commit ebc16e1

Refactor: improved broken link checker (better parsing, whitelist, safe exit)

1 parent cc78e2f

File tree

1 file changed: +81 additions, -46 deletions

scripts/check-broken-links-md.py

Lines changed: 81 additions & 46 deletions

@@ -1,65 +1,100 @@
-"""Check if ther eis any broken links."""
-
-from __future__ import annotations
-
-import os
 import subprocess
+import sys
+import re
+import json
+from pathlib import Path
+from typing import List, Tuple
 
-# List of exception URLs
-exception_urls = [
-    "https://www.linkedin.com/",
-]
+# ==== BASIC SETTINGS ====
+IGNORE_URLS = ["https://www.linkedin.com/"]  # URLs to skip
+RETRY_STATUS = 429  # Too many requests
+SCAN_FOLDER = "pages"  # default folder to check
+HTTP_MODE = "get"  # request type
 
 
-def process_log() -> None:
-    """Run the command and capture the output."""
-    log_err = ""
-    exitcode = 0
+# ==== HELPERS ====
+def load_whitelist(path: str) -> List[str]:
+    """Load extra ignored links from a JSON file (if available)."""
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError):
+        print("[ii] Using default ignore list.")
+        return IGNORE_URLS
 
+
+def run_checker(folder: str, method: str) -> Tuple[int, str]:
+    """Run linkcheckmd and capture what it says."""
     try:
-        subprocess.run(
-            ["python", "-m", "linkcheckmd", "-r", "-v", "-m", "get", "pages"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+        task = subprocess.run(
+            ["python", "-m", "linkcheckmd", "-r", "-v", "-m", method, folder],
+            capture_output=True,
             text=True,
             check=True,
         )
+        return task.returncode, task.stdout
     except subprocess.CalledProcessError as e:
-        exitcode = e.returncode
-        # for some reason they were swapped
-        log_err = e.stdout
+        return e.returncode, e.stdout + e.stderr
 
-    if exitcode == 0:
-        print("[II] All links are ok.")
-        return
 
-    flagged_errors = []
-    for line in log_err.splitlines():
-        line = line.strip()  # noqa: PLW2901
-        # Check if the line starts with '('
-        if not line.startswith("("):
-            continue
+def extract_bad_links(log: str) -> List[Tuple[str, str, int]]:
+    """
+    Pull out (file, url, status) from linkcheckmd logs.
+    """
+    regex = re.compile(r"\(([^)]+)\)\s+(https?://[^\s]+)\s+\[status:(\d+)\]")
+    found = regex.findall(log)
+    return [(f, u, int(s)) for f, u, s in found]
+
 
-        if line.endswith("429)"):
-            # Too Many Requests http error
+def skip_allowed(links: List[Tuple[str, str, int]], ignore_list: List[str]):
+    """Remove whitelisted or rate-limited links."""
+    final = []
+    for file, url, code in links:
+        if any(skip in url for skip in ignore_list):
             continue
-        # Extract the URL using regex
-        for exception_url in exception_urls:
-            if exception_url not in line:
-                flagged_errors.append(line)
-
-    # Print flagged errors
-    if not flagged_errors:
-        print("[II] All links are ok.")
-        print("No errors flagged. All URLs are in the exception list.")
+        if code == RETRY_STATUS:
+            continue
+        final.append((file, url, code))
+    return final
+
+
+def print_report(bad_links: List[Tuple[str, str, int]]):
+    """Display summary in a friendlier way."""
+    if not bad_links:
+        print(" Everything looks good! No broken links 🎉")
+        return
+
+    print(f"- In: {len(bad_links)} broken link(s):\n") if False else print(f"[!!] Found {len(bad_links)} broken link(s):\n")
+    for file, url, code in bad_links:
+        print(f"- In: {file}\n{url}\n (Status: {code})\n")
+    sys.exit(1)
+
+
+# MAIN WORK
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Simple Markdown link checker.")
+    parser.add_argument("-d", "--dir", default=SCAN_FOLDER, help="Folder to check.")
+    parser.add_argument(
+        "-e", "--exceptions", default="", help="Path to JSON file of links to skip."
+    )
+    parser.add_argument(
+        "-m", "--method", default=HTTP_MODE, help="HTTP method (default: get)."
+    )
+    args = parser.parse_args()
+
+    whitelist = load_whitelist(args.exceptions)
+    code, output = run_checker(args.dir, args.method)
+
+    if code == 0:
+        print(" All clear! No broken links reported.")
         return
 
-    print("Errors flagged for the following URLs:")
-    for line in flagged_errors:
-        print(line)
-    os._exit(1)
+    bad_links = extract_bad_links(output)
+    final_list = skip_allowed(bad_links, whitelist)
+    print_report(final_list)
 
 
-# Run the script
 if __name__ == "__main__":
-    process_log()
+    main()
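The new parsing hinges on extract_bad_links() matching log lines of the shape "(file) url [status:NNN]"; that shape is an assumption read off the committed regex, not verified against linkcheckmd's actual output. A minimal sketch with an invented log line, showing what the pattern captures:

    import re

    # Same pattern the commit adds in extract_bad_links().
    regex = re.compile(r"\(([^)]+)\)\s+(https?://[^\s]+)\s+\[status:(\d+)\]")

    # Hypothetical log line; the file path, URL, and status code are invented
    # purely to illustrate the assumed "(file) url [status:NNN]" shape.
    sample = "(pages/contact.md) https://example.com/gone [status:404]"

    matches = regex.findall(sample)
    print(matches)  # [('pages/contact.md', 'https://example.com/gone', '404')]

    # The script then coerces the status to int before filtering:
    print([(f, u, int(s)) for f, u, s in matches])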

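On the "safe exit" part of the commit message: print_report() uses sys.exit(1), which raises SystemExit so atexit handlers and buffered output still run, unlike the old os._exit(1), which kills the interpreter immediately. A usage sketch of the refactored script follows; the ignore.json filename and its second URL are hypothetical, while the -d/-e/-m flags come straight from the argparse setup in the diff:

    import json
    import subprocess

    # Hypothetical whitelist file: load_whitelist() accepts any JSON list
    # of URL substrings to skip.
    with open("ignore.json", "w", encoding="utf-8") as f:
        json.dump(["https://www.linkedin.com/", "https://example.org/"], f)

    # Flags as defined in main(): -d folder, -e whitelist path, -m HTTP method.
    result = subprocess.run(
        [
            "python",
            "scripts/check-broken-links-md.py",
            "-d", "pages",
            "-e", "ignore.json",
            "-m", "get",
        ],
        check=False,  # the script itself exits 1 when broken links remain
    )
    print(result.returncode)  # 0 if clean, 1 if print_report() flagged links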