#!/usr/bin/env python3
"""
Check downloaded page images for obvious failures (loading spinners, blank/black).
Flags pages that are unusually small, very dark, or very low variance (spinner/blank).
Suggest re-downloading any suspect pages with the browser script.

Usage:
  python check_pages.py --pages-dir love-and-limerence-pages
  python check_pages.py --pages-dir love-and-limerence-pages --expected 346 --id lovelimerence00tenn
"""
import argparse
import re
import sys
from pathlib import Path
from typing import Tuple
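
# Pillow is optional; without it only the file-size check below is performed.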
try:
    from PIL import Image
    HAS_PIL = True
except ImportError:
    HAS_PIL = False

# Heuristics: flag if below these
MIN_FILE_SIZE = 20_000  # bytes (loading/blank often < 20KB)
MIN_MEAN_BRIGHTNESS = 30  # 0-255; black screen is ~0
MIN_VARIANCE = 100  # pixel variance; spinner/blank is low


def natural_sort_key(p: Path) -> Tuple[int, ...]:
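    """Sort key: the trailing number in the file stem, so page_2 sorts before page_10."""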
    m = re.search(r"(\d+)\s*$", p.stem)
    return (int(m.group(1)),) if m else (0,)


def check_image(path: Path) -> Tuple[bool, str]:
"""Returns (ok, reason)."""
if not path.exists():
return False, "missing"
size = path.stat().st_size
if size < MIN_FILE_SIZE:
return False, f"tiny ({size} bytes)"
if not HAS_PIL:
return True, "ok (Pillow not installed, only size checked)"
    try:
        img = Image.open(path)
        img = img.convert("L")  # grayscale
        pixels = list(img.getdata())
        n = len(pixels)
        if n == 0:
            return False, "empty image"
        mean = sum(pixels) / n
        variance = sum((x - mean) ** 2 for x in pixels) / n
        if mean < MIN_MEAN_BRIGHTNESS:
            return False, f"too dark (mean={mean:.0f})"
        if variance < MIN_VARIANCE:
            return False, f"too flat/spinner? (variance={variance:.0f})"
        return True, "ok"
    except Exception as e:
        return False, str(e)


def main():
    ap = argparse.ArgumentParser(description="Check page images for spinners/blank.")
    ap.add_argument("--pages-dir", type=Path, default=Path("love-and-limerence-pages"), help="Directory with page_*.png")
    ap.add_argument("--expected", type=int, default=None, help="Expected number of pages (report if count differs)")
    ap.add_argument("--id", default="lovelimerence00tenn", help="Book ID (for suggested re-download command)")
    args = ap.parse_args()

    pages_dir = args.pages_dir.resolve()
    if not pages_dir.is_dir():
        ap.error(f"Directory not found: {pages_dir}")

    images = sorted(pages_dir.glob("page_*.png"), key=natural_sort_key)
    if not images:
        images = sorted(pages_dir.glob("page_*.jpg"), key=natural_sort_key)
    if not images:
        print("No page_*.png or page_*.jpg found.", file=sys.stderr)
        sys.exit(1)

    if args.expected and len(images) != args.expected:
        print(f"Count: {len(images)} (expected {args.expected})", file=sys.stderr)

    suspect = []
    for p in images:
        ok, reason = check_image(p)
        if not ok:
            m = re.search(r"(\d+)\s*$", p.stem)
            num = int(m.group(1)) if m else 0
            suspect.append((num, p.name, reason))
            print(f"SUSPECT {p.name}: {reason}")
        else:
            print(f"  ok {p.name}")

    if suspect:
        nums = sorted(n for n, _, _ in suspect)
        print("\n--- Re-download suspect pages only (good pages are skipped) ---")
        print(
            f"python3.10 extract_archive_pages_browser.py --id {args.id} "
            f"--start {min(nums)} --end {max(nums)} -o {pages_dir} --cookies cookies.txt --headless --validate-existing"
        )
        print("(Uses --validate-existing so only missing or bad pages are downloaded.)")
        sys.exit(1)

    print("\nAll pages look OK.")
    sys.exit(0)


if __name__ == "__main__":
    main()