extract_archive_pages_browser.py
#!/usr/bin/env python3
"""
Download page images using a real browser (Playwright) and your cookies.

Use this when the requests-based script gets 403 (Archive.org often blocks scripted
downloads). Supports parallel downloads (multiple browser pages at once) for speed.

Requires: pip install playwright && playwright install chromium

Usage:
    python extract_archive_pages_browser.py --cookies cookies.txt -o stand-on-zanzibar-pages
    python extract_archive_pages_browser.py --cookies cookies.txt --end 674 -o out --workers 6
    python extract_archive_pages_browser.py ... --force              # re-download even if the page file exists
    python extract_archive_pages_browser.py ... --validate-existing  # re-download only missing or bad pages (skip good ones)

The total page count is auto-detected from the book when --end is omitted. Use --id to
set the Archive.org identifier.

By default, pages that already exist on disk are skipped. Use --force to re-download
everything, or --validate-existing to re-download only pages that are missing or fail
validation (spinner/blank captures).

The browser will open; make sure the book is borrowed in that session (or that the
cookies are fresh).
"""
import argparse
import asyncio
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple

try:
    from playwright.async_api import async_playwright
except ImportError:
    print("Install Playwright: pip install playwright && playwright install chromium", file=sys.stderr)
    sys.exit(1)

DEFAULT_IDENTIFIER = "standonzanzibar0000unse"
DEFAULT_PAGE_COUNT = 674

# Embed URL with hash: BookReader reads #page/n{index}/mode/1up on load (index is 0-based)
BASE_READER_URL = "https://archive.org/embed/{identifier}#page/n{index}/mode/1up"
# Details page (fallback for page-count detection if the embed doesn't show the indicator)
DETAILS_READER_URL = "https://archive.org/details/{identifier}/page/n1/mode/1up"

# Regexes to extract the total page count from the reader UI: "(1/346)" or "1 of 346"
PAGE_INDICATOR_RE = re.compile(r"\(\d+/(\d+)\)")
PAGE_INDICATOR_ALT_RE = re.compile(r"(\d+)\s+of\s+(\d+)")

# Validation: same heuristics as check_pages.py (reject spinners/blank captures)
MIN_CAPTURE_SIZE = 20_000  # bytes
MIN_MEAN_BRIGHTNESS = 30
MIN_VARIANCE = 100

try:
    from PIL import Image
    _HAS_PIL = True
except ImportError:
    _HAS_PIL = False
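
# Note: without Pillow only the size check in validate_capture() below runs; installing it
# ("pip install pillow") enables the brightness/variance checks as well.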


def validate_capture(path: Path) -> Tuple[bool, str]:
    """Return (True, '') if the image looks valid; (False, reason) if it is a spinner/blank."""
    if not path.exists():
        return False, "missing"
    size = path.stat().st_size
    if size < MIN_CAPTURE_SIZE:
        return False, f"tiny ({size} bytes)"
    if not _HAS_PIL:
        return True, ""
    try:
        img = Image.open(path)
        img = img.convert("L")
        pixels = list(img.getdata())
        n = len(pixels)
        if n == 0:
            return False, "empty"
        mean = sum(pixels) / n
        variance = sum((x - mean) ** 2 for x in pixels) / n
        if mean < MIN_MEAN_BRIGHTNESS:
            return False, f"too dark ({mean:.0f})"
        if variance < MIN_VARIANCE:
            return False, f"too flat ({variance:.0f})"
        return True, ""
    except Exception as e:
        return False, str(e)
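
# Illustrative spot-check of an existing capture from a Python shell (path and result
# are examples, assuming the default output directory and a good page):
#   >>> validate_capture(Path("stand-on-zanzibar-pages/page_0001.png"))
#   (True, '')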


def parse_netscape_cookies(path: Path) -> List[dict]:
    """Convert a Netscape cookies.txt file to a Playwright cookie list."""
    # Playwright wants: name, value, domain, path, expires (unix), httpOnly?, secure?, sameSite?
    out = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) < 7:
                continue
            domain, _include_sub, path_str, secure, expires_str, name, value = parts[:7]
            if "archive.org" not in domain:
                continue
            try:
                expires = int(expires_str) if expires_str != "0" else -1
            except ValueError:
                expires = -1
            out.append({
                "name": name,
                "value": value,
                "domain": domain if domain.startswith(".") else "." + domain.lstrip("."),
                "path": path_str or "/",
                "expires": expires,
                "secure": secure.upper() == "TRUE",
                "httpOnly": False,
                "sameSite": "Lax",
            })
    return out
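
# For reference, a Netscape cookies.txt row has seven tab-separated fields:
#   domain  includeSubdomains  path  secure  expiry  name  value
# Illustrative row (placeholder values, not a real session cookie):
#   .archive.org  TRUE  /  TRUE  1767225600  logged-in-user  name%40example.com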


async def get_page_count_from_reader(
    context,
    identifier: str,
    timeout_ms: int = 30000,
    content_wait_ms: int = 15000,
) -> Optional[int]:
    """Open the reader at page 1, wait for content and the page indicator, and parse the total.

    Returns the total page count, or None if detection fails.
    """
    detected: Optional[int] = None
    page = await context.new_page()
    try:
        url = BASE_READER_URL.format(identifier=identifier, index=0)  # page 1 = index 0
        await page.goto(url, wait_until="load", timeout=timeout_ms)
        await page.wait_for_timeout(2000)  # let the embed apply the hash and render
        await page.wait_for_selector(
            "#BookReader canvas, #BookReader img[src]",
            state="visible",
            timeout=content_wait_ms,
        )
        # Try the BookReader API first if available (the embed exposes BR)
        try:
            total = await page.evaluate(
                """() => {
                    const br = window.BR || window.BookReader;
                    if (br && typeof br.getNumPages === 'function') return br.getNumPages();
                    if (br && br.options && typeof br.options.getNumPages === 'function') return br.options.getNumPages();
                    return null;
                }"""
            )
            if total is not None and total >= 1:
                detected = int(total)
        except Exception:
            pass
        if detected is None:
            # Wait for the reader to show the page indicator (1/N) or "of N" (it can appear after the content loads)
            for _ in range(20):
                await page.wait_for_timeout(1000)
                text = await page.evaluate("() => document.body.innerText")
                m = PAGE_INDICATOR_RE.search(text)
                if m:
                    detected = int(m.group(1))
                else:
                    m = PAGE_INDICATOR_ALT_RE.search(text)
                    if m:
                        detected = int(m.group(2))  # "1 of 344" -> 344
                if detected is not None and detected >= 1:
                    break
    except Exception:
        pass
    finally:
        await page.close()

    if detected is not None:
        return detected

    # Fallback: try the details page (the indicator is often visible there)
    page2 = await context.new_page()
    try:
        url = DETAILS_READER_URL.format(identifier=identifier)
        await page2.goto(url, wait_until="load", timeout=timeout_ms)
        await page2.wait_for_timeout(3000)
        for _ in range(15):
            text = await page2.evaluate("() => document.body.innerText")
            m = PAGE_INDICATOR_RE.search(text)
            if m:
                detected = int(m.group(1))
            else:
                m = PAGE_INDICATOR_ALT_RE.search(text)
                if m:
                    detected = int(m.group(2))
            if detected is not None and detected >= 1:
                return detected
            await page2.wait_for_timeout(1000)
    except Exception:
        pass
    finally:
        await page2.close()
    return None


def capture_page(
    page,
    identifier: str,
    page_num: int,
    out_path: Path,
    timeout_ms: int,
    slow_ms: int,
    retries: int,
) -> bool:
    """Sync version (unused when running async; kept for compatibility)."""
    url = BASE_READER_URL.format(identifier=identifier, index=page_num - 1)
    for attempt in range(retries + 1):
        try:
            page.goto(url, wait_until="load", timeout=timeout_ms)
            page.wait_for_timeout(slow_ms)
            selector = "#BookReader"
            if page.locator(selector).count() > 0:
                page.locator(selector).first.screenshot(path=str(out_path))
            else:
                page.screenshot(path=str(out_path))
            return True
        except Exception:
            if attempt < retries:
                page.wait_for_timeout(2000)
    return False


async def capture_page_async(
    page,
    identifier: str,
    page_num: int,
    out_path: Path,
    timeout_ms: int,
    slow_ms: int,
    retries: int,
    content_wait_ms: int = 15000,
    validate: bool = True,
) -> bool:
    """Async: load one reader page, wait for content (no spinner), screenshot, save. Returns True on success."""
    # The index is 0-based for the embed hash (page 1 -> n0)
    url = BASE_READER_URL.format(identifier=identifier, index=page_num - 1)
    last_error = None
    for attempt in range(retries + 1):
        try:
            await page.goto(url, wait_until="load", timeout=timeout_ms)
            await page.wait_for_timeout(slow_ms)
            # Give the reader time to apply the hash and render the correct page (the embed reads #page/nX on load)
            await page.wait_for_timeout(2000)
            # Wait for actual book content (canvas or img) so we don't capture loading spinners
            try:
                await page.wait_for_selector(
                    "#BookReader canvas, #BookReader img[src]",
                    state="visible",
                    timeout=content_wait_ms,
                )
            except Exception as e:
                raise RuntimeError(f"Book content did not load (spinner or blank): {e}") from e
            await page.wait_for_timeout(500)  # brief settle
            # IA BookReader often ignores the URL and stays on page 1; call its API to jump to the requested page
            try:
                jumped = await page.evaluate(
                    """(pageIndex) => {
                        const br = window.BR || window.BookReader;
                        if (!br) return false;
                        if (typeof br.jumpToPage === 'function') { br.jumpToPage(pageIndex); return true; }
                        if (typeof br.goToPage === 'function') { br.goToPage(pageIndex); return true; }
                        return false;
                    }""",
                    page_num - 1,  # 0-based index
                )
                if not jumped:
                    # The reader may live in an embed iframe on the details page
                    embed_frame = page.frame(url=re.compile(r"/embed/"))
                    if embed_frame:
                        try:
                            jumped = await embed_frame.evaluate(
                                """(pageIndex) => {
                                    const br = window.BR || window.BookReader;
                                    if (br && typeof br.jumpToPage === 'function') {
                                        br.jumpToPage(pageIndex);
                                        return true;
                                    }
                                    return false;
                                }""",
                                page_num - 1,
                            )
                        except Exception:
                            pass
                if jumped:
                    await page.wait_for_timeout(1500)  # let the reader flip the page
            except Exception:
                pass
            # Wait for the reader to show the requested page (it updates (current/total) after load)
            page_indicator_ok = False
            for _ in range(25):
                try:
                    on_right_page = await page.evaluate(
                        """(expected) => document.body.innerText.includes('(' + expected + '/')""",
                        page_num,
                    )
                    if on_right_page:
                        page_indicator_ok = True
                        break
                except Exception:
                    pass
                await page.wait_for_timeout(500)
            if not page_indicator_ok:
                # Don't block: capture anyway; validation will retry if the image is wrong/blank
                print(f"Note: page {page_num} indicator not seen, capturing anyway", file=sys.stderr)
            selector = "#BookReader"
            if await page.locator(selector).count() > 0:
                await page.locator(selector).first.screenshot(path=str(out_path))
            else:
                await page.screenshot(path=str(out_path))
            # Validate: reject spinners/blank captures; retry if bad
            if validate:
                ok_validate, reason = validate_capture(out_path)
                if not ok_validate:
                    out_path.unlink(missing_ok=True)
                    raise RuntimeError(f"Capture failed validation: {reason}")
            return True
        except Exception as e:
            last_error = e
            if attempt < retries:
                await page.wait_for_timeout(2000)
    if last_error:
        print(f"Error page {page_num}: {last_error}", file=sys.stderr)
    return False


async def worker(
    page_pool: asyncio.Queue,
    work_queue: asyncio.Queue,
    identifier: str,
    timeout_ms: int,
    slow_ms: int,
    retries: int,
    content_wait_ms: int,
    validate: bool = True,
) -> None:
    """Pull (page_num, out_path) from work_queue, use a page from page_pool, capture, and put the page back."""
    while True:
        try:
            item = work_queue.get_nowait()
        except asyncio.QueueEmpty:
            break
        page_num, out_path = item
        if page_num is None:
            work_queue.task_done()
            break
        page = await page_pool.get()
        try:
            ok = await capture_page_async(
                page, identifier, page_num, out_path, timeout_ms, slow_ms, retries, content_wait_ms, validate
            )
            if ok:
                print(f"Saved: {out_path.name}")
        finally:
            await page_pool.put(page)
            work_queue.task_done()


def main():
    p = argparse.ArgumentParser(description="Download book pages via browser (Playwright).")
    p.add_argument("--id", default=DEFAULT_IDENTIFIER, help="Archive.org item identifier")
    p.add_argument("--start", type=int, default=1, help="First page (1-based)")
    p.add_argument("--end", type=int, default=None, help="Last page (default: auto-detect from the book)")
    p.add_argument("--pages", type=int, default=DEFAULT_PAGE_COUNT, help="Fallback total pages if auto-detect fails (used when --end is omitted)")
    p.add_argument("--output", "-o", type=Path, default=Path("stand-on-zanzibar-pages"), help="Output directory")
    p.add_argument("--cookies", type=Path, default=None, help="Netscape cookies.txt (required for borrowed books)")
    p.add_argument("--no-headless", action="store_true", help="Show the browser window (default is headless)")
    p.add_argument("--slow", type=float, default=0.5, help="Seconds to wait after each page load (default 0.5)")
    p.add_argument("--timeout", type=int, default=45000, help="Page load timeout in ms (default 45000)")
    p.add_argument("--retries", type=int, default=2, help="Retries per page on failure (default 2 = 3 attempts total)")
    p.add_argument("--workers", type=int, default=4, help="Number of concurrent browser pages (default 4)")
    p.add_argument("--force", action="store_true", help="Re-download pages even if the file already exists")
    p.add_argument("--content-wait", type=int, default=15000, help="Ms to wait for book content to load before the screenshot (default 15000)")
    p.add_argument("--no-validate", action="store_true", help="Skip post-capture validation (don't retry on spinner/blank)")
    p.add_argument("--validate-existing", action="store_true", help="Re-download only pages that are missing or fail validation (good pages are skipped)")
    args = p.parse_args()

    if not args.cookies or not args.cookies.exists():
        p.error("--cookies FILE is required and must exist (export from the browser while the book is borrowed).")

    # end: explicit --end, or None to auto-detect in run_async
    end = args.end
    if end is not None and (args.start < 1 or end < args.start):
        p.error("Require 1 <= --start <= --end")

    out_dir = args.output.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    cookies = parse_netscape_cookies(args.cookies)
    if not cookies:
        print("No archive.org cookies found in file.", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(cookies)} cookies. Opening browser...")

    timeout_ms = args.timeout
    slow_ms = int(args.slow * 1000)
    retries = max(0, args.retries)
    workers = max(1, min(args.workers, 16))  # clamp to 1-16

    asyncio.run(run_async(
        cookies=cookies,
        identifier=args.id,
        start=args.start,
        end=end,
        pages_fallback=args.pages,
        out_dir=out_dir,
        headless=not args.no_headless,
        timeout_ms=timeout_ms,
        slow_ms=slow_ms,
        retries=retries,
        workers=workers,
        force=args.force,
        content_wait_ms=args.content_wait,
        validate=not args.no_validate,
        validate_existing=args.validate_existing,
    ))
    print("Done.")


async def run_async(
    cookies: List[dict],
    identifier: str,
    start: int,
    end: Optional[int],
    pages_fallback: int,
    out_dir: Path,
    headless: bool,
    timeout_ms: int,
    slow_ms: int,
    retries: int,
    workers: int,
    force: bool = False,
    content_wait_ms: int = 15000,
    validate: bool = True,
    validate_existing: bool = False,
) -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=headless)
        context = await browser.new_context(
            viewport={"width": 1200, "height": 900},
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )
        await context.add_cookies(cookies)

        # Auto-detect the total page count when --end was not provided
        if end is None:
            print("Auto-detecting total page count...")
            detected = await get_page_count_from_reader(
                context, identifier, timeout_ms=timeout_ms, content_wait_ms=content_wait_ms
            )
            if detected is not None:
                end = detected
                print(f"Auto-detected total pages: {end}")
            else:
                end = pages_fallback
                print(f"Could not auto-detect; using --pages {end}", file=sys.stderr)
        if start > end:
            print(f"Error: --start {start} > end {end}", file=sys.stderr)
            await browser.close()
            return

        # Pool of browser pages for concurrent work
        page_pool = asyncio.Queue()
        for _ in range(workers):
            page = await context.new_page()
            await page_pool.put(page)

        def build_work() -> List[Tuple[int, Path]]:
            work = []
            for n in range(start, end + 1):
                out_path = out_dir / f"page_{n:04d}.png"
                if force:
                    work.append((n, out_path))
                elif validate_existing:
                    if not out_path.exists():
                        work.append((n, out_path))
                    else:
                        ok, _ = validate_capture(out_path)
                        if not ok:
                            work.append((n, out_path))
                else:
                    if not out_path.exists():
                        work.append((n, out_path))
            return work

        work_list = build_work()
        if not work_list:
            print("All pages already exist.")
            await browser.close()
            return

        print(f"Downloading {len(work_list)} pages with {workers} workers...")
        work_queue = asyncio.Queue()
        for item in work_list:
            await work_queue.put(item)
        worker_tasks = [
            asyncio.create_task(
                worker(page_pool, work_queue, identifier, timeout_ms, slow_ms, retries, content_wait_ms, validate)
            )
            for _ in range(workers)
        ]
        await work_queue.join()
        await asyncio.gather(*worker_tasks)

        # Missing-pages pass
        expected = set(range(start, end + 1))
        found = set()
        for p in out_dir.glob("page_*.png"):
            m = re.match(r"page_(\d+)\.png", p.name)
            if m:
                found.add(int(m.group(1)))
        missing = sorted(expected - found)
        if missing:
            print(f"\nRetrying {len(missing)} missing pages: {missing[:10]}{'...' if len(missing) > 10 else ''}")
            for n in missing:
                await work_queue.put((n, out_dir / f"page_{n:04d}.png"))
            worker_tasks = [
                asyncio.create_task(
                    worker(page_pool, work_queue, identifier, timeout_ms, slow_ms, retries, content_wait_ms, validate)
                )
                for _ in range(workers)
            ]
            await work_queue.join()
            await asyncio.gather(*worker_tasks)
            still_missing = [n for n in missing if not (out_dir / f"page_{n:04d}.png").exists()]
            if still_missing:
                print(f"\nStill missing after retries ({len(still_missing)}): {still_missing}", file=sys.stderr)

        await browser.close()


if __name__ == "__main__":
    main()