flaneur/tools/capture_page.py

106 lines
3.3 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import sync_playwright
# Outcome tag carried by CaptureResult.status: "ok" for a completed capture,
# "error" when the capture raised and was converted into an error record.
Status = Literal["ok", "error"]
@dataclass(frozen=True)
class CaptureResult:
    """Immutable record describing the outcome of one page capture.

    The record is converted with ``dataclasses.asdict`` and emitted as a
    single JSON line, so every field holds a JSON-serializable value.
    """

    # URL that was requested on the command line.
    url: str
    # URL the page ended up on after navigation; None when the capture failed.
    final_url: str | None
    # Document title read from the page; None when the capture failed.
    title: str | None
    # ISO 8601 timestamp taken just before the capture was attempted.
    captured_at: str
    # "ok" on success, "error" on failure.
    status: Status
    # Text of the exception that aborted the capture; None on success.
    error: str | None
def iso_now() -> str:
    """Return the current time as an ISO 8601 string with second precision.

    The instant is taken in UTC and then converted to the system's local
    timezone, so the returned string carries the local UTC offset.
    """
    utc_moment = datetime.now(timezone.utc)
    local_moment = utc_moment.astimezone()
    return local_moment.isoformat(timespec="seconds")
def ensure_parent(path: Path) -> None:
    """Create the directory that will contain *path*, if it is missing.

    Intermediate directories are created as needed, and an already existing
    directory is not treated as an error.
    """
    containing_dir = path.parent
    containing_dir.mkdir(parents=True, exist_ok=True)
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the capture tool.

    Args:
        argv: Argument strings to parse, without the program name. When
            omitted (the default), argparse reads ``sys.argv[1:]``, so
            existing zero-argument callers behave exactly as before.

    Returns:
        The parsed namespace with attributes ``url``, ``screenshot``,
        ``html``, ``full_page``, ``timeout_ms``, ``wait_ms``, ``user_agent``,
        ``locale``, ``timezone`` and ``referer``.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments.
    """
    parser = argparse.ArgumentParser(description="Capture a web page screenshot (and optional HTML).")

    # Required inputs/outputs.
    parser.add_argument("--url", required=True)
    parser.add_argument("--screenshot", required=True, help="Output PNG path")

    # Optional outputs and capture behavior.
    parser.add_argument("--html", default=None, help="Optional output HTML path")
    parser.add_argument("--full-page", action="store_true", help="Capture full-page screenshot")
    parser.add_argument("--timeout-ms", type=int, default=45_000)
    parser.add_argument("--wait-ms", type=int, default=1_000, help="Extra wait after DOMContentLoaded")

    # Browser context settings.
    parser.add_argument("--user-agent", default=None)
    parser.add_argument("--locale", default="fr-FR")
    parser.add_argument("--timezone", default="Europe/Paris")
    parser.add_argument("--referer", default=None)

    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
def main() -> int:
    """Capture the requested page and report the outcome as one JSON line.

    Parses the command line, loads the URL in headless Chromium, writes the
    screenshot (and the page HTML when ``--html`` is given), then writes a
    JSON-serialized ``CaptureResult`` followed by a newline to file
    descriptor 1.

    Returns:
        0 when the capture succeeded, 2 when it failed and an error record
        was emitted instead.
    """
    args = parse_args()
    screenshot_path = Path(args.screenshot)
    html_path = Path(args.html) if args.html else None
    # Timestamp is taken before any browser work, so it marks the start of
    # the capture attempt rather than its completion.
    captured_at = iso_now()

    try:
        # Output directories are created inside the try block so that a
        # filesystem failure (an OSError) is reported through the same JSON
        # error record as any other capture failure.
        ensure_parent(screenshot_path)
        if html_path:
            ensure_parent(html_path)

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    locale=args.locale,
                    timezone_id=args.timezone,
                    user_agent=args.user_agent,
                    # Only send a Referer header when one was supplied.
                    extra_http_headers={"Referer": args.referer} if args.referer else {},
                )
                page = context.new_page()
                page.goto(args.url, wait_until="domcontentloaded", timeout=args.timeout_ms)
                # Optional settle time after DOMContentLoaded; 0 disables it.
                if args.wait_ms:
                    page.wait_for_timeout(args.wait_ms)
                page.screenshot(path=str(screenshot_path), full_page=args.full_page)
                if html_path:
                    html_path.write_text(page.content(), encoding="utf-8")
                result = CaptureResult(
                    url=args.url,
                    final_url=page.url,
                    title=page.title(),
                    captured_at=captured_at,
                    status="ok",
                    error=None,
                )
            finally:
                # Close the browser on the failure path too, not only after
                # a successful capture.
                browser.close()
    except (PlaywrightError, OSError, ValueError) as exc:
        result = CaptureResult(
            url=args.url,
            final_url=None,
            title=None,
            captured_at=captured_at,
            status="error",
            error=str(exc),
        )

    # The record is written as UTF-8 bytes straight to file descriptor 1,
    # bypassing sys.stdout and whatever text encoding it is configured with.
    payload = json.dumps(asdict(result), ensure_ascii=False) + "\n"
    os.write(1, payload.encode("utf-8"))
    return 0 if result.status == "ok" else 2
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit code.
    exit_code = main()
    raise SystemExit(exit_code)