From 1e9d55ada8b2a053305b85fa34c55e56f61b9f45 Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 2 Jan 2026 18:01:53 +0000
Subject: [PATCH] Add helper scripts for capture and CSV export

---
 tools/capture_page.py | 126 ++++++++++++++++++++++++++++++++++++++++++
 tools/json_to_csv.py  |  62 +++++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 tools/capture_page.py
 create mode 100644 tools/json_to_csv.py

diff --git a/tools/capture_page.py b/tools/capture_page.py
new file mode 100644
index 0000000..69834f3
--- /dev/null
+++ b/tools/capture_page.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Capture a screenshot (and optionally the HTML) of a web page.
+
+Drives headless Chromium through Playwright, then prints a one-line JSON
+summary on stdout. Exit status is 0 when the capture succeeded, 2 otherwise.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Literal
+
+from playwright.sync_api import Error as PlaywrightError
+from playwright.sync_api import sync_playwright
+
+
+Status = Literal["ok", "error"]
+
+
+@dataclass(frozen=True)
+class CaptureResult:
+    """Outcome of one capture attempt; serialized as the JSON summary."""
+
+    url: str  # URL requested on the command line
+    final_url: str | None  # page.url after navigation; None on failure
+    title: str | None  # page.title(); None on failure
+    captured_at: str  # ISO-8601 timestamp taken before the capture starts
+    status: Status
+    error: str | None  # str(exception) on failure, else None
+
+
+def iso_now() -> str:
+    """Return the current local time as an ISO-8601 string with UTC offset."""
+    return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
+
+
+def ensure_parent(path: Path) -> None:
+    """Create the parent directory of *path* (and any ancestors) if missing."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options for a single capture."""
+    parser = argparse.ArgumentParser(description="Capture a web page screenshot (and optional HTML).")
+    parser.add_argument("--url", required=True)
+    parser.add_argument("--screenshot", required=True, help="Output PNG path")
+    parser.add_argument("--html", default=None, help="Optional output HTML path")
+    parser.add_argument("--full-page", action="store_true", help="Capture full-page screenshot")
+    parser.add_argument("--timeout-ms", type=int, default=45_000)
+    parser.add_argument("--wait-ms", type=int, default=1_000, help="Extra wait after DOMContentLoaded")
+    parser.add_argument("--user-agent", default=None)
+    parser.add_argument("--locale", default="fr-FR")
+    parser.add_argument("--timezone", default="Europe/Paris")
+    parser.add_argument("--referer", default=None)
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Run one capture and print its JSON summary on stdout.
+
+    Returns:
+        0 when the page was captured, 2 when any step failed.
+    """
+    args = parse_args()
+    captured_at = iso_now()
+
+    try:
+        screenshot_path = Path(args.screenshot)
+        html_path = Path(args.html) if args.html else None
+        # Create output directories inside the try so an unwritable location
+        # is reported through the JSON summary like any other failure.
+        ensure_parent(screenshot_path)
+        if html_path:
+            ensure_parent(html_path)
+
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            try:
+                context = browser.new_context(
+                    locale=args.locale,
+                    timezone_id=args.timezone,
+                    user_agent=args.user_agent,
+                    extra_http_headers={"Referer": args.referer} if args.referer else {},
+                )
+                page = context.new_page()
+                page.goto(args.url, wait_until="domcontentloaded", timeout=args.timeout_ms)
+                if args.wait_ms > 0:
+                    page.wait_for_timeout(args.wait_ms)
+
+                page.screenshot(path=str(screenshot_path), full_page=bool(args.full_page))
+                if html_path:
+                    html_path.write_text(page.content(), encoding="utf-8")
+
+                result = CaptureResult(
+                    url=args.url,
+                    final_url=page.url,
+                    title=page.title(),
+                    captured_at=captured_at,
+                    status="ok",
+                    error=None,
+                )
+            finally:
+                # Close the browser even when navigation or capture raised.
+                browser.close()
+    except (PlaywrightError, OSError, ValueError) as exc:
+        result = CaptureResult(
+            url=args.url,
+            final_url=None,
+            title=None,
+            captured_at=captured_at,
+            status="error",
+            error=str(exc),
+        )
+
+    # Write raw UTF-8 bytes to fd 1 so the output encoding does not depend on
+    # how sys.stdout is configured.
+    os.write(1, (json.dumps(asdict(result), ensure_ascii=False) + "\n").encode("utf-8"))
+    return 0 if result.status == "ok" else 2
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/json_to_csv.py b/tools/json_to_csv.py
new file mode 100644
index 0000000..3a641ce
--- /dev/null
+++ b/tools/json_to_csv.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""Convert an evidence JSON file into a CSV with a fixed set of columns."""
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+
+
+# Output columns, in order. Keys missing from a record become empty cells;
+# keys not listed here are dropped.
+FIELDS = [
+    "target",
+    "source",
+    "metric_name",
+    "metric_value",
+    "url",
+    "captured_at",
+    "status",
+    "confidence",
+    "notes",
+    "screenshot_path",
+]
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the input and output paths from the command line."""
+    parser = argparse.ArgumentParser(description="Convert evidence.json to evidence.csv.")
+    parser.add_argument("--json", required=True, help="Path to evidence.json")
+    parser.add_argument("--csv", required=True, help="Path to output evidence.csv")
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Write the records of the input JSON to the output CSV.
+
+    The input is either an object with an ``evidence`` list or a bare list of
+    records. Returns 0 on success; exits with a message on a malformed input.
+    """
+    args = parse_args()
+    json_path = Path(args.json)
+    csv_path = Path(args.csv)
+    csv_path.parent.mkdir(parents=True, exist_ok=True)
+
+    payload = json.loads(json_path.read_text(encoding="utf-8"))
+    # A missing or null "evidence" key yields a header-only CSV.
+    rows = (payload.get("evidence") or []) if isinstance(payload, dict) else payload
+    if not isinstance(rows, list) or not all(isinstance(row, dict) for row in rows):
+        raise SystemExit(f"{json_path}: expected a list of JSON objects")
+
+    with csv_path.open("w", encoding="utf-8", newline="") as f:
+        # extrasaction="ignore" drops unknown keys; absent keys are written as "".
+        writer = csv.DictWriter(f, fieldnames=FIELDS, extrasaction="ignore")
+        writer.writeheader()
+        writer.writerows(rows)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())