Flaneur: canonical URLs

Document shared evidence schema
Add helper scripts for capture and CSV export
2026-01-02 18:03:05 +00:00 · 2026-01-02 18:03:05 +00:00 · 2026-01-02 18:03:05 +00:00
4 changed files with 321 additions and 0 deletions
--- a/SCHEMA.md
+++ b/SCHEMA.md
@ -0,0 +1,40 @@
+# Evidence schema (v1)
+
+This repo is coordinated across parallel sessions. Each session writes its own evidence under the directory named after the target (one of the `target` values below):
+
+- `data/ho36/`
+- `data/flaneur/`
+
+## `evidence.json`
+
+Top-level:
+
+```json
+{
+  "hostel_name": "string",
+  "collected_at": "ISO datetime",
+  "collector_session": "A|B|C",
+  "evidence": [],
+  "profile": {}
+}
+```
+
+Each evidence item (one “fact”) is an object in the `evidence` array (and one row in `evidence.csv`) with these fields:
+
+- `target`: `"ho36"` | `"flaneur"`
+- `source`: `"official_site"` | `"google_maps"` | `"booking"` | `"hostelworld"` | `"tripadvisor"` | `"instagram"` | `"facebook"` | `"tiktok"` | `"press"` | `"other"`
+- `metric_name`: string
+- `metric_value`: string | number | null
+- `url`: string
+- `captured_at`: ISO 8601 datetime with UTC offset (e.g. `2026-01-02T19:01:21+01:00`)
+- `status`: `"ok"` | `"blocked"` | `"unknown"` | `"error"`
+- `confidence`: `"high"` | `"med"` | `"low"`
+- `notes`: string
+- `screenshot_path`: string | null
+
+## `evidence.csv`
+
+One row per evidence item, with the same fields in this column order (`null` values are written as empty cells):
+
+`target,source,metric_name,metric_value,url,captured_at,status,confidence,notes,screenshot_path`
+
--- a/data/flaneur/evidence.json
+++ b/data/flaneur/evidence.json
@ -0,0 +1,124 @@
+{
+  "hostel_name": "Flâneur Hostel Lyon",
+  "collected_at": "2026-01-02T19:01:21+01:00",
+  "collector_session": "B",
+  "evidence": [
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_official_site",
+      "metric_value": "https://leflaneur-guesthouse.com/",
+      "url": "https://leflaneur-guesthouse.com/",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "ok",
+      "confidence": "high",
+      "notes": "Discovered via Google Maps place page (website field).",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_google_maps",
+      "metric_value": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v",
+      "url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "ok",
+      "confidence": "med",
+      "notes": "Maps search landed on place page titled “Le Flâneur Guesthouse”.",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_booking_engine",
+      "metric_value": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/",
+      "url": "https://leflaneur-guesthouse.com/",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "ok",
+      "confidence": "high",
+      "notes": "Official site “RESERVE TA NUIT ! BOOK NOW !” CTA points to RoomRaccoon booking engine.",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_instagram",
+      "metric_value": "https://www.instagram.com/leflaneur_gh/",
+      "url": "https://leflaneur-guesthouse.com/",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "ok",
+      "confidence": "high",
+      "notes": "Linked from official site footer/social icons.",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_facebook",
+      "metric_value": "https://www.facebook.com/leflaneurlyon",
+      "url": "https://leflaneur-guesthouse.com/",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "ok",
+      "confidence": "high",
+      "notes": "Linked from official site footer/social icons.",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_tiktok",
+      "metric_value": null,
+      "url": "https://leflaneur-guesthouse.com/",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "unknown",
+      "confidence": "low",
+      "notes": "No TikTok link found on official site homepage/footer; will attempt direct check later.",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_booking_listing",
+      "metric_value": null,
+      "url": "https://www.booking.com/searchresults.html?ss=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "unknown",
+      "confidence": "low",
+      "notes": "Attempted Booking.com search; listing URL not yet identified (timeboxed).",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_hostelworld_listing",
+      "metric_value": null,
+      "url": "https://www.hostelworld.com/search/?searchTerm=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "unknown",
+      "confidence": "low",
+      "notes": "Attempted Hostelworld search URL returned “404 - Page not found”; listing URL not yet identified.",
+      "screenshot_path": null
+    },
+    {
+      "target": "flaneur",
+      "source": "other",
+      "metric_name": "canonical_url_tripadvisor_listing",
+      "metric_value": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
+      "url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
+      "captured_at": "2026-01-02T19:01:21+01:00",
+      "status": "blocked",
+      "confidence": "med",
+      "notes": "TripAdvisor page did not render content in automated capture; screenshot to be stored during TripAdvisor section.",
+      "screenshot_path": null
+    }
+  ],
+  "profile": {
+    "positioning": "",
+    "target_audience": [],
+    "key_amenities": [],
+    "vibe_tags": [],
+    "booking_engine": "",
+    "languages": []
+  }
+}
+
--- a/tools/capture_page.py
+++ b/tools/capture_page.py
@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Literal
+
+from playwright.sync_api import Error as PlaywrightError
+from playwright.sync_api import sync_playwright
+
+
+Status = Literal["ok", "error"]  # the two outcomes main() records for a capture attempt
+
+
+@dataclass(frozen=True)  # immutable record; main() serializes it with asdict() and prints it as JSON
+class CaptureResult:  # Outcome of one page-capture attempt.
+    url: str  # URL that was requested (the --url argument)
+    final_url: str | None  # page.url after navigation; None when the capture failed
+    title: str | None  # page.title(); None when the capture failed
+    captured_at: str  # iso_now() timestamp taken before the browser is launched
+    status: Status  # "ok" or "error"
+    error: str | None  # str() of the caught exception; None on success
+
+
+def iso_now() -> str:  # Current time as an ISO 8601 string with UTC offset, expressed in the system's local timezone.
+    return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")  # aware UTC now -> local zone -> text truncated to whole seconds
+
+
+def ensure_parent(path: Path) -> None:  # Create the directory that will contain `path` if it does not exist yet.
+    path.parent.mkdir(parents=True, exist_ok=True)  # also creates intermediate directories; no error when already present
+
+
+def parse_args() -> argparse.Namespace:  # Parse the capture tool's command-line options and return them as a Namespace.
+    parser = argparse.ArgumentParser(description="Capture a web page screenshot (and optional HTML).")
+    parser.add_argument("--url", required=True)  # page to open
+    parser.add_argument("--screenshot", required=True, help="Output PNG path")
+    parser.add_argument("--html", default=None, help="Optional output HTML path")
+    parser.add_argument("--full-page", action="store_true", help="Capture full-page screenshot")
+    parser.add_argument("--timeout-ms", type=int, default=45_000)  # forwarded to page.goto() as its timeout
+    parser.add_argument("--wait-ms", type=int, default=1_000, help="Extra wait after DOMContentLoaded")
+    parser.add_argument("--user-agent", default=None)  # forwarded to the browser context as user_agent
+    parser.add_argument("--locale", default="fr-FR")  # forwarded to the browser context as locale
+    parser.add_argument("--timezone", default="Europe/Paris")  # forwarded to the browser context as timezone_id
+    parser.add_argument("--referer", default=None)  # sent as a Referer header only when provided
+    return parser.parse_args()
+
+
+def main() -> int:  # Capture one page with headless Chromium, print one JSON result line, return 0 on success or 2 on failure.
+    args = parse_args()
+
+    screenshot_path = Path(args.screenshot)
+    html_path = Path(args.html) if args.html else None
+
+    ensure_parent(screenshot_path)  # output directories are created before the browser starts
+    if html_path:
+        ensure_parent(html_path)
+
+    captured_at = iso_now()  # taken before launch, so it marks the start of the attempt
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            context = browser.new_context(
+                locale=args.locale,
+                timezone_id=args.timezone,
+                user_agent=args.user_agent,
+                extra_http_headers={k: v for k, v in {"Referer": args.referer}.items() if v},  # empty dict unless --referer was given
+            )
+            page = context.new_page()
+            page.goto(args.url, wait_until="domcontentloaded", timeout=args.timeout_ms)
+            if args.wait_ms:  # --wait-ms 0 skips the extra delay
+                page.wait_for_timeout(args.wait_ms)
+
+            page.screenshot(path=str(screenshot_path), full_page=bool(args.full_page))
+            if html_path:
+                html_path.write_text(page.content(), encoding="utf-8")  # HTML snapshot is written only when --html was given
+
+            result = CaptureResult(
+                url=args.url,
+                final_url=page.url,
+                title=page.title(),
+                captured_at=captured_at,
+                status="ok",
+                error=None,
+            )
+            browser.close()  # NOTE(review): not reached when an earlier call raises; cleanup then presumably relies on leaving the sync_playwright() block - confirm
+    except (PlaywrightError, OSError, ValueError) as exc:  # these failures are reported as a status="error" result instead of a traceback
+        result = CaptureResult(
+            url=args.url,
+            final_url=None,
+            title=None,
+            captured_at=captured_at,
+            status="error",
+            error=str(exc),
+        )
+
+    os.write(1, (json.dumps(asdict(result), ensure_ascii=False) + "\n").encode("utf-8"))  # one JSON line written as UTF-8 bytes directly to file descriptor 1
+    return 0 if result.status == "ok" else 2  # exit code 2 marks a failed capture
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
+
--- a/tools/json_to_csv.py
+++ b/tools/json_to_csv.py
@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+
+
+FIELDS = [  # CSV column order; also the only keys main() copies from each evidence item
+    "target",
+    "source",
+    "metric_name",
+    "metric_value",
+    "url",
+    "captured_at",
+    "status",
+    "confidence",
+    "notes",
+    "screenshot_path",
+]
+
+
+def parse_args() -> argparse.Namespace:  # Parse the two required options: --json (input path) and --csv (output path).
+    parser = argparse.ArgumentParser(description="Convert evidence.json to evidence.csv.")
+    parser.add_argument("--json", required=True, help="Path to evidence.json")
+    parser.add_argument("--csv", required=True, help="Path to output evidence.csv")
+    return parser.parse_args()
+
+
+def main() -> int:  # Write the "evidence" list of the input JSON as a CSV with the FIELDS columns; returns 0 on success.
+    args = parse_args()
+    json_path = Path(args.json)
+    csv_path = Path(args.csv)
+    csv_path.parent.mkdir(parents=True, exist_ok=True)  # create the output directory if needed
+
+    payload = json.loads(json_path.read_text(encoding="utf-8"))
+    rows = payload.get("evidence", [])  # a missing "evidence" key yields a header-only CSV
+
+    with csv_path.open("w", encoding="utf-8", newline="") as f:  # newline="" leaves line endings to the csv module
+        writer = csv.DictWriter(f, fieldnames=FIELDS, extrasaction="ignore")
+        writer.writeheader()
+        for row in rows:
+            writer.writerow({k: row.get(k) for k in FIELDS})  # missing keys become None, which csv writes as an empty cell
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
+
Author	SHA1	Message	Date
root	fd0672ac7b	Flaneur: canonical URLs	2026-01-02 18:03:05 +00:00
root	fc45c5a458	Document shared evidence schema	2026-01-02 18:03:05 +00:00
root	1e9d55ada8	Add helper scripts for capture and CSV export	2026-01-02 18:03:05 +00:00