Compare commits
3 commits
69e0fe6576
...
fd0672ac7b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fd0672ac7b | ||
|
|
fc45c5a458 | ||
|
|
1e9d55ada8 |
4 changed files with 321 additions and 0 deletions
40
SCHEMA.md
Normal file
40
SCHEMA.md
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Evidence schema (v1)
|
||||
|
||||
This repo is coordinated across parallel sessions. Each session writes its own evidence under:
|
||||
|
||||
- `data/ho36/`
|
||||
- `data/flaneur/`
|
||||
|
||||
## `evidence.json`
|
||||
|
||||
Top-level:
|
||||
|
||||
```json
|
||||
{
|
||||
"hostel_name": "string",
|
||||
"collected_at": "ISO datetime",
|
||||
"collector_session": "A|B|C",
|
||||
"evidence": [],
|
||||
"profile": {}
|
||||
}
|
||||
```
|
||||
|
||||
Each evidence item (one “fact”) is a row/object with:
|
||||
|
||||
- `target`: `"ho36"` | `"flaneur"`
|
||||
- `source`: `"official_site"` | `"google_maps"` | `"booking"` | `"hostelworld"` | `"tripadvisor"` | `"instagram"` | `"facebook"` | `"tiktok"` | `"press"` | `"other"`
|
||||
- `metric_name`: string
|
||||
- `metric_value`: string | number | null
|
||||
- `url`: string
|
||||
- `captured_at`: ISO datetime
|
||||
- `status`: `"ok"` | `"blocked"` | `"unknown"` | `"error"`
|
||||
- `confidence`: `"high"` | `"med"` | `"low"`
|
||||
- `notes`: string
|
||||
- `screenshot_path`: string | null
|
||||
|
||||
## `evidence.csv`
|
||||
|
||||
One row per evidence item, with the same fields:
|
||||
|
||||
`target,source,metric_name,metric_value,url,captured_at,status,confidence,notes,screenshot_path`
|
||||
|
||||
124
data/flaneur/evidence.json
Normal file
124
data/flaneur/evidence.json
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
{
|
||||
"hostel_name": "Flâneur Hostel Lyon",
|
||||
"collected_at": "2026-01-02T19:01:21+01:00",
|
||||
"collector_session": "B",
|
||||
"evidence": [
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_official_site",
|
||||
"metric_value": "https://leflaneur-guesthouse.com/",
|
||||
"url": "https://leflaneur-guesthouse.com/",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "ok",
|
||||
"confidence": "high",
|
||||
"notes": "Discovered via Google Maps place page (website field).",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_google_maps",
|
||||
"metric_value": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v",
|
||||
"url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "ok",
|
||||
"confidence": "med",
|
||||
"notes": "Maps search landed on place page titled “Le Flâneur Guesthouse”.",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_booking_engine",
|
||||
"metric_value": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/",
|
||||
"url": "https://leflaneur-guesthouse.com/",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "ok",
|
||||
"confidence": "high",
|
||||
"notes": "Official site “RESERVE TA NUIT ! BOOK NOW !” CTA points to RoomRaccoon booking engine.",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_instagram",
|
||||
"metric_value": "https://www.instagram.com/leflaneur_gh/",
|
||||
"url": "https://leflaneur-guesthouse.com/",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "ok",
|
||||
"confidence": "high",
|
||||
"notes": "Linked from official site footer/social icons.",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_facebook",
|
||||
"metric_value": "https://www.facebook.com/leflaneurlyon",
|
||||
"url": "https://leflaneur-guesthouse.com/",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "ok",
|
||||
"confidence": "high",
|
||||
"notes": "Linked from official site footer/social icons.",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_tiktok",
|
||||
"metric_value": null,
|
||||
"url": "https://leflaneur-guesthouse.com/",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "unknown",
|
||||
"confidence": "low",
|
||||
"notes": "No TikTok link found on official site homepage/footer; will attempt direct check later.",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_booking_listing",
|
||||
"metric_value": null,
|
||||
"url": "https://www.booking.com/searchresults.html?ss=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "unknown",
|
||||
"confidence": "low",
|
||||
"notes": "Attempted Booking.com search; listing URL not yet identified (timeboxed).",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_hostelworld_listing",
|
||||
"metric_value": null,
|
||||
"url": "https://www.hostelworld.com/search/?searchTerm=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "unknown",
|
||||
"confidence": "low",
|
||||
"notes": "Attempted Hostelworld search URL returned “404 - Page not found”; listing URL not yet identified.",
|
||||
"screenshot_path": null
|
||||
},
|
||||
{
|
||||
"target": "flaneur",
|
||||
"source": "other",
|
||||
"metric_name": "canonical_url_tripadvisor_listing",
|
||||
"metric_value": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
|
||||
"url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
|
||||
"captured_at": "2026-01-02T19:01:21+01:00",
|
||||
"status": "blocked",
|
||||
"confidence": "med",
|
||||
"notes": "TripAdvisor page did not render content in automated capture; screenshot to be stored during TripAdvisor section.",
|
||||
"screenshot_path": null
|
||||
}
|
||||
],
|
||||
"profile": {
|
||||
"positioning": "",
|
||||
"target_audience": [],
|
||||
"key_amenities": [],
|
||||
"vibe_tags": [],
|
||||
"booking_engine": "",
|
||||
"languages": []
|
||||
}
|
||||
}
|
||||
|
||||
106
tools/capture_page.py
Normal file
106
tools/capture_page.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from playwright.sync_api import Error as PlaywrightError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
|
||||
Status = Literal["ok", "error"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CaptureResult:
|
||||
url: str
|
||||
final_url: str | None
|
||||
title: str | None
|
||||
captured_at: str
|
||||
status: Status
|
||||
error: str | None
|
||||
|
||||
|
||||
def iso_now() -> str:
    """Return the current local time as an ISO 8601 string.

    The value carries the local UTC offset and is truncated to whole seconds.
    """
    utc_moment = datetime.now(timezone.utc)
    # astimezone() with no argument converts to the system's local timezone.
    local_moment = utc_moment.astimezone()
    return local_moment.isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def ensure_parent(path: Path) -> None:
    """Create the directory that will contain *path*.

    Missing ancestor directories are created as well, and an already
    existing directory is not an error. *path* itself is not created.
    """
    containing_dir = path.parent
    containing_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for one page capture.

    ``--url`` and ``--screenshot`` are mandatory; every other option has a
    default. Options are registered in the order listed so ``--help`` output
    is unchanged.
    """
    cli = argparse.ArgumentParser(description="Capture a web page screenshot (and optional HTML).")

    # (flag, keyword arguments for add_argument), in display order.
    option_specs = (
        ("--url", {"required": True}),
        ("--screenshot", {"required": True, "help": "Output PNG path"}),
        ("--html", {"default": None, "help": "Optional output HTML path"}),
        ("--full-page", {"action": "store_true", "help": "Capture full-page screenshot"}),
        ("--timeout-ms", {"type": int, "default": 45_000}),
        ("--wait-ms", {"type": int, "default": 1_000, "help": "Extra wait after DOMContentLoaded"}),
        ("--user-agent", {"default": None}),
        ("--locale", {"default": "fr-FR"}),
        ("--timezone", {"default": "Europe/Paris"}),
        ("--referer", {"default": None}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Capture one page and print a one-line JSON result to stdout.

    Loads ``--url`` in headless Chromium, saves a screenshot (and the page
    HTML when ``--html`` is given), then writes a serialised
    ``CaptureResult`` as a single JSON line to file descriptor 1.

    Returns:
        0 when the capture succeeded, 2 when it failed.
    """
    args = parse_args()

    screenshot_path = Path(args.screenshot)
    html_path = Path(args.html) if args.html else None

    captured_at = iso_now()
    try:
        # Directory creation lives inside the try so a filesystem failure is
        # reported through the same JSON error result as any other failure,
        # rather than escaping as an uncaught traceback.
        ensure_parent(screenshot_path)
        if html_path:
            ensure_parent(html_path)

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    locale=args.locale,
                    timezone_id=args.timezone,
                    user_agent=args.user_agent,
                    # Only send a Referer header when one was supplied.
                    extra_http_headers={"Referer": args.referer} if args.referer else {},
                )
                page = context.new_page()
                page.goto(args.url, wait_until="domcontentloaded", timeout=args.timeout_ms)
                if args.wait_ms:
                    # Fixed extra delay after DOMContentLoaded before capturing.
                    page.wait_for_timeout(args.wait_ms)

                page.screenshot(path=str(screenshot_path), full_page=bool(args.full_page))
                if html_path:
                    html_path.write_text(page.content(), encoding="utf-8")

                result = CaptureResult(
                    url=args.url,
                    final_url=page.url,
                    title=page.title(),
                    captured_at=captured_at,
                    status="ok",
                    error=None,
                )
            finally:
                # Close the browser on the failure path too.
                browser.close()
    except (PlaywrightError, OSError, ValueError) as exc:
        result = CaptureResult(
            url=args.url,
            final_url=None,
            title=None,
            captured_at=captured_at,
            status="error",
            error=str(exc),
        )

    # Emit raw UTF-8 bytes straight to fd 1 so non-ASCII titles are written
    # as-is, independent of the encoding configured for sys.stdout.
    os.write(1, (json.dumps(asdict(result), ensure_ascii=False) + "\n").encode("utf-8"))
    return 0 if result.status == "ok" else 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
||||
51
tools/json_to_csv.py
Normal file
51
tools/json_to_csv.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Column order of evidence.csv; identical to the header line given for
# `evidence.csv` in SCHEMA.md.
FIELDS = (
    "target,source,metric_name,metric_value,url,"
    "captured_at,status,confidence,notes,screenshot_path"
).split(",")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the two mandatory paths: the input JSON and the output CSV."""
    cli = argparse.ArgumentParser(description="Convert evidence.json to evidence.csv.")
    required_paths = (
        ("--json", "Path to evidence.json"),
        ("--csv", "Path to output evidence.csv"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Convert the ``evidence`` array of a JSON file into a CSV file.

    Writes a header row followed by one row per evidence item, using the
    columns in ``FIELDS``. Keys missing from an item are looked up with
    ``dict.get`` and so come through as ``None``.

    Returns:
        Always 0.
    """
    cli_args = parse_args()
    source = Path(cli_args.json)
    destination = Path(cli_args.csv)
    destination.parent.mkdir(parents=True, exist_ok=True)

    document = json.loads(source.read_text(encoding="utf-8"))
    items = document.get("evidence", [])

    # newline="" lets the csv module control line endings itself.
    with destination.open("w", encoding="utf-8", newline="") as handle:
        out = csv.DictWriter(handle, fieldnames=FIELDS, extrasaction="ignore")
        out.writeheader()
        out.writerows({column: item.get(column) for column in FIELDS} for item in items)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
||||
Loading…
Add table
Reference in a new issue