#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import random
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
API_BASE = "https://prod.apigee.hostelworld.com/legacy-hwapi-service/2.2"
def iso_now() -> str:
    """Return the current moment as an ISO-8601 string (seconds precision).

    The timestamp is taken in UTC, then converted to the local timezone
    so the string carries the machine's UTC offset.
    """
    utc_now = datetime.now(timezone.utc)
    local_now = utc_now.astimezone()
    return local_now.isoformat(timespec="seconds")
def fetch_json(url: str, *, timeout_s: float = 30.0, user_agent: str | None = None) -> dict[str, Any]:
    """GET *url* and decode the response body as JSON.

    Sends an ``Accept: application/json`` header; a generic browser
    User-Agent is substituted when none (or an empty string) is given.
    """
    headers = {
        "Accept": "application/json",
        # `or` (not `is None`) so an empty string also falls back.
        "User-Agent": user_agent or "Mozilla/5.0",
    }
    request = Request(url)
    for header_name, header_value in headers.items():
        request.add_header(header_name, header_value)
    with urlopen(request, timeout=timeout_s) as response:
        return json.load(response)
def jitter_sleep(base_s: float, jitter_s: float) -> None:
    """Sleep for roughly *base_s* seconds, perturbed by a uniform +/- *jitter_s*.

    The computed duration is clamped at zero so a large jitter (or a
    negative base) never produces a negative sleep.
    """
    duration = base_s + random.uniform(-jitter_s, jitter_s)
    if duration < 0.0:
        duration = 0.0
    time.sleep(duration)
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    Options are declared in a flat spec table, then registered in order,
    so the accepted flags and their defaults are visible at a glance.
    """
    option_specs: list[tuple[str, dict[str, Any]]] = [
        ("--property-id", {"type": int, "required": True}),
        (
            "--month-count",
            {
                "type": int,
                "default": 12,
                "help": "Legacy API uses a fixed tail window; set high and filter client-side by date",
            },
        ),
        ("--currency", {"default": "EUR"}),
        ("--out", {"required": True, "help": "Output JSON path"}),
        ("--sleep-s", {"type": float, "default": 1.0, "help": "Delay between page requests"}),
        ("--jitter-s", {"type": float, "default": 0.25, "help": "Random +/- jitter applied to --sleep-s"}),
        ("--timeout-s", {"type": float, "default": 30.0}),
        ("--user-agent", {"default": None}),
    ]
    parser = argparse.ArgumentParser(description="Fetch Hostelworld property reviews (no login).")
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main() -> int:
    """Fetch every review page for one property and write a combined JSON file.

    Walks the legacy Hostelworld pagination (page 1 reveals the page count),
    merges all reviews into a single payload, writes it to ``--out``, and
    echoes the payload to stdout.

    Returns:
        0 on success.

    Raises:
        SystemExit: with a human-readable message when any page fetch fails.
    """
    args = parse_args()
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    def page_url(page_num: int) -> str:
        # Every page shares the same query string except the page index.
        return (
            f"{API_BASE}/properties/{args.property_id}/reviews/"
            f"?sort=-date&allLanguages=false&page={page_num}"
            f"&monthCount={args.month_count}&currency={args.currency}"
        )

    def fetch_page(page_num: int) -> dict[str, Any]:
        # Uniform, friendly failure for every page. Previously only page 1
        # got this treatment; pages 2+ surfaced raw urllib tracebacks.
        try:
            return fetch_json(page_url(page_num), timeout_s=args.timeout_s, user_agent=args.user_agent)
        except (HTTPError, URLError, TimeoutError, ValueError) as exc:
            raise SystemExit(f"failed to fetch page {page_num}: {exc}") from exc

    # Capture the timestamp before the (potentially slow) fetch loop.
    fetched_at = iso_now()
    page1 = fetch_page(1)

    pagination = page1.get("pagination") or {}
    number_of_pages = int(pagination.get("numberOfPages") or 1)
    total_items = int(pagination.get("totalNumberOfItems") or 0)

    pages: list[dict[str, Any]] = [page1]
    for page_num in range(2, number_of_pages + 1):
        # Be polite between requests: base delay plus random jitter.
        jitter_sleep(args.sleep_s, args.jitter_s)
        pages.append(fetch_page(page_num))

    reviews: list[dict[str, Any]] = []
    review_statistics: dict[str, Any] | None = None
    for page in pages:
        if review_statistics is None:
            # Keep the first page's statistics block (repeated on each page).
            review_statistics = page.get("reviewStatistics") or {}
        reviews.extend(page.get("reviews") or [])

    payload: dict[str, Any] = {
        "fetched_at": fetched_at,
        "api_base": API_BASE,
        "property_id": args.property_id,
        "month_count": args.month_count,
        "currency": args.currency,
        "pagination": {
            "number_of_pages": number_of_pages,
            "total_number_of_items": total_items,
        },
        "review_statistics": review_statistics or {},
        "reviews": reviews,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    print(json.dumps(payload, ensure_ascii=False, indent=2))
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer exit status to the shell.
    raise SystemExit(main())