#!/usr/bin/env python3
"""Fetch public Hostelworld property reviews via the legacy API (no login).

Fetches every page of the review listing for one property, merges the pages,
and writes the result as pretty-printed JSON to --out (also echoed to stdout).
"""
from __future__ import annotations

import argparse
import json
import random
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

API_BASE = "https://prod.apigee.hostelworld.com/legacy-hwapi-service/2.2"


def iso_now() -> str:
    """Current local time as an ISO-8601 string with second precision."""
    return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")


def fetch_json(url: str, *, timeout_s: float = 30.0, user_agent: str | None = None) -> dict[str, Any]:
    """GET a URL and decode the JSON response body."""
    req = Request(url)
    req.add_header("Accept", "application/json")
    req.add_header("User-Agent", user_agent or "Mozilla/5.0")
    with urlopen(req, timeout=timeout_s) as resp:
        return json.load(resp)


def jitter_sleep(base_s: float, jitter_s: float) -> None:
    """Sleep for base_s seconds plus or minus a random jitter, never less than zero."""
    time.sleep(max(0.0, base_s + random.uniform(-jitter_s, jitter_s)))


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Fetch Hostelworld property reviews (no login).")
    parser.add_argument("--property-id", type=int, required=True)
    parser.add_argument(
        "--month-count",
        type=int,
        default=12,
        help="Legacy API uses a fixed tail window; set high and filter client-side by date",
    )
    parser.add_argument("--currency", default="EUR")
    parser.add_argument("--out", required=True, help="Output JSON path")
    parser.add_argument("--sleep-s", type=float, default=1.0, help="Delay between page requests")
    parser.add_argument("--jitter-s", type=float, default=0.25, help="Random +/- jitter applied to --sleep-s")
    parser.add_argument("--timeout-s", type=float, default=30.0)
    parser.add_argument("--user-agent", default=None)
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    def review_url(page_num: int) -> str:
        return (
            f"{API_BASE}/properties/{args.property_id}/reviews/"
            f"?sort=-date&allLanguages=false&page={page_num}"
            f"&monthCount={args.month_count}&currency={args.currency}"
        )

    fetched_at = iso_now()

    # Page 1 tells us how many pages exist in total.
    try:
        page1 = fetch_json(review_url(1), timeout_s=args.timeout_s, user_agent=args.user_agent)
    except (HTTPError, URLError, TimeoutError, ValueError) as exc:
        raise SystemExit(f"failed to fetch page 1: {exc}") from exc

    pagination = page1.get("pagination") or {}
    number_of_pages = int(pagination.get("numberOfPages") or 1)
    total_items = int(pagination.get("totalNumberOfItems") or 0)

    # Fetch the remaining pages with a polite, jittered delay between requests.
    pages: list[dict[str, Any]] = [page1]
    for page_num in range(2, number_of_pages + 1):
        jitter_sleep(args.sleep_s, args.jitter_s)
        pages.append(fetch_json(review_url(page_num), timeout_s=args.timeout_s, user_agent=args.user_agent))

    # Merge reviews across pages; take reviewStatistics from the first page.
    reviews: list[dict[str, Any]] = []
    review_statistics: dict[str, Any] | None = None
    for page in pages:
        if review_statistics is None:
            review_statistics = page.get("reviewStatistics") or {}
        reviews.extend(page.get("reviews") or [])

    payload: dict[str, Any] = {
        "fetched_at": fetched_at,
        "api_base": API_BASE,
        "property_id": args.property_id,
        "month_count": args.month_count,
        "currency": args.currency,
        "pagination": {
            "number_of_pages": number_of_pages,
            "total_number_of_items": total_items,
        },
        "review_statistics": review_statistics or {},
        "reviews": reviews,
    }

    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    print(json.dumps(payload, ensure_ascii=False, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())