#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import random
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

API_BASE = "https://prod.apigee.hostelworld.com/legacy-hwapi-service/2.2"
|
|
|
|
|
|
def iso_now() -> str:
|
|
return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
|
|
|
|
|
|
def fetch_json(url: str, *, timeout_s: float = 30.0, user_agent: str | None = None) -> dict[str, Any]:
|
|
req = Request(url)
|
|
req.add_header("Accept", "application/json")
|
|
req.add_header("User-Agent", user_agent or "Mozilla/5.0")
|
|
with urlopen(req, timeout=timeout_s) as resp:
|
|
return json.load(resp)
|
|
|
|
|
|
def jitter_sleep(base_s: float, jitter_s: float) -> None:
|
|
time.sleep(max(0.0, base_s + random.uniform(-jitter_s, jitter_s)))
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
|
|
parser = argparse.ArgumentParser(description="Fetch Hostelworld property reviews (no login).")
|
|
parser.add_argument("--property-id", type=int, required=True)
|
|
parser.add_argument(
|
|
"--month-count",
|
|
type=int,
|
|
default=12,
|
|
help="Legacy API uses a fixed tail window; set high and filter client-side by date",
|
|
)
|
|
parser.add_argument("--currency", default="EUR")
|
|
parser.add_argument("--out", required=True, help="Output JSON path")
|
|
parser.add_argument("--sleep-s", type=float, default=1.0, help="Delay between page requests")
|
|
parser.add_argument("--jitter-s", type=float, default=0.25, help="Random +/- jitter applied to --sleep-s")
|
|
parser.add_argument("--timeout-s", type=float, default=30.0)
|
|
parser.add_argument("--user-agent", default=None)
|
|
return parser.parse_args()
|
|
|
|
|
|
def main() -> int:
|
|
args = parse_args()
|
|
out_path = Path(args.out)
|
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
base_url = (
|
|
f"{API_BASE}/properties/{args.property_id}/reviews/"
|
|
f"?sort=-date&allLanguages=false&page=1&monthCount={args.month_count}¤cy={args.currency}"
|
|
)
|
|
|
|
fetched_at = iso_now()
|
|
try:
|
|
page1 = fetch_json(base_url, timeout_s=args.timeout_s, user_agent=args.user_agent)
|
|
except (HTTPError, URLError, TimeoutError, ValueError) as exc:
|
|
raise SystemExit(f"failed to fetch page 1: {exc}") from exc
|
|
|
|
pagination = page1.get("pagination") or {}
|
|
number_of_pages = int(pagination.get("numberOfPages") or 1)
|
|
total_items = int(pagination.get("totalNumberOfItems") or 0)
|
|
|
|
pages: list[dict[str, Any]] = [page1]
|
|
for page_num in range(2, number_of_pages + 1):
|
|
jitter_sleep(args.sleep_s, args.jitter_s)
|
|
url = (
|
|
f"{API_BASE}/properties/{args.property_id}/reviews/"
|
|
f"?sort=-date&allLanguages=false&page={page_num}&monthCount={args.month_count}¤cy={args.currency}"
|
|
)
|
|
pages.append(fetch_json(url, timeout_s=args.timeout_s, user_agent=args.user_agent))
|
|
|
|
reviews: list[dict[str, Any]] = []
|
|
review_statistics: dict[str, Any] | None = None
|
|
for page in pages:
|
|
if review_statistics is None:
|
|
review_statistics = page.get("reviewStatistics") or {}
|
|
reviews.extend(page.get("reviews") or [])
|
|
|
|
payload: dict[str, Any] = {
|
|
"fetched_at": fetched_at,
|
|
"api_base": API_BASE,
|
|
"property_id": args.property_id,
|
|
"month_count": args.month_count,
|
|
"currency": args.currency,
|
|
"pagination": {
|
|
"number_of_pages": number_of_pages,
|
|
"total_number_of_items": total_items,
|
|
},
|
|
"review_statistics": review_statistics or {},
|
|
"reviews": reviews,
|
|
}
|
|
|
|
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
|
print(json.dumps(payload, ensure_ascii=False, indent=2))
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|