flaneur/verify/tools/analyze_hostelworld_reviews.py

#!/usr/bin/env python3
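"""Summarize Hostelworld reviews JSON into theme counts and a Markdown report.

Each input payload is expected to carry a "reviews" list whose entries have a
"date" (ISO string), free-text "notes", and a "rating" dict of 0-100 scores
("overall" plus the sub-scores listed in summarize()). Reviews inside the
recency window are bucketed by overall score and tallied per theme and keyword.
"""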
from __future__ import annotations
import argparse
import json
import re
from collections import Counter
from datetime import date, datetime, timedelta
from pathlib import Path
from statistics import mean, median
from typing import Any
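# Common filler words excluded from the keyword frequency counts.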
STOPWORDS = {
"the",
"and",
"a",
"an",
"to",
"of",
"in",
"for",
"on",
"at",
"with",
"is",
"it",
"was",
"were",
"are",
"be",
"been",
"i",
"we",
"you",
"they",
"this",
"that",
"as",
"but",
"so",
"if",
"not",
"very",
"really",
"just",
"my",
"our",
"their",
"your",
"me",
"us",
"them",
}
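# Regex patterns per theme; detect_themes() counts a review at most once per
# theme, no matter how many of that theme's patterns match.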
THEME_RULES: dict[str, list[re.Pattern[str]]] = {
"cleanliness": [
re.compile(r"\bclean\b", re.I),
re.compile(r"\bdirty\b", re.I),
re.compile(r"\bsmell\b", re.I),
re.compile(r"\bstink\b", re.I),
re.compile(r"\breek\b", re.I),
re.compile(r"\bmold\b", re.I),
re.compile(r"\bbath(room|rooms)?\b", re.I),
re.compile(r"\btoilet(s)?\b", re.I),
re.compile(r"\bshower(s)?\b", re.I),
],
"staff_reception": [
re.compile(r"\bstaff\b", re.I),
re.compile(r"\breception\b", re.I),
re.compile(r"\bfront\s*desk\b", re.I),
re.compile(r"\bhelpful\b", re.I),
re.compile(r"\brude\b", re.I),
re.compile(r"\bfriendl(y|iness)\b", re.I),
re.compile(r"\b24h\b", re.I),
re.compile(r"\b24\s*hour\b", re.I),
],
"safety_neighborhood": [
re.compile(r"\bsafe\b", re.I),
re.compile(r"\bunsafe\b", re.I),
re.compile(r"\bdanger(ous|)\b", re.I),
re.compile(r"\bdrug(s|)\b", re.I),
re.compile(r"\bdealer(s)?\b", re.I),
re.compile(r"\bsketchy\b", re.I),
re.compile(r"\bafter\s+dark\b", re.I),
re.compile(r"\bnight\b", re.I),
re.compile(r"\bhomeless\b", re.I),
],
"sleep_noise": [
re.compile(r"\bnois(e|y)\b", re.I),
re.compile(r"\bloud\b", re.I),
re.compile(r"\bsleep\b", re.I),
re.compile(r"\bbunk\b", re.I),
re.compile(r"\brattl(e|ing)\b", re.I),
re.compile(r"\bcurtain(s)?\b", re.I),
re.compile(r"\bprivacy\b", re.I),
],
"kitchen_food": [
re.compile(r"\bkitchen\b", re.I),
re.compile(r"\bbreakfast\b", re.I),
re.compile(r"\bfood\b", re.I),
re.compile(r"\bbar\b", re.I),
re.compile(r"\bcafe\b", re.I),
re.compile(r"\bcoffee\b", re.I),
re.compile(r"\bdrink(s)?\b", re.I),
],
"value_price": [
re.compile(r"\bvalue\b", re.I),
re.compile(r"\bprice\b", re.I),
re.compile(r"\bexpensive\b", re.I),
re.compile(r"\bcheap\b", re.I),
re.compile(r"\bworth\b", re.I),
],
}
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Summarize Hostelworld reviews JSON into theme counts.")
parser.add_argument("--in", dest="inputs", action="append", required=True, help="Input JSON file")
parser.add_argument("--out", required=True, help="Output Markdown path")
parser.add_argument("--label", action="append", default=[], help="Optional label per --in (same order)")
parser.add_argument(
"--days",
type=int,
default=365,
help="Only include reviews within the last N days (based on the review 'date' field)",
)
parser.add_argument("--low-threshold", type=float, default=65.0, help="Overall score <= this is 'negative' (0-100)")
parser.add_argument("--high-threshold", type=float, default=85.0, help="Overall score >= this is 'positive' (0-100)")
parser.add_argument("--min-theme-count", type=int, default=5, help="Only show themes with at least this many mentions")
return parser.parse_args()
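# Rating values come in on a 0-100 scale; return None when "overall" is
# missing or not numeric.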
def get_overall(review: dict[str, Any]) -> float | None:
rating = review.get("rating") or {}
overall = rating.get("overall")
return float(overall) if isinstance(overall, (int, float)) else None
def get_text(review: dict[str, Any]) -> str:
return str(review.get("notes") or "")
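# Bucket a review by overall score: <= low is "negative", >= high is
# "positive", in between is "neutral", and a missing score is "unknown".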
def bucket(overall: float | None, *, low: float, high: float) -> str:
if overall is None:
return "unknown"
if overall <= low:
return "negative"
if overall >= high:
return "positive"
return "neutral"
def tokenize(text: str) -> list[str]:
words = re.findall(r"[a-zA-Z]{3,}", text.lower())
return [w for w in words if w not in STOPWORDS]
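# Return the set of themes whose patterns match anywhere in the review text.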
def detect_themes(text: str) -> set[str]:
hits: set[str] = set()
for theme, patterns in THEME_RULES.items():
if any(p.search(text) for p in patterns):
hits.add(theme)
return hits
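# Sub-scores are stored /100; the report displays them /10 with one decimal.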
def fmt10(score100: float | None) -> str:
if score100 is None:
return "n/a"
return f"{score100 / 10:.1f}"
def summarize(payload: dict[str, Any], *, low: float, high: float) -> dict[str, Any]:
reviews_all = payload.get("reviews") or []
fetched_at = payload.get("fetched_at")
ref_dt = None
try:
ref_dt = datetime.fromisoformat(fetched_at) if isinstance(fetched_at, str) else None
except ValueError:
ref_dt = None
if ref_dt is None:
ref_dt = datetime.now()
days = int(payload.get("_analysis_days", 365))
since_date = (ref_dt.date() - timedelta(days=days))
reviews: list[dict[str, Any]] = []
for r in reviews_all:
d = r.get("date")
if not isinstance(d, str) or len(d) < 10:
continue
try:
rd = date.fromisoformat(d[:10])
except ValueError:
continue
if rd >= since_date:
reviews.append(r)
overall_scores = [get_overall(r) for r in reviews]
overall_clean = [s for s in overall_scores if s is not None]
bucket_counts = Counter(bucket(get_overall(r), low=low, high=high) for r in reviews)
sub_keys = ["safety", "location", "staff", "atmosphere", "cleanliness", "facilities", "value"]
subs: dict[str, list[float]] = {k: [] for k in sub_keys}
for r in reviews:
rating = r.get("rating") or {}
for k in sub_keys:
v = rating.get(k)
if isinstance(v, (int, float)):
subs[k].append(float(v))
sub_avgs = {k: (mean(v) if v else None) for k, v in subs.items()}
theme_counts: dict[str, Counter[str]] = {b: Counter() for b in ["positive", "neutral", "negative"]}
keyword_counts: dict[str, Counter[str]] = {b: Counter() for b in ["positive", "neutral", "negative"]}
for r in reviews:
text = get_text(r)
b = bucket(get_overall(r), low=low, high=high)
if b not in theme_counts:
continue
for t in detect_themes(text):
theme_counts[b][t] += 1
keyword_counts[b].update(tokenize(text))
return {
"property_id": payload.get("property_id"),
"month_count": payload.get("month_count"),
"total_reviews": len(reviews),
"since_date": since_date.isoformat(),
"bucket_counts": dict(bucket_counts),
"overall_mean": mean(overall_clean) if overall_clean else None,
"overall_median": median(overall_clean) if overall_clean else None,
"subscore_avgs": sub_avgs,
"theme_counts": {k: dict(v) for k, v in theme_counts.items()},
"top_keywords": {k: keyword_counts[k].most_common(25) for k in keyword_counts},
"review_statistics": payload.get("review_statistics") or {},
}
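# CLI entry point: summarize each --in file and write one combined Markdown report.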
def main() -> int:
args = parse_args()
labels = list(args.label or [])
while len(labels) < len(args.inputs):
labels.append(Path(args.inputs[len(labels)]).stem)
summaries = []
for path, label in zip(args.inputs, labels, strict=True):
payload = json.loads(Path(path).read_text(encoding="utf-8"))
payload["_analysis_days"] = args.days
s = summarize(payload, low=args.low_threshold, high=args.high_threshold)
s["label"] = label
summaries.append(s)
out_path = Path(args.out)
out_path.parent.mkdir(parents=True, exist_ok=True)
lines: list[str] = []
lines.append("# Hostelworld review themes (last 12 months)")
lines.append("")
lines.append(f"- Generated: {datetime.now().isoformat(timespec='seconds')}")
lines.append(f"- Window: last {args.days} days")
lines.append(f"- Negative threshold: <= {args.low_threshold:.0f}/100")
lines.append(f"- Positive threshold: >= {args.high_threshold:.0f}/100")
lines.append("")
for s in summaries:
lines.append(f"## {s['label']}")
lines.append("")
lines.append("| Metric | Value |")
lines.append("|---|---|")
lines.append(f"| Window since | {s['since_date']} |")
lines.append(f"| Reviews (12m) | {s['total_reviews']} |")
lines.append(f"| Overall mean (/100) | {s['overall_mean']:.1f} |" if s["overall_mean"] is not None else "| Overall mean (/100) | n/a |")
lines.append(f"| Overall median (/100) | {s['overall_median']:.1f} |" if s["overall_median"] is not None else "| Overall median (/100) | n/a |")
bc = s["bucket_counts"]
lines.append(f"| Positive / Neutral / Negative | {bc.get('positive',0)} / {bc.get('neutral',0)} / {bc.get('negative',0)} |")
lines.append("")
sub = s["subscore_avgs"]
lines.append("| Subscore (avg/10) | Score |")
lines.append("|---|---|")
for k in ["cleanliness", "facilities", "staff", "atmosphere", "safety", "location", "value"]:
lines.append(f"| {k} | {fmt10(sub.get(k))} |")
lines.append("")
lines.append(f"### Themes (mentions, min {args.min_theme_count})")
lines.append("")
for bucket_name in ["negative", "neutral", "positive"]:
counts = Counter(s["theme_counts"].get(bucket_name, {}))
counts = Counter({k: v for k, v in counts.items() if v >= args.min_theme_count})
if not counts:
continue
lines.append(f"**{bucket_name}**")
for theme, cnt in counts.most_common():
lines.append(f"- {theme}: {cnt}")
lines.append("")
lines.append("### Top keywords (sanity check)")
lines.append("")
for bucket_name in ["negative", "positive"]:
kws = s["top_keywords"].get(bucket_name, [])
if not kws:
continue
lines.append(f"**{bucket_name}**")
lines.append(", ".join([f"{w} ({c})" for w, c in kws[:15]]))
lines.append("")
out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
print(out_path)
return 0
if __name__ == "__main__":
raise SystemExit(main())
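# Example invocation (paths and labels are illustrative, not part of the repo):
#   python analyze_hostelworld_reviews.py \
#       --in data/hostel_a_reviews.json --label "Hostel A" \
#       --in data/hostel_b_reviews.json --label "Hostel B" \
#       --out reports/review_themes.md --days 365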