#!/usr/bin/env python3 from __future__ import annotations import argparse import json import re from collections import Counter from datetime import date, datetime, timedelta from pathlib import Path from statistics import mean, median from typing import Any STOPWORDS = { "the", "and", "a", "an", "to", "of", "in", "for", "on", "at", "with", "is", "it", "was", "were", "are", "be", "been", "i", "we", "you", "they", "this", "that", "as", "but", "so", "if", "not", "very", "really", "just", "my", "our", "their", "your", "me", "us", "them", } THEME_RULES: dict[str, list[re.Pattern[str]]] = { "cleanliness": [ re.compile(r"\bclean\b", re.I), re.compile(r"\bdirty\b", re.I), re.compile(r"\bsmell\b", re.I), re.compile(r"\bstink\b", re.I), re.compile(r"\breek\b", re.I), re.compile(r"\bmold\b", re.I), re.compile(r"\bbath(room|rooms)?\b", re.I), re.compile(r"\btoilet(s)?\b", re.I), re.compile(r"\bshower(s)?\b", re.I), ], "staff_reception": [ re.compile(r"\bstaff\b", re.I), re.compile(r"\breception\b", re.I), re.compile(r"\bfront\s*desk\b", re.I), re.compile(r"\bhelpful\b", re.I), re.compile(r"\brude\b", re.I), re.compile(r"\bfriendl(y|iness)\b", re.I), re.compile(r"\b24h\b", re.I), re.compile(r"\b24\s*hour\b", re.I), ], "safety_neighborhood": [ re.compile(r"\bsafe\b", re.I), re.compile(r"\bunsafe\b", re.I), re.compile(r"\bdanger(ous|)\b", re.I), re.compile(r"\bdrug(s|)\b", re.I), re.compile(r"\bdealer(s)?\b", re.I), re.compile(r"\bsketchy\b", re.I), re.compile(r"\bafter\s+dark\b", re.I), re.compile(r"\bnight\b", re.I), re.compile(r"\bhomeless\b", re.I), ], "sleep_noise": [ re.compile(r"\bnois(e|y)\b", re.I), re.compile(r"\bloud\b", re.I), re.compile(r"\bsleep\b", re.I), re.compile(r"\bbunk\b", re.I), re.compile(r"\brattl(e|ing)\b", re.I), re.compile(r"\bcurtain(s)?\b", re.I), re.compile(r"\bprivacy\b", re.I), ], "kitchen_food": [ re.compile(r"\bkitchen\b", re.I), re.compile(r"\bbreakfast\b", re.I), re.compile(r"\bfood\b", re.I), re.compile(r"\bbar\b", re.I), re.compile(r"\bcafe\b", re.I), re.compile(r"\bcoffee\b", re.I), re.compile(r"\bdrink(s)?\b", re.I), ], "value_price": [ re.compile(r"\bvalue\b", re.I), re.compile(r"\bprice\b", re.I), re.compile(r"\bexpensive\b", re.I), re.compile(r"\bcheap\b", re.I), re.compile(r"\bworth\b", re.I), ], } def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Summarize Hostelworld reviews JSON into theme counts.") parser.add_argument("--in", dest="inputs", action="append", required=True, help="Input JSON file") parser.add_argument("--out", required=True, help="Output Markdown path") parser.add_argument("--label", action="append", default=[], help="Optional label per --in (same order)") parser.add_argument( "--days", type=int, default=365, help="Only include reviews within the last N days (based on the review 'date' field)", ) parser.add_argument("--low-threshold", type=float, default=65.0, help="Overall score <= this is 'negative' (0-100)") parser.add_argument("--high-threshold", type=float, default=85.0, help="Overall score >= this is 'positive' (0-100)") parser.add_argument("--min-theme-count", type=int, default=5, help="Only show themes with at least this many mentions") return parser.parse_args() def get_overall(review: dict[str, Any]) -> float | None: rating = review.get("rating") or {} overall = rating.get("overall") return float(overall) if isinstance(overall, (int, float)) else None def get_text(review: dict[str, Any]) -> str: return str(review.get("notes") or "") def bucket(overall: float | None, *, low: float, high: float) -> str: if overall is None: return "unknown" if overall <= low: return "negative" if overall >= high: return "positive" return "neutral" def tokenize(text: str) -> list[str]: words = re.findall(r"[a-zA-Z]{3,}", text.lower()) return [w for w in words if w not in STOPWORDS] def detect_themes(text: str) -> set[str]: hits: set[str] = set() for theme, patterns in THEME_RULES.items(): if any(p.search(text) for p in patterns): hits.add(theme) return hits def fmt10(score100: float | None) -> str: if score100 is None: return "n/a" return f"{score100 / 10:.1f}" def summarize(payload: dict[str, Any], *, low: float, high: float) -> dict[str, Any]: reviews_all = payload.get("reviews") or [] fetched_at = payload.get("fetched_at") ref_dt = None try: ref_dt = datetime.fromisoformat(fetched_at) if isinstance(fetched_at, str) else None except ValueError: ref_dt = None if ref_dt is None: ref_dt = datetime.now() days = int(payload.get("_analysis_days", 365)) since_date = (ref_dt.date() - timedelta(days=days)) reviews: list[dict[str, Any]] = [] for r in reviews_all: d = r.get("date") if not isinstance(d, str) or len(d) < 10: continue try: rd = date.fromisoformat(d[:10]) except ValueError: continue if rd >= since_date: reviews.append(r) overall_scores = [get_overall(r) for r in reviews] overall_clean = [s for s in overall_scores if s is not None] bucket_counts = Counter(bucket(get_overall(r), low=low, high=high) for r in reviews) sub_keys = ["safety", "location", "staff", "atmosphere", "cleanliness", "facilities", "value"] subs: dict[str, list[float]] = {k: [] for k in sub_keys} for r in reviews: rating = r.get("rating") or {} for k in sub_keys: v = rating.get(k) if isinstance(v, (int, float)): subs[k].append(float(v)) sub_avgs = {k: (mean(v) if v else None) for k, v in subs.items()} theme_counts: dict[str, Counter[str]] = {b: Counter() for b in ["positive", "neutral", "negative"]} keyword_counts: dict[str, Counter[str]] = {b: Counter() for b in ["positive", "neutral", "negative"]} for r in reviews: text = get_text(r) b = bucket(get_overall(r), low=low, high=high) if b not in theme_counts: continue for t in detect_themes(text): theme_counts[b][t] += 1 keyword_counts[b].update(tokenize(text)) return { "property_id": payload.get("property_id"), "month_count": payload.get("month_count"), "total_reviews": len(reviews), "since_date": since_date.isoformat(), "bucket_counts": dict(bucket_counts), "overall_mean": mean(overall_clean) if overall_clean else None, "overall_median": median(overall_clean) if overall_clean else None, "subscore_avgs": sub_avgs, "theme_counts": {k: dict(v) for k, v in theme_counts.items()}, "top_keywords": {k: keyword_counts[k].most_common(25) for k in keyword_counts}, "review_statistics": payload.get("review_statistics") or {}, } def main() -> int: args = parse_args() labels = list(args.label or []) while len(labels) < len(args.inputs): labels.append(Path(args.inputs[len(labels)]).stem) summaries = [] for path, label in zip(args.inputs, labels, strict=True): payload = json.loads(Path(path).read_text(encoding="utf-8")) payload["_analysis_days"] = args.days s = summarize(payload, low=args.low_threshold, high=args.high_threshold) s["label"] = label summaries.append(s) out_path = Path(args.out) out_path.parent.mkdir(parents=True, exist_ok=True) lines: list[str] = [] lines.append("# Hostelworld review themes (last 12 months)") lines.append("") lines.append(f"- Generated: {datetime.now().isoformat(timespec='seconds')}") lines.append(f"- Window: last {args.days} days") lines.append(f"- Negative threshold: <= {args.low_threshold:.0f}/100") lines.append(f"- Positive threshold: >= {args.high_threshold:.0f}/100") lines.append("") for s in summaries: lines.append(f"## {s['label']}") lines.append("") lines.append("| Metric | Value |") lines.append("|---|---|") lines.append(f"| Window since | {s['since_date']} |") lines.append(f"| Reviews (12m) | {s['total_reviews']} |") lines.append(f"| Overall mean (/100) | {s['overall_mean']:.1f} |" if s["overall_mean"] is not None else "| Overall mean (/100) | n/a |") lines.append(f"| Overall median (/100) | {s['overall_median']:.1f} |" if s["overall_median"] is not None else "| Overall median (/100) | n/a |") bc = s["bucket_counts"] lines.append(f"| Positive / Neutral / Negative | {bc.get('positive',0)} / {bc.get('neutral',0)} / {bc.get('negative',0)} |") lines.append("") sub = s["subscore_avgs"] lines.append("| Subscore (avg/10) | Score |") lines.append("|---|---|") for k in ["cleanliness", "facilities", "staff", "atmosphere", "safety", "location", "value"]: lines.append(f"| {k} | {fmt10(sub.get(k))} |") lines.append("") lines.append(f"### Themes (mentions, min {args.min_theme_count})") lines.append("") for bucket_name in ["negative", "neutral", "positive"]: counts = Counter(s["theme_counts"].get(bucket_name, {})) counts = Counter({k: v for k, v in counts.items() if v >= args.min_theme_count}) if not counts: continue lines.append(f"**{bucket_name}**") for theme, cnt in counts.most_common(): lines.append(f"- {theme}: {cnt}") lines.append("") lines.append("### Top keywords (sanity check)") lines.append("") for bucket_name in ["negative", "positive"]: kws = s["top_keywords"].get(bucket_name, []) if not kws: continue lines.append(f"**{bucket_name}**") lines.append(", ".join([f"{w} ({c})" for w, c in kws[:15]])) lines.append("") out_path.write_text("\n".join(lines) + "\n", encoding="utf-8") print(out_path) return 0 if __name__ == "__main__": raise SystemExit(main())