flaneur/verify/tools/analyze_hostelworld_reviews.py

#!/usr/bin/env python3
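"""Summarize Hostelworld reviews JSON into theme counts and a Markdown report.

Each input payload is expected to carry a "reviews" list whose entries have a
"date" (ISO string), free-text "notes", and a "rating" dict of 0-100 scores
("overall" plus the sub-scores listed in summarize()). Reviews inside the
recency window are bucketed by overall score and tallied per theme and keyword.
"""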
from __future__ import annotations
import argparse
import json
import re
from collections import Counter
from datetime import date, datetime, timedelta
from pathlib import Path
from statistics import mean, median
from typing import Any
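# Common filler words excluded from the keyword frequency counts.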
STOPWORDS = {
"the",
"and",
"a",
"an",
"to",
"of",
"in",
"for",
"on",
"at",
"with",
"is",
"it",
"was",
"were",
"are",
"be",
"been",
"i",
"we",
"you",
"they",
"this",
"that",
"as",
"but",
"so",
"if",
"not",
"very",
"really",
"just",
"my",
"our",
"their",
"your",
"me",
"us",
"them",
}
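# Regex patterns per theme; detect_themes() counts a review at most once per
# theme, no matter how many of that theme's patterns match.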
THEME_RULES: dict[str, list[re.Pattern[str]]] = {
"cleanliness": [
re.compile(r"\bclean\b", re.I),
re.compile(r"\bdirty\b", re.I),
re.compile(r"\bsmell\b", re.I),
re.compile(r"\bstink\b", re.I),
re.compile(r"\breek\b", re.I),
re.compile(r"\bmold\b", re.I),
re.compile(r"\bbath(room|rooms)?\b", re.I),
re.compile(r"\btoilet(s)?\b", re.I),
re.compile(r"\bshower(s)?\b", re.I),
],
"staff_reception": [
re.compile(r"\bstaff\b", re.I),
re.compile(r"\breception\b", re.I),
re.compile(r"\bfront\s*desk\b", re.I),
re.compile(r"\bhelpful\b", re.I),
re.compile(r"\brude\b", re.I),
re.compile(r"\bfriendl(y|iness)\b", re.I),
re.compile(r"\b24h\b", re.I),
re.compile(r"\b24\s*hour\b", re.I),
],
"safety_neighborhood": [
re.compile(r"\bsafe\b", re.I),
re.compile(r"\bunsafe\b", re.I),
re.compile(r"\bdanger(ous|)\b", re.I),
re.compile(r"\bdrug(s|)\b", re.I),
re.compile(r"\bdealer(s)?\b", re.I),
re.compile(r"\bsketchy\b", re.I),
re.compile(r"\bafter\s+dark\b", re.I),
re.compile(r"\bnight\b", re.I),
re.compile(r"\bhomeless\b", re.I),
],
"sleep_noise": [
re.compile(r"\bnois(e|y)\b", re.I),
re.compile(r"\bloud\b", re.I),
re.compile(r"\bsleep\b", re.I),
re.compile(r"\bbunk\b", re.I),
re.compile(r"\brattl(e|ing)\b", re.I),
re.compile(r"\bcurtain(s)?\b", re.I),
re.compile(r"\bprivacy\b", re.I),
],
"kitchen_food": [
re.compile(r"\bkitchen\b", re.I),
re.compile(r"\bbreakfast\b", re.I),
re.compile(r"\bfood\b", re.I),
re.compile(r"\bbar\b", re.I),
re.compile(r"\bcafe\b", re.I),
re.compile(r"\bcoffee\b", re.I),
re.compile(r"\bdrink(s)?\b", re.I),
],
"value_price": [
re.compile(r"\bvalue\b", re.I),
re.compile(r"\bprice\b", re.I),
re.compile(r"\bexpensive\b", re.I),
re.compile(r"\bcheap\b", re.I),
re.compile(r"\bworth\b", re.I),
],
}
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Summarize Hostelworld reviews JSON into theme counts.")
parser.add_argument("--in", dest="inputs", action="append", required=True, help="Input JSON file")
parser.add_argument("--out", required=True, help="Output Markdown path")
parser.add_argument("--label", action="append", default=[], help="Optional label per --in (same order)")
parser.add_argument(
"--days",
type=int,
default=365,
help="Only include reviews within the last N days (based on the review 'date' field)",
)
parser.add_argument("--low-threshold", type=float, default=65.0, help="Overall score <= this is 'negative' (0-100)")
parser.add_argument("--high-threshold", type=float, default=85.0, help="Overall score >= this is 'positive' (0-100)")
parser.add_argument("--min-theme-count", type=int, default=5, help="Only show themes with at least this many mentions")
return parser.parse_args()
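# Rating values come in on a 0-100 scale; return None when "overall" is
# missing or not numeric.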
def get_overall(review: dict[str, Any]) -> float | None:
rating = review.get("rating") or {}
overall = rating.get("overall")
return float(overall) if isinstance(overall, (int, float)) else None
def get_text(review: dict[str, Any]) -> str:
return str(review.get("notes") or "")
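# Bucket a review by overall score: <= low is "negative", >= high is
# "positive", in between is "neutral", and a missing score is "unknown".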
def bucket(overall: float | None, *, low: float, high: float) -> str:
if overall is None:
return "unknown"
if overall <= low:
return "negative"
if overall >= high:
return "positive"
return "neutral"
def tokenize(text: str) -> list[str]:
words = re.findall(r"[a-zA-Z]{3,}", text.lower())
return [w for w in words if w not in STOPWORDS]
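# Return the set of themes whose patterns match anywhere in the review text.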
def detect_themes(text: str) -> set[str]:
hits: set[str] = set()
for theme, patterns in THEME_RULES.items():
if any(p.search(text) for p in patterns):
hits.add(theme)
return hits
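# Sub-scores are stored /100; the report displays them /10 with one decimal.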
def fmt10(score100: float | None) -> str:
if score100 is None:
return "n/a"
return f"{score100 / 10:.1f}"
def summarize(payload: dict[str, Any], *, low: float, high: float) -> dict[str, Any]:
reviews_all = payload.get("reviews") or []
fetched_at = payload.get("fetched_at")
ref_dt = None
try:
ref_dt = datetime.fromisoformat(fetched_at) if isinstance(fetched_at, str) else None
except ValueError:
ref_dt = None
if ref_dt is None:
ref_dt = datetime.now()
days = int(payload.get("_analysis_days", 365))
since_date = (ref_dt.date() - timedelta(days=days))
reviews: list[dict[str, Any]] = []
for r in reviews_all:
d = r.get("date")
if not isinstance(d, str) or len(d) < 10:
continue
try:
rd = date.fromisoformat(d[:10])
except ValueError:
continue
if rd >= since_date:
reviews.append(r)
overall_scores = [get_overall(r) for r in reviews]
overall_clean = [s for s in overall_scores if s is not None]
bucket_counts = Counter(bucket(get_overall(r), low=low, high=high) for r in reviews)
sub_keys = ["safety", "location", "staff", "atmosphere", "cleanliness", "facilities", "value"]
subs: dict[str, list[float]] = {k: [] for k in sub_keys}
for r in reviews:
rating = r.get("rating") or {}
for k in sub_keys:
v = rating.get(k)
if isinstance(v, (int, float)):
subs[k].append(float(v))
sub_avgs = {k: (mean(v) if v else None) for k, v in subs.items()}
theme_counts: dict[str, Counter[str]] = {b: Counter() for b in ["positive", "neutral", "negative"]}
keyword_counts: dict[str, Counter[str]] = {b: Counter() for b in ["positive", "neutral", "negative"]}
for r in reviews:
text = get_text(r)
b = bucket(get_overall(r), low=low, high=high)
if b not in theme_counts:
continue
for t in detect_themes(text):
theme_counts[b][t] += 1
keyword_counts[b].update(tokenize(text))
return {
"property_id": payload.get("property_id"),
"month_count": payload.get("month_count"),
"total_reviews": len(reviews),
"since_date": since_date.isoformat(),
"bucket_counts": dict(bucket_counts),
"overall_mean": mean(overall_clean) if overall_clean else None,
"overall_median": median(overall_clean) if overall_clean else None,
"subscore_avgs": sub_avgs,
"theme_counts": {k: dict(v) for k, v in theme_counts.items()},
"top_keywords": {k: keyword_counts[k].most_common(25) for k in keyword_counts},
"review_statistics": payload.get("review_statistics") or {},
}
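# CLI entry point: summarize each --in file and write one combined Markdown report.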
def main() -> int:
args = parse_args()
labels = list(args.label or [])
while len(labels) < len(args.inputs):
labels.append(Path(args.inputs[len(labels)]).stem)
summaries = []
for path, label in zip(args.inputs, labels, strict=True):
payload = json.loads(Path(path).read_text(encoding="utf-8"))
payload["_analysis_days"] = args.days
s = summarize(payload, low=args.low_threshold, high=args.high_threshold)
s["label"] = label
summaries.append(s)
out_path = Path(args.out)
out_path.parent.mkdir(parents=True, exist_ok=True)
lines: list[str] = []
lines.append("# Hostelworld review themes (last 12 months)")
lines.append("")
lines.append(f"- Generated: {datetime.now().isoformat(timespec='seconds')}")
lines.append(f"- Window: last {args.days} days")
lines.append(f"- Negative threshold: <= {args.low_threshold:.0f}/100")
lines.append(f"- Positive threshold: >= {args.high_threshold:.0f}/100")
lines.append("")
for s in summaries:
lines.append(f"## {s['label']}")
lines.append("")
lines.append("| Metric | Value |")
lines.append("|---|---|")
lines.append(f"| Window since | {s['since_date']} |")
lines.append(f"| Reviews (12m) | {s['total_reviews']} |")
lines.append(f"| Overall mean (/100) | {s['overall_mean']:.1f} |" if s["overall_mean"] is not None else "| Overall mean (/100) | n/a |")
lines.append(f"| Overall median (/100) | {s['overall_median']:.1f} |" if s["overall_median"] is not None else "| Overall median (/100) | n/a |")
bc = s["bucket_counts"]
lines.append(f"| Positive / Neutral / Negative | {bc.get('positive',0)} / {bc.get('neutral',0)} / {bc.get('negative',0)} |")
lines.append("")
sub = s["subscore_avgs"]
lines.append("| Subscore (avg/10) | Score |")
lines.append("|---|---|")
for k in ["cleanliness", "facilities", "staff", "atmosphere", "safety", "location", "value"]:
lines.append(f"| {k} | {fmt10(sub.get(k))} |")
lines.append("")
lines.append(f"### Themes (mentions, min {args.min_theme_count})")
lines.append("")
for bucket_name in ["negative", "neutral", "positive"]:
counts = Counter(s["theme_counts"].get(bucket_name, {}))
counts = Counter({k: v for k, v in counts.items() if v >= args.min_theme_count})
if not counts:
continue
lines.append(f"**{bucket_name}**")
for theme, cnt in counts.most_common():
lines.append(f"- {theme}: {cnt}")
lines.append("")
lines.append("### Top keywords (sanity check)")
lines.append("")
for bucket_name in ["negative", "positive"]:
kws = s["top_keywords"].get(bucket_name, [])
if not kws:
continue
lines.append(f"**{bucket_name}**")
lines.append(", ".join([f"{w} ({c})" for w, c in kws[:15]]))
lines.append("")
out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
print(out_path)
return 0
if __name__ == "__main__":
raise SystemExit(main())
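# Example invocation (paths and labels are illustrative, not part of the repo):
#   python analyze_hostelworld_reviews.py \
#       --in data/hostel_a_reviews.json --label "Hostel A" \
#       --in data/hostel_b_reviews.json --label "Hostel B" \
#       --out reports/review_themes.md --days 365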