315 lines
10 KiB
Python
315 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from collections import Counter
|
|
from datetime import date, datetime, timedelta
|
|
from pathlib import Path
|
|
from statistics import mean, median
|
|
from typing import Any
|
|
|
|
|
|
# Common English function words that are dropped from keyword counts.
# Entries are lowercase because tokenize() lowercases text before the lookup.
STOPWORDS = {
    # articles, conjunctions and prepositions
    "the", "and", "a", "an", "to", "of", "in", "for", "on", "at", "with",
    # forms of "to be"
    "is", "it", "was", "were", "are", "be", "been",
    # subject pronouns and demonstratives
    "i", "we", "you", "they", "this", "that",
    # connectors and intensifiers
    "as", "but", "so", "if", "not", "very", "really", "just",
    # possessive and object pronouns
    "my", "our", "their", "your", "me", "us", "them",
}
|
|
|
|
|
|
# Theme name -> compiled, case-insensitive patterns. A review counts toward a
# theme when at least one of that theme's patterns is found in its text.
THEME_RULES: dict[str, list[re.Pattern[str]]] = {
    theme: [re.compile(source, re.I) for source in sources]
    for theme, sources in {
        "cleanliness": (
            r"\bclean\b",
            r"\bdirty\b",
            r"\bsmell\b",
            r"\bstink\b",
            r"\breek\b",
            r"\bmold\b",
            r"\bbath(room|rooms)?\b",
            r"\btoilet(s)?\b",
            r"\bshower(s)?\b",
        ),
        "staff_reception": (
            r"\bstaff\b",
            r"\breception\b",
            r"\bfront\s*desk\b",
            r"\bhelpful\b",
            r"\brude\b",
            r"\bfriendl(y|iness)\b",
            r"\b24h\b",
            r"\b24\s*hour\b",
        ),
        "safety_neighborhood": (
            r"\bsafe\b",
            r"\bunsafe\b",
            r"\bdanger(ous|)\b",
            r"\bdrug(s|)\b",
            r"\bdealer(s)?\b",
            r"\bsketchy\b",
            r"\bafter\s+dark\b",
            r"\bnight\b",
            r"\bhomeless\b",
        ),
        "sleep_noise": (
            r"\bnois(e|y)\b",
            r"\bloud\b",
            r"\bsleep\b",
            r"\bbunk\b",
            r"\brattl(e|ing)\b",
            r"\bcurtain(s)?\b",
            r"\bprivacy\b",
        ),
        "kitchen_food": (
            r"\bkitchen\b",
            r"\bbreakfast\b",
            r"\bfood\b",
            r"\bbar\b",
            r"\bcafe\b",
            r"\bcoffee\b",
            r"\bdrink(s)?\b",
        ),
        "value_price": (
            r"\bvalue\b",
            r"\bprice\b",
            r"\bexpensive\b",
            r"\bcheap\b",
            r"\bworth\b",
        ),
    }.items()
}
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the process command line into a namespace.

    Options:
        --in             Input JSON file; repeatable and required
                         (stored as ``inputs``).
        --out            Output Markdown path; required.
        --label          Optional label per ``--in``; repeatable.
        --days           Size of the review window in days (default 365).
        --low-threshold  Overall score at or below this is "negative".
        --high-threshold Overall score at or above this is "positive".
        --min-theme-count
                         Minimum mentions for a theme to be listed.
    """
    cli = argparse.ArgumentParser(
        description="Summarize Hostelworld reviews JSON into theme counts.",
    )
    option = cli.add_argument

    # Inputs / outputs.
    option("--in", dest="inputs", action="append", required=True, help="Input JSON file")
    option("--out", required=True, help="Output Markdown path")
    option("--label", action="append", default=[], help="Optional label per --in (same order)")

    # Review window.
    option(
        "--days",
        type=int,
        default=365,
        help="Only include reviews within the last N days (based on the review 'date' field)",
    )

    # Sentiment bucketing and reporting thresholds.
    option(
        "--low-threshold",
        type=float,
        default=65.0,
        help="Overall score <= this is 'negative' (0-100)",
    )
    option(
        "--high-threshold",
        type=float,
        default=85.0,
        help="Overall score >= this is 'positive' (0-100)",
    )
    option(
        "--min-theme-count",
        type=int,
        default=5,
        help="Only show themes with at least this many mentions",
    )

    return cli.parse_args()
|
|
|
|
|
|
def get_overall(review: dict[str, Any]) -> float | None:
    """Return the review's overall score as a float, or ``None`` if unusable.

    The score is read from ``review["rating"]["overall"]``.

    Args:
        review: One review record, as decoded from the input JSON.

    Returns:
        The overall score converted to ``float``, or ``None`` when the rating
        is missing or is not a mapping, or when the overall value is not a
        real number.
    """
    rating = review.get("rating")
    # Input comes from external JSON: a rating that is not an object (a bare
    # number, a list, ...) is treated as "no rating" rather than crashing.
    if not isinstance(rating, dict):
        return None

    overall = rating.get("overall")
    # bool is a subclass of int, so JSON true/false would otherwise be
    # reported as scores of 1.0 / 0.0.
    if isinstance(overall, bool) or not isinstance(overall, (int, float)):
        return None
    return float(overall)
|
|
|
|
|
|
def get_text(review: dict[str, Any]) -> str:
    """Return the review's ``notes`` value as a string.

    A missing or falsy ``notes`` value yields the empty string; any other
    value is passed through ``str()``.
    """
    notes = review.get("notes")
    if not notes:
        return ""
    return str(notes)
|
|
|
|
|
|
def bucket(overall: float | None, *, low: float, high: float) -> str:
    """Classify an overall score into a sentiment bucket.

    Args:
        overall: The overall score, or ``None`` when the review has none.
        low: Scores at or below this value are "negative".
        high: Scores at or above this value are "positive".

    Returns:
        "unknown" for a missing score, otherwise "negative", "positive" or
        "neutral". The ``low`` comparison is made first, so it wins if the
        two thresholds overlap.
    """
    if overall is None:
        label = "unknown"
    elif overall <= low:
        label = "negative"
    elif overall >= high:
        label = "positive"
    else:
        label = "neutral"
    return label
|
|
|
|
|
|
def tokenize(text: str) -> list[str]:
    """Split text into lowercase keyword tokens.

    A token is a run of three or more ASCII letters taken from the lowercased
    text; tokens listed in ``STOPWORDS`` are dropped. Order and duplicates
    are preserved.
    """
    lowered = text.lower()
    return [
        token
        for token in re.findall(r"[a-zA-Z]{3,}", lowered)
        if token not in STOPWORDS
    ]
|
|
|
|
|
|
def detect_themes(text: str) -> set[str]:
    """Return the names of all themes mentioned in ``text``.

    A theme is included when at least one of its ``THEME_RULES`` patterns
    is found anywhere in the text; each theme appears at most once.
    """
    return {
        theme
        for theme, patterns in THEME_RULES.items()
        if any(pattern.search(text) for pattern in patterns)
    }
|
|
|
|
|
|
def fmt10(score100: float | None) -> str:
    """Format a score for display after dividing it by ten.

    Returns "n/a" for ``None``; otherwise ``score100 / 10`` rendered with
    one decimal place.
    """
    return "n/a" if score100 is None else f"{score100 / 10:.1f}"
|
|
|
|
|
|
def _review_date(review: dict[str, Any]) -> date | None:
    """Return the calendar date parsed from ``review["date"]``, or ``None`` if missing or malformed."""
    raw = review.get("date")
    if not isinstance(raw, str) or len(raw) < 10:
        return None
    try:
        # Only the leading YYYY-MM-DD part is used; any time suffix is ignored.
        return date.fromisoformat(raw[:10])
    except ValueError:
        return None


def summarize(payload: dict[str, Any], *, low: float, high: float) -> dict[str, Any]:
    """Aggregate one property's reviews into scores, buckets, themes and keywords.

    Only reviews whose ``date`` falls on or after the start of the analysis
    window are counted. The window ends at ``payload["fetched_at"]`` (an ISO
    timestamp), or at the current local time when that value is missing or
    unparseable. Its length in days is read from ``payload["_analysis_days"]``
    (default 365).

    Args:
        payload: Decoded input JSON. Keys read here: ``reviews``,
            ``fetched_at``, ``_analysis_days``, ``property_id``,
            ``month_count`` and ``review_statistics``.
        low: Overall score at or below this is bucketed as "negative".
        high: Overall score at or above this is bucketed as "positive".

    Returns:
        A dict with the keys ``property_id``, ``month_count``,
        ``total_reviews``, ``since_date``, ``bucket_counts``,
        ``overall_mean``, ``overall_median``, ``subscore_avgs``,
        ``theme_counts``, ``top_keywords`` and ``review_statistics``.
        Means and medians are ``None`` when no usable score was found.

    Raises:
        ValueError / TypeError: if ``_analysis_days`` cannot be converted
            with ``int()``.
    """
    fetched_at = payload.get("fetched_at")
    ref_dt = None
    if isinstance(fetched_at, str):
        # datetime.fromisoformat() only accepts a trailing "Z" from Python
        # 3.11 onwards. Rewriting it as an explicit UTC offset keeps older
        # interpreters from silently falling back to "now".
        iso_text = fetched_at[:-1] + "+00:00" if fetched_at.endswith("Z") else fetched_at
        try:
            ref_dt = datetime.fromisoformat(iso_text)
        except ValueError:
            ref_dt = None
    if ref_dt is None:
        ref_dt = datetime.now()

    days = int(payload.get("_analysis_days", 365))
    since_date = ref_dt.date() - timedelta(days=days)

    sub_keys = ["safety", "location", "staff", "atmosphere", "cleanliness", "facilities", "value"]
    subs: dict[str, list[float]] = {k: [] for k in sub_keys}

    # Themes and keywords are tracked only for reviews with a known score.
    scored_buckets = ["positive", "neutral", "negative"]
    theme_counts: dict[str, Counter[str]] = {b: Counter() for b in scored_buckets}
    keyword_counts: dict[str, Counter[str]] = {b: Counter() for b in scored_buckets}

    bucket_counts: Counter[str] = Counter()
    overall_clean: list[float] = []
    total_reviews = 0

    for review in payload.get("reviews") or []:
        # Entries that are not JSON objects cannot carry a date or rating.
        if not isinstance(review, dict):
            continue
        reviewed_on = _review_date(review)
        if reviewed_on is None or reviewed_on < since_date:
            continue
        total_reviews += 1

        # Compute score and bucket once per review and reuse them below.
        overall = get_overall(review)
        if overall is not None:
            overall_clean.append(overall)
        label = bucket(overall, low=low, high=high)
        bucket_counts[label] += 1

        rating = review.get("rating")
        if isinstance(rating, dict):
            for key in sub_keys:
                value = rating.get(key)
                # bool is an int subclass; do not count true/false as 1/0.
                if isinstance(value, (int, float)) and not isinstance(value, bool):
                    subs[key].append(float(value))

        if label in theme_counts:
            text = get_text(review)
            # detect_themes() returns a set, so each theme counts once per review.
            theme_counts[label].update(detect_themes(text))
            keyword_counts[label].update(tokenize(text))

    sub_avgs = {k: (mean(v) if v else None) for k, v in subs.items()}

    return {
        "property_id": payload.get("property_id"),
        "month_count": payload.get("month_count"),
        "total_reviews": total_reviews,
        "since_date": since_date.isoformat(),
        "bucket_counts": dict(bucket_counts),
        "overall_mean": mean(overall_clean) if overall_clean else None,
        "overall_median": median(overall_clean) if overall_clean else None,
        "subscore_avgs": sub_avgs,
        "theme_counts": {k: dict(v) for k, v in theme_counts.items()},
        "top_keywords": {k: keyword_counts[k].most_common(25) for k in keyword_counts},
        "review_statistics": payload.get("review_statistics") or {},
    }
|
|
|
|
|
|
def main() -> int:
    """Run the CLI: summarize each input file and write one Markdown report.

    Each ``--in`` file is loaded as JSON, summarized with ``summarize()``
    and rendered as a section of the report written to ``--out`` (parent
    directories are created as needed). The output path is printed on
    success.

    Returns:
        0 on success.

    Raises:
        ValueError: if more ``--label`` values than ``--in`` files are given
            (raised by the strict ``zip``).
    """
    args = parse_args()

    labels = list(args.label or [])
    # Inputs without an explicit --label fall back to the file's stem.
    labels.extend(Path(p).stem for p in args.inputs[len(labels):])

    summaries = []
    for path, label in zip(args.inputs, labels, strict=True):
        payload = json.loads(Path(path).read_text(encoding="utf-8"))
        # summarize() reads the window length from the payload itself.
        payload["_analysis_days"] = args.days
        s = summarize(payload, low=args.low_threshold, high=args.high_threshold)
        s["label"] = label
        summaries.append(s)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # The headings used to say "12 months" / "12m" even when --days selected
    # a different window. The default (365 days) keeps the original wording.
    if args.days == 365:
        window_title, window_short = "last 12 months", "12m"
    else:
        window_title, window_short = f"last {args.days} days", f"{args.days}d"

    lines: list[str] = [
        f"# Hostelworld review themes ({window_title})",
        "",
        f"- Generated: {datetime.now().isoformat(timespec='seconds')}",
        f"- Window: last {args.days} days",
        f"- Negative threshold: <= {args.low_threshold:.0f}/100",
        f"- Positive threshold: >= {args.high_threshold:.0f}/100",
        "",
    ]

    for s in summaries:
        lines.append(f"## {s['label']}")
        lines.append("")
        lines.append("| Metric | Value |")
        lines.append("|---|---|")
        lines.append(f"| Window since | {s['since_date']} |")
        lines.append(f"| Reviews ({window_short}) | {s['total_reviews']} |")
        for title, key in (
            ("Overall mean (/100)", "overall_mean"),
            ("Overall median (/100)", "overall_median"),
        ):
            value = s[key]
            shown = "n/a" if value is None else f"{value:.1f}"
            lines.append(f"| {title} | {shown} |")
        bc = s["bucket_counts"]
        lines.append(
            f"| Positive / Neutral / Negative | {bc.get('positive',0)} / {bc.get('neutral',0)} / {bc.get('negative',0)} |"
        )
        lines.append("")

        sub = s["subscore_avgs"]
        lines.append("| Subscore (avg/10) | Score |")
        lines.append("|---|---|")
        for k in ["cleanliness", "facilities", "staff", "atmosphere", "safety", "location", "value"]:
            lines.append(f"| {k} | {fmt10(sub.get(k))} |")
        lines.append("")

        lines.append(f"### Themes (mentions, min {args.min_theme_count})")
        lines.append("")
        for bucket_name in ["negative", "neutral", "positive"]:
            # Drop themes mentioned fewer than --min-theme-count times.
            counts = Counter(
                {
                    theme: cnt
                    for theme, cnt in s["theme_counts"].get(bucket_name, {}).items()
                    if cnt >= args.min_theme_count
                }
            )
            if not counts:
                continue
            lines.append(f"**{bucket_name}**")
            for theme, cnt in counts.most_common():
                lines.append(f"- {theme}: {cnt}")
            lines.append("")

        lines.append("### Top keywords (sanity check)")
        lines.append("")
        # Neutral keywords are intentionally not listed in this section.
        for bucket_name in ["negative", "positive"]:
            kws = s["top_keywords"].get(bucket_name, [])
            if not kws:
                continue
            lines.append(f"**{bucket_name}**")
            lines.append(", ".join([f"{w} ({c})" for w, c in kws[:15]]))
            lines.append("")

    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(out_path)
    return 0
|
|
|
|
|
|
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|