diff --git a/verify/.gitignore b/verify/.gitignore new file mode 100644 index 0000000..518e365 --- /dev/null +++ b/verify/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +*.pyc +.venv/ + diff --git a/verify/README.md b/verify/README.md new file mode 100644 index 0000000..b7e339c --- /dev/null +++ b/verify/README.md @@ -0,0 +1,28 @@ +# Verify (crawler-view audit) + +This folder contains a lightweight SEO audit that fetches external listing/profile URLs using a Googlebot User-Agent, then extracts: + +- Booking.com / TripAdvisor: rating, review count, badges (JSON-LD first) +- Instagram / Facebook / TikTok: follower count + last post date hints (meta tags) +- Policy check: searches page text for "New Year's Eve", "minimum stay", "sold out" + +## Setup + +```bash +python3 -m venv .venv +. .venv/bin/activate +pip install -r verify/requirements.txt +``` + +## Run + +```bash +python verify/tools/audit_listings.py --platform auto +python verify/tools/audit_listings.py --platform auto --include-jsonld +``` + +To save output for review: + +```bash +python verify/tools/audit_listings.py --platform auto > verify/results/audit.jsonl +``` diff --git a/verify/ho36/raw/ho36__booking__ho36_query__20260102.html b/verify/ho36/raw/ho36__booking__ho36_query__20260102.html new file mode 100644 index 0000000..ac473cd --- /dev/null +++ b/verify/ho36/raw/ho36__booking__ho36_query__20260102.html @@ -0,0 +1,225 @@ + + Booking.com | Official site | The best hotels, flights, car rentals & accommodations + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Find your next stay

Search low prices on hotels, homes and much more...

Why Booking.com?

Book now, pay at the property

FREE cancellation on most rooms

300M+ reviews from fellow travellers

Get trusted information from guests like you

2+ million properties worldwide

Hotels, guest houses, apartments, and more…

Trusted customer service you can rely on, 24/7

We're always here to help

Offers

Promotions, deals and special offers for you
Early 2026 Deals

At least 15% off

Save on your next stay with Early 2026 Deals. Book now, stay until 1 April 2026.
A woman sitting on a small sunny balcony, holding a cup and smiling and with her legs pulled up onto the chair, with a table beside her holding a plant and mobile devices.
Late Escape Deals

Go for a good time, not a long time

Squeeze out the last bit of sun +with at least 15% off
A father and child together in a hammock, watching a beautiful sunrise

Travel more, spend less

Sign in, save money

Save 10% or more at participating properties - just look for the blue Genius label

Popular with travellers from Canada

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/verify/ho36/screenshots/ho36__booking__ho36_query__20260102.png b/verify/ho36/screenshots/ho36__booking__ho36_query__20260102.png new file mode 100644 index 0000000..1dcc2ee Binary files /dev/null and b/verify/ho36/screenshots/ho36__booking__ho36_query__20260102.png differ diff --git a/verify/requirements.txt b/verify/requirements.txt new file mode 100644 index 0000000..2c72b29 --- /dev/null +++ b/verify/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.32.0 +beautifulsoup4>=4.12.0 + diff --git a/verify/results/flaneur_googlebot_audit.jsonl b/verify/results/flaneur_googlebot_audit.jsonl new file mode 100644 index 0000000..507f5f4 --- /dev/null +++ b/verify/results/flaneur_googlebot_audit.jsonl @@ -0,0 +1,12 @@ +{"url": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/", "platform": "policy", "fetched_at": "2026-01-02T20:23:18+00:00", "status_code": 200, "final_url": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://leflaneur-guesthouse.com/", "platform": "policy", "fetched_at": "2026-01-02T20:23:19+00:00", "status_code": 200, "final_url": "https://leflaneur-guesthouse.com/", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://leflaneur-guesthouse.com/dormir", "platform": "policy", "fetched_at": "2026-01-02T20:23:19+00:00", "status_code": 200, "final_url": "https://leflaneur-guesthouse.com/dormir", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.booking.com/searchresults.html?ss=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon", "platform": "booking", "fetched_at": "2026-01-02T20:23:20+00:00", "status_code": 202, "final_url": "https://www.booking.com/searchresults.html", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": "3962", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}} +{"url": "https://www.facebook.com/leflaneurlyon", "platform": "facebook", "fetched_at": "2026-01-02T20:23:21+00:00", "status_code": 200, "final_url": "https://m.facebook.com/login/?next=https%3A%2F%2Fm.facebook.com%2Fleflaneurlyon%2F", "ok": true, "error": null, "data": {"content_type": "text/html; charset=utf-8", "content_length": null, "meta_description": "登录 Facebook,与好友、家人和认识的人分享和建立联系。", "follower_count": null, "last_post_date": null, "og_title": "登录 Facebook | Facebook", "og_url": "https://www.facebook.com/"}} +{"url": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v", "platform": "policy", "fetched_at": "2026-01-02T20:23:23+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon", "platform": "policy", "fetched_at": "2026-01-02T20:23:25+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.hostelworld.com/hostels/p/100844/le-flaneur-guesthouse/", "platform": "policy", "fetched_at": "2026-01-02T20:23:27+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/p/100844/le-flaneur-guesthouse/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.hostelworld.com/st/hostels/lyon/", "platform": "policy", "fetched_at": "2026-01-02T20:23:28+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.instagram.com/leflaneur_gh/", "platform": "instagram", "fetched_at": "2026-01-02T20:23:30+00:00", "status_code": 200, "final_url": "https://www.instagram.com/leflaneur_gh/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=\"utf-8\"", "content_length": null, "meta_description": "2,296 Followers, 889 Following, 742 Posts - Le Flâneur Guesthouse (@leflaneur_gh) on Instagram: \"🍺☕ Bar & café\n🏠Independent / Alternative / Participative /Ecofriendly / Homely hostel in the heart of Guillotière, Lyon📍\"", "follower_count": {"raw": "2,296", "value": 2296}, "last_post_date": null, "og_title": "Le Flâneur Guesthouse (@leflaneur_gh) • Instagram photos and videos", "og_url": "https://www.instagram.com/leflaneur_gh/"}} +{"url": "https://www.tiktok.com/search?q=le%20flaneur%20guesthouse%20lyon", "platform": "tiktok", "fetched_at": "2026-01-02T20:23:32+00:00", "status_code": 403, "final_url": "https://www.tiktok.com/search?q=le%20flaneur%20guesthouse%20lyon", "ok": true, "error": null, "data": {"content_type": "text/plain; charset=utf-8", "content_length": "9", "meta_description": null, "follower_count": null, "last_post_date": null, "og_title": null, "og_url": null}} +{"url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "platform": "tripadvisor", "fetched_at": "2026-01-02T20:23:32+00:00", "status_code": 403, "final_url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": "775", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}} diff --git a/verify/results/ho36_googlebot_audit.jsonl b/verify/results/ho36_googlebot_audit.jsonl new file mode 100644 index 0000000..953242f --- /dev/null +++ b/verify/results/ho36_googlebot_audit.jsonl @@ -0,0 +1,11 @@ +{"url": "https://ho36lyon.com/", "platform": "policy", "fetched_at": "2026-01-02T20:24:02+00:00", "status_code": 200, "final_url": "https://ho36lyon.com/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": "35483", "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://ho36lyon.com/en/", "platform": "policy", "fetched_at": "2026-01-02T20:24:04+00:00", "status_code": 200, "final_url": "https://ho36lyon.com/en/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": "35743", "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://maps.app.goo.gl/vfGnGGQxJBNwvdgX8", "platform": "policy", "fetched_at": "2026-01-02T20:24:05+00:00", "status_code": 200, "final_url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts&g_ep=EgoyMDI0MTAwMi4xIPu8ASoASAFQAw%3D%3D", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.booking.com/hotel/fr/ho36-hostel.html", "platform": "booking", "fetched_at": "2026-01-02T20:24:07+00:00", "status_code": 202, "final_url": "https://www.booking.com/hotel/fr/ho36-hostel.html", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": "3962", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}} +{"url": "https://www.facebook.com/ho36hotels/", "platform": "facebook", "fetched_at": "2026-01-02T20:24:07+00:00", "status_code": 200, "final_url": "https://m.facebook.com/login/?next=https%3A%2F%2Fm.facebook.com%2Fho36hotels%2F", "ok": true, "error": null, "data": {"content_type": "text/html; charset=utf-8", "content_length": null, "meta_description": "登录 Facebook,与好友、家人和认识的人分享和建立联系。", "follower_count": null, "last_post_date": null, "og_title": "登录 Facebook | Facebook", "og_url": "https://www.facebook.com/"}} +{"url": "https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d11135.645489923992!2d4.84204!3d45.7529227!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0%3A0xb36a1c20ef67ead4!2sho36%20Lyon%20Guilloti%C3%A8re!5e0!3m2!1sfr!2sfr!4v1567089009427!5m2!1sfr!2sfr", "platform": "policy", "fetched_at": "2026-01-02T20:24:08+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d11135.645489923992!2d4.84204!3d45.7529227!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0%3A0xb36a1c20ef67ead4!2sho36%20Lyon%20Guilloti%C3%A8re!5e0!3m2!1sfr!2sfr!4v1567089009427!5m2!1sfr!2sfr", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts", "platform": "policy", "fetched_at": "2026-01-02T20:24:08+00:00", "status_code": 200, "final_url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "platform": "policy", "fetched_at": "2026-01-02T20:24:10+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.hostelworld.com/hostels/p/270217/ho36-hostel/", "platform": "policy", "fetched_at": "2026-01-02T20:24:11+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/p/270217/ho36-hostel/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}} +{"url": "https://www.instagram.com/ho36hotel_lyon/", "platform": "instagram", "fetched_at": "2026-01-02T20:24:12+00:00", "status_code": 200, "final_url": "https://www.instagram.com/ho36hotel_lyon/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=\"utf-8\"", "content_length": null, "meta_description": "3,247 Followers, 615 Following, 108 Posts - ho36 Lyon (@ho36hotel_lyon) on Instagram: \"💙Ho(s)tel\n⚡️Lieu de vie et rencontres entre voyageurs & locaux ☕️\"", "follower_count": {"raw": "3,247", "value": 3247}, "last_post_date": null, "og_title": "ho36 Lyon (@ho36hotel_lyon) • Instagram photos and videos", "og_url": "https://www.instagram.com/ho36hotel_lyon/"}} +{"url": "https://www.tripadvisor.fr/Hotel_Review-g187265-d293643-Reviews-Ho36_Hostel-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "platform": "tripadvisor", "fetched_at": "2026-01-02T20:24:13+00:00", "status_code": 403, "final_url": "https://www.tripadvisor.fr/Hotel_Review-g187265-d293643-Reviews-Ho36_Hostel-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": "774", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}} diff --git a/verify/tools/audit_listings.py b/verify/tools/audit_listings.py new file mode 100644 index 0000000..ad0344f --- /dev/null +++ b/verify/tools/audit_listings.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import re +from dataclasses import asdict, dataclass +from datetime import datetime, timezone +from typing import Any, Iterable, Literal +from urllib.parse import urlparse + +import requests +try: + from bs4 import BeautifulSoup +except ImportError as exc: # pragma: no cover + raise SystemExit( + "Missing dependency 'beautifulsoup4'. Install with:\n" + " python3 -m pip install beautifulsoup4\n" + "or (recommended) inside a venv:\n" + " python3 -m venv .venv && . .venv/bin/activate && pip install beautifulsoup4\n" + ) from exc + + +Platform = Literal[ + "auto", + "booking", + "tripadvisor", + "instagram", + "facebook", + "tiktok", + "policy", +] + +GOOGLEBOT_UA = ( + "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/41.0.2272.96 Mobile Safari/537.36 " + "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)" +) + + +@dataclass(frozen=True) +class AuditResult: + url: str + platform: str + fetched_at: str + status_code: int | None + final_url: str | None + ok: bool + error: str | None + data: dict[str, Any] + + +def iso_now() -> str: + return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds") + + +def detect_platform(url: str) -> Platform: + host = (urlparse(url).netloc or "").lower() + if "booking.com" in host: + return "booking" + if "tripadvisor." in host or "tripadvisor.com" in host: + return "tripadvisor" + if "instagram.com" in host: + return "instagram" + if "facebook.com" in host or host.endswith("fb.com"): + return "facebook" + if "tiktok.com" in host: + return "tiktok" + return "policy" + + +def normalize_platform(platform: str) -> Platform | None: + key = re.sub(r"[^a-z0-9]+", "", (platform or "").strip().lower()) + if key in ("", "auto", "detect"): + return "auto" + if key in ("booking", "bookingcom"): + return "booking" + if key in ("tripadvisor", "tripadvisorcom", "tripadvisorfr", "tripadvisoruk"): + return "tripadvisor" + if key in ("instagram", "insta", "ig"): + return "instagram" + if key in ("facebook", "fb"): + return "facebook" + if key in ("tiktok", "tik", "tik_tok", "ticktok"): + return "tiktok" + if key in ("policy", "site", "general"): + return "policy" + return None + + +def normalize_whitespace(text: str) -> str: + return re.sub(r"\s+", " ", text).strip() + + +def parse_human_number(raw: str) -> int | None: + s = raw.strip().replace("\u202f", "").replace("\xa0", "").replace(" ", "") + match = re.match(r"^(?P\d+(?:[.,]\d+)?)(?P[KkMmBb])?$", s) + if not match: + return None + + num_part = match.group("num") + suffix = (match.group("suffix") or "").upper() + + if "," in num_part and "." in num_part: + num_part = num_part.replace(",", "") + elif "," in num_part and "." not in num_part: + parts = num_part.split(",") + if len(parts) > 1 and len(parts[-1]) == 3: + num_part = "".join(parts) + else: + num_part = num_part.replace(",", ".") + + try: + value = float(num_part) + except ValueError: + return None + + multiplier = {"": 1, "K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix) + if multiplier is None: + return None + return int(round(value * multiplier)) + + +def fetch_url( + session: requests.Session, + url: str, + *, + timeout_s: float, + user_agent: str, +) -> tuple[requests.Response | None, str | None]: + try: + resp = session.get( + url, + headers={ + "User-Agent": user_agent, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + }, + timeout=timeout_s, + allow_redirects=True, + ) + return resp, None + except requests.RequestException as exc: + return None, str(exc) + + +def collect_meta(soup: BeautifulSoup) -> dict[str, list[str]]: + out: dict[str, list[str]] = {} + for tag in soup.find_all("meta"): + key = tag.get("property") or tag.get("name") + if not key: + continue + content = tag.get("content") + if not content: + continue + out.setdefault(key.strip().lower(), []).append(content.strip()) + return out + + +def first_meta(meta: dict[str, list[str]], keys: Iterable[str]) -> str | None: + for key in keys: + values = meta.get(key.lower()) + if values: + return values[0] + return None + + +def clean_jsonld_text(raw: str) -> str: + s = raw.strip() + s = re.sub(r"^\s*\s*$", "", s) + s = re.sub(r"^\s*/\*+\s*\s*\*/\s*$", "", s) + return s.strip() + + +def extract_jsonld_objects(soup: BeautifulSoup) -> tuple[list[Any], list[dict[str, str]]]: + objects: list[Any] = [] + errors: list[dict[str, str]] = [] + for script in soup.find_all("script", attrs={"type": re.compile(r"^application/ld\+json$", re.I)}): + raw = script.string or script.get_text() or "" + raw = clean_jsonld_text(raw) + if not raw: + continue + try: + objects.append(json.loads(raw)) + continue + except json.JSONDecodeError: + pass + + try: + patched = "[" + re.sub(r"}\s*{", "},{", raw) + "]" + objects.append(json.loads(patched)) + except json.JSONDecodeError as exc: + errors.append({"error": str(exc), "snippet": raw[:400]}) + return objects, errors + + +def iter_dicts(obj: Any) -> Iterable[dict[str, Any]]: + if isinstance(obj, dict): + yield obj + graph = obj.get("@graph") + if isinstance(graph, list): + for item in graph: + yield from iter_dicts(item) + for value in obj.values(): + yield from iter_dicts(value) + elif isinstance(obj, list): + for item in obj: + yield from iter_dicts(item) + + +def coerce_float(value: Any) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + v = value.strip().replace(",", ".") + try: + return float(v) + except ValueError: + return None + return None + + +def coerce_int(value: Any) -> int | None: + if value is None: + return None + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + if isinstance(value, str): + v = re.sub(r"[^\d]", "", value) + return int(v) if v else None + return None + + +def pick_best_aggregate_rating(jsonld: list[Any]) -> dict[str, Any] | None: + candidates: list[tuple[int, float, dict[str, Any]]] = [] + for obj in jsonld: + for d in iter_dicts(obj): + agg = d.get("aggregateRating") + if not isinstance(agg, dict): + continue + rating_value = coerce_float(agg.get("ratingValue")) + if rating_value is None: + continue + review_count = coerce_int(agg.get("reviewCount") or agg.get("ratingCount")) + score = (review_count or 0) * 10 + int(round(rating_value * 100)) + candidates.append( + ( + score, + rating_value, + { + "rating_value": rating_value, + "review_count": review_count, + "best_rating": coerce_float(agg.get("bestRating")), + "worst_rating": coerce_float(agg.get("worstRating")), + "source": "jsonld", + "aggregate_rating": agg, + "parent_types": d.get("@type"), + }, + ) + ) + candidates.sort(reverse=True) + return candidates[0][2] if candidates else None + + +def extract_awards(jsonld: list[Any]) -> list[str]: + awards: list[str] = [] + seen: set[str] = set() + for obj in jsonld: + for d in iter_dicts(obj): + raw = d.get("award") or d.get("awards") + if isinstance(raw, str): + items = [raw] + elif isinstance(raw, list): + items = [x for x in raw if isinstance(x, str)] + else: + items = [] + for item in items: + cleaned = normalize_whitespace(item) + if not cleaned or cleaned.lower() in seen: + continue + seen.add(cleaned.lower()) + awards.append(cleaned) + return awards + + +BADGE_KEYWORDS = [ + "travellers' choice", + "travelers' choice", + "traveller review award", + "traveler review award", + "greenleaders", + "green leader", + "travel sustainable", + "preferred partner", + "genius", + "key collection", +] + + +def extract_badges_from_html(soup: BeautifulSoup) -> list[str]: + text = normalize_whitespace(soup.get_text(" ", strip=True)) + lowered = text.lower() + hits: list[str] = [] + for keyword in BADGE_KEYWORDS: + if keyword in lowered: + hits.append(keyword) + return sorted(set(hits)) + + +def extract_followers_from_description(description: str) -> dict[str, Any] | None: + patterns = [ + r"(?P\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+followers?\b", + r"(?P\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+abonn[eé]s?\b", + r"(?P\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+people\s+like\s+this\b", + r"(?P\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+likes\b", + ] + lowered = description.lower() + for pattern in patterns: + match = re.search(pattern, lowered, flags=re.IGNORECASE) + if not match: + continue + raw = match.group("count") + return {"raw": raw, "value": parse_human_number(raw)} + return None + + +DATE_META_KEYS = [ + "article:published_time", + "article:modified_time", + "og:published_time", + "og:updated_time", + "og:video:release_date", + "last-modified", +] + + +def parse_first_iso_datetime(values: Iterable[str]) -> str | None: + for value in values: + v = value.strip() + try: + return datetime.fromisoformat(v).isoformat() + except ValueError: + continue + return None + + +def audit_booking_or_tripadvisor( + soup: BeautifulSoup, + *, + include_jsonld: bool, +) -> dict[str, Any]: + jsonld, jsonld_errors = extract_jsonld_objects(soup) + rating = pick_best_aggregate_rating(jsonld) + awards = extract_awards(jsonld) + badges = sorted(set(awards + extract_badges_from_html(soup))) + + data: dict[str, Any] = { + "rating": rating, + "review_count": rating.get("review_count") if rating else None, + "badges": badges, + "jsonld_count": len(jsonld), + "jsonld_parse_errors": jsonld_errors, + } + if include_jsonld: + data["jsonld"] = jsonld + return data + + +def audit_social( + soup: BeautifulSoup, +) -> dict[str, Any]: + meta = collect_meta(soup) + description = first_meta(meta, ["description", "og:description", "twitter:description"]) or "" + followers = extract_followers_from_description(description) if description else None + + raw_date = first_meta(meta, DATE_META_KEYS) + last_post_date = parse_first_iso_datetime([raw_date]) if raw_date else None + + return { + "meta_description": description or None, + "follower_count": followers, + "last_post_date": {"raw": raw_date, "value": last_post_date} if (raw_date or last_post_date) else None, + "og_title": first_meta(meta, ["og:title"]), + "og_url": first_meta(meta, ["og:url"]), + } + + +POLICY_TERMS = [ + "New Year's Eve", + "minimum stay", + "sold out", +] + + +def find_term_snippets(text: str, term: str, *, max_hits: int = 3, context: int = 60) -> list[str]: + pattern = re.escape(term) + if term.lower() == "new year's eve": + pattern = r"new\s+year(?:'|\u2019)?s?\s+eve" + regex = re.compile(pattern, flags=re.IGNORECASE) + + snippets: list[str] = [] + for match in regex.finditer(text): + start = max(0, match.start() - context) + end = min(len(text), match.end() + context) + snippets.append(normalize_whitespace(text[start:end])) + if len(snippets) >= max_hits: + break + return snippets + + +def audit_policy(soup: BeautifulSoup) -> dict[str, Any]: + text = normalize_whitespace(soup.get_text(" ", strip=True)) + checks: list[dict[str, Any]] = [] + for term in POLICY_TERMS: + snippets = find_term_snippets(text, term) + checks.append({"term": term, "found": bool(snippets), "snippets": snippets}) + return {"policy_checks": checks} + + +def audit_listing( + url: str, + platform: str, + *, + timeout_s: float = 25.0, + user_agent: str = GOOGLEBOT_UA, + include_jsonld: bool = False, +) -> AuditResult: + normalized = normalize_platform(platform) + resolved_platform: Platform + if normalized is None or normalized == "auto": + resolved_platform = detect_platform(url) + else: + resolved_platform = normalized + fetched_at = iso_now() + + session = requests.Session() + resp, error = fetch_url(session, url, timeout_s=timeout_s, user_agent=user_agent) + if error or resp is None: + return AuditResult( + url=url, + platform=resolved_platform, + fetched_at=fetched_at, + status_code=None, + final_url=None, + ok=False, + error=error or "unknown error", + data={}, + ) + + content_type = (resp.headers.get("Content-Type") or "").lower() + html = resp.text if "html" in content_type or " argparse.Namespace: + parser = argparse.ArgumentParser( + description="Audit listing pages using a Googlebot user-agent (ratings/reviews/badges, followers, policy text)." + ) + parser.add_argument( + "--platform", + default="auto", + help="Platform hint (auto/booking/tripadvisor/instagram/facebook/tiktok/policy).", + ) + parser.add_argument("--timeout", type=float, default=25.0) + parser.add_argument("--user-agent", default=GOOGLEBOT_UA) + parser.add_argument("--include-jsonld", action="store_true", help="Include parsed JSON-LD blobs in output.") + parser.add_argument("urls", nargs="+", help="One or more URLs to audit") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + exit_code = 0 + for url in args.urls: + result = audit_listing( + url, + args.platform, + timeout_s=args.timeout, + user_agent=args.user_agent, + include_jsonld=bool(args.include_jsonld), + ) + os.write(1, (json.dumps(asdict(result), ensure_ascii=False) + "\n").encode("utf-8")) + if not result.ok: + exit_code = 2 + return exit_code + + +if __name__ == "__main__": + raise SystemExit(main())