Add Googlebot listing audit (verify)

This commit is contained in:
root 2026-01-02 21:56:46 +00:00
parent 8d04a67000
commit aac9108158
8 changed files with 804 additions and 0 deletions

4
verify/.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
__pycache__/
*.pyc
.venv/

28
verify/README.md Normal file
View file

@ -0,0 +1,28 @@
# Verify (crawler-view audit)
This folder contains a lightweight SEO audit that fetches external listing/profile URLs using a Googlebot User-Agent, then extracts:
- Booking.com / TripAdvisor: rating, review count, badges (JSON-LD first)
- Instagram / Facebook / TikTok: follower count + last post date hints (meta tags)
- Policy check (any other URL): searches the page text, case-insensitively, for "New Year's Eve", "minimum stay", and "sold out", and reports a short snippet around each match
## Setup
```bash
python3 -m venv .venv
. .venv/bin/activate
pip install -r verify/requirements.txt
```
## Run
```bash
python verify/tools/audit_listings.py --platform auto <url1> <url2>
python verify/tools/audit_listings.py --platform auto --include-jsonld <url>
```
To save the output (JSON Lines: one JSON object per audited URL) for review:
```bash
python verify/tools/audit_listings.py --platform auto <url1> <url2> > verify/results/audit.jsonl
```

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

3
verify/requirements.txt Normal file
View file

@ -0,0 +1,3 @@
requests>=2.32.0
beautifulsoup4>=4.12.0

View file

@ -0,0 +1,12 @@
{"url": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/", "platform": "policy", "fetched_at": "2026-01-02T20:23:18+00:00", "status_code": 200, "final_url": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://leflaneur-guesthouse.com/", "platform": "policy", "fetched_at": "2026-01-02T20:23:19+00:00", "status_code": 200, "final_url": "https://leflaneur-guesthouse.com/", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://leflaneur-guesthouse.com/dormir", "platform": "policy", "fetched_at": "2026-01-02T20:23:19+00:00", "status_code": 200, "final_url": "https://leflaneur-guesthouse.com/dormir", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.booking.com/searchresults.html?ss=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon", "platform": "booking", "fetched_at": "2026-01-02T20:23:20+00:00", "status_code": 202, "final_url": "https://www.booking.com/searchresults.html", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": "3962", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}
{"url": "https://www.facebook.com/leflaneurlyon", "platform": "facebook", "fetched_at": "2026-01-02T20:23:21+00:00", "status_code": 200, "final_url": "https://m.facebook.com/login/?next=https%3A%2F%2Fm.facebook.com%2Fleflaneurlyon%2F", "ok": true, "error": null, "data": {"content_type": "text/html; charset=utf-8", "content_length": null, "meta_description": "登录 Facebook与好友、家人和认识的人分享和建立联系。", "follower_count": null, "last_post_date": null, "og_title": "登录 Facebook | Facebook", "og_url": "https://www.facebook.com/"}}
{"url": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v", "platform": "policy", "fetched_at": "2026-01-02T20:23:23+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon", "platform": "policy", "fetched_at": "2026-01-02T20:23:25+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.hostelworld.com/hostels/p/100844/le-flaneur-guesthouse/", "platform": "policy", "fetched_at": "2026-01-02T20:23:27+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/p/100844/le-flaneur-guesthouse/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.hostelworld.com/st/hostels/lyon/", "platform": "policy", "fetched_at": "2026-01-02T20:23:28+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.instagram.com/leflaneur_gh/", "platform": "instagram", "fetched_at": "2026-01-02T20:23:30+00:00", "status_code": 200, "final_url": "https://www.instagram.com/leflaneur_gh/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=\"utf-8\"", "content_length": null, "meta_description": "2,296 Followers, 889 Following, 742 Posts - Le Flâneur Guesthouse (@leflaneur_gh) on Instagram: \"🍺☕ Bar & café\n🏠Independent / Alternative / Participative /Ecofriendly / Homely hostel in the heart of Guillotière, Lyon📍\"", "follower_count": {"raw": "2,296", "value": 2296}, "last_post_date": null, "og_title": "Le Flâneur Guesthouse (@leflaneur_gh) • Instagram photos and videos", "og_url": "https://www.instagram.com/leflaneur_gh/"}}
{"url": "https://www.tiktok.com/search?q=le%20flaneur%20guesthouse%20lyon", "platform": "tiktok", "fetched_at": "2026-01-02T20:23:32+00:00", "status_code": 403, "final_url": "https://www.tiktok.com/search?q=le%20flaneur%20guesthouse%20lyon", "ok": true, "error": null, "data": {"content_type": "text/plain; charset=utf-8", "content_length": "9", "meta_description": null, "follower_count": null, "last_post_date": null, "og_title": null, "og_url": null}}
{"url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "platform": "tripadvisor", "fetched_at": "2026-01-02T20:23:32+00:00", "status_code": 403, "final_url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": "775", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}

View file

@ -0,0 +1,11 @@
{"url": "https://ho36lyon.com/", "platform": "policy", "fetched_at": "2026-01-02T20:24:02+00:00", "status_code": 200, "final_url": "https://ho36lyon.com/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": "35483", "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://ho36lyon.com/en/", "platform": "policy", "fetched_at": "2026-01-02T20:24:04+00:00", "status_code": 200, "final_url": "https://ho36lyon.com/en/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": "35743", "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://maps.app.goo.gl/vfGnGGQxJBNwvdgX8", "platform": "policy", "fetched_at": "2026-01-02T20:24:05+00:00", "status_code": 200, "final_url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts&g_ep=EgoyMDI0MTAwMi4xIPu8ASoASAFQAw%3D%3D", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.booking.com/hotel/fr/ho36-hostel.html", "platform": "booking", "fetched_at": "2026-01-02T20:24:07+00:00", "status_code": 202, "final_url": "https://www.booking.com/hotel/fr/ho36-hostel.html", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": "3962", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}
{"url": "https://www.facebook.com/ho36hotels/", "platform": "facebook", "fetched_at": "2026-01-02T20:24:07+00:00", "status_code": 200, "final_url": "https://m.facebook.com/login/?next=https%3A%2F%2Fm.facebook.com%2Fho36hotels%2F", "ok": true, "error": null, "data": {"content_type": "text/html; charset=utf-8", "content_length": null, "meta_description": "登录 Facebook与好友、家人和认识的人分享和建立联系。", "follower_count": null, "last_post_date": null, "og_title": "登录 Facebook | Facebook", "og_url": "https://www.facebook.com/"}}
{"url": "https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d11135.645489923992!2d4.84204!3d45.7529227!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0%3A0xb36a1c20ef67ead4!2sho36%20Lyon%20Guilloti%C3%A8re!5e0!3m2!1sfr!2sfr!4v1567089009427!5m2!1sfr!2sfr", "platform": "policy", "fetched_at": "2026-01-02T20:24:08+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d11135.645489923992!2d4.84204!3d45.7529227!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0%3A0xb36a1c20ef67ead4!2sho36%20Lyon%20Guilloti%C3%A8re!5e0!3m2!1sfr!2sfr!4v1567089009427!5m2!1sfr!2sfr", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts", "platform": "policy", "fetched_at": "2026-01-02T20:24:08+00:00", "status_code": 200, "final_url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "platform": "policy", "fetched_at": "2026-01-02T20:24:10+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.hostelworld.com/hostels/p/270217/ho36-hostel/", "platform": "policy", "fetched_at": "2026-01-02T20:24:11+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/p/270217/ho36-hostel/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
{"url": "https://www.instagram.com/ho36hotel_lyon/", "platform": "instagram", "fetched_at": "2026-01-02T20:24:12+00:00", "status_code": 200, "final_url": "https://www.instagram.com/ho36hotel_lyon/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=\"utf-8\"", "content_length": null, "meta_description": "3,247 Followers, 615 Following, 108 Posts - ho36 Lyon (@ho36hotel_lyon) on Instagram: \"💙Ho(s)tel\n⚡Lieu de vie et rencontres entre voyageurs & locaux ☕️\"", "follower_count": {"raw": "3,247", "value": 3247}, "last_post_date": null, "og_title": "ho36 Lyon (@ho36hotel_lyon) • Instagram photos and videos", "og_url": "https://www.instagram.com/ho36hotel_lyon/"}}
{"url": "https://www.tripadvisor.fr/Hotel_Review-g187265-d293643-Reviews-Ho36_Hostel-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "platform": "tripadvisor", "fetched_at": "2026-01-02T20:24:13+00:00", "status_code": 403, "final_url": "https://www.tripadvisor.fr/Hotel_Review-g187265-d293643-Reviews-Ho36_Hostel-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": "774", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}

View file

@ -0,0 +1,521 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import re
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from typing import Any, Iterable, Literal
from urllib.parse import urlparse
import requests
try:
from bs4 import BeautifulSoup
except ImportError as exc: # pragma: no cover
raise SystemExit(
"Missing dependency 'beautifulsoup4'. Install with:\n"
" python3 -m pip install beautifulsoup4\n"
"or (recommended) inside a venv:\n"
" python3 -m venv .venv && . .venv/bin/activate && pip install beautifulsoup4\n"
) from exc
# Closed set of platform identifiers used by this script. "auto" makes
# audit_listing() infer the platform from the URL via detect_platform();
# "policy" is what detect_platform() returns when no known platform matches.
Platform = Literal[
    "auto",
    "booking",
    "tripadvisor",
    "instagram",
    "facebook",
    "tiktok",
    "policy",
]
# Default User-Agent header for every fetch. The string presents the client as
# a mobile browser carrying the "compatible; Googlebot/2.1" token; it is the
# default for both audit_listing(user_agent=...) and the --user-agent CLI flag.
GOOGLEBOT_UA = (
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.96 Mobile Safari/537.36 "
    "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
)
@dataclass(frozen=True)
class AuditResult:
    """Immutable outcome of auditing one URL.

    main() converts each instance with dataclasses.asdict() and emits it as a
    single JSON line.
    """

    # URL exactly as it was passed to audit_listing().
    url: str
    # Platform the audit ran as, after hint normalization / auto-detection.
    platform: str
    # Timestamp string produced by iso_now() before the request is sent.
    fetched_at: str
    # HTTP status of the response; None when no response was received.
    status_code: int | None
    # URL reported by the response (after redirects); None when no response was received.
    final_url: str | None
    # Whether the audit is considered successful; when False, `error` says why.
    ok: bool
    # Failure description, or None on success.
    error: str | None
    # Extracted fields (platform-specific); empty when no response was received.
    data: dict[str, Any]
def iso_now() -> str:
    """Return the current instant as an ISO-8601 string with second precision.

    The instant is taken in UTC and then converted to the machine's local
    timezone, so the rendered UTC offset is the local one.
    """
    utc_instant = datetime.now(timezone.utc)
    local_instant = utc_instant.astimezone()
    return local_instant.isoformat(timespec="seconds")
def detect_platform(url: str) -> Platform:
    """Infer the audit platform from the host name of *url*.

    Matching is done on whole domain labels rather than raw substrings, so
    look-alike hosts such as ``mybooking.com.example.org`` or ``surfb.com``
    are not mistaken for Booking.com / Facebook.

    Args:
        url: Absolute URL of the page to audit.

    Returns:
        One of "booking", "tripadvisor", "instagram", "facebook", "tiktok",
        or "policy" when the host matches none of the known platforms
        (including when *url* has no host at all).
    """
    # .hostname is lower-cased and excludes any port or userinfo part.
    host = (urlparse(url).hostname or "").rstrip(".")
    labels = host.split(".")

    def on_domain(domain: str) -> bool:
        # True for the registered domain itself and for any subdomain of it.
        return host == domain or host.endswith("." + domain)

    if on_domain("booking.com"):
        return "booking"
    # TripAdvisor is served from many country TLDs (.com, .fr, .co.uk, ...),
    # so look for the brand as a complete label that is followed by a suffix.
    if "tripadvisor" in labels[:-1]:
        return "tripadvisor"
    if on_domain("instagram.com"):
        return "instagram"
    if on_domain("facebook.com") or on_domain("fb.com"):
        return "facebook"
    if on_domain("tiktok.com"):
        return "tiktok"
    return "policy"
def normalize_platform(platform: str) -> Platform | None:
    """Map a free-form platform hint onto a canonical platform name.

    The hint is trimmed, lower-cased and stripped of every character outside
    ``[a-z0-9]`` before it is looked up, so "Booking.com", "booking-com" and
    "BOOKINGCOM" are all equivalent. An empty or missing hint means "auto".

    Returns:
        The canonical platform name, or None when the hint is not recognised.
    """
    aliases = {
        "auto": ("", "auto", "detect"),
        "booking": ("booking", "bookingcom"),
        "tripadvisor": ("tripadvisor", "tripadvisorcom", "tripadvisorfr", "tripadvisoruk"),
        "instagram": ("instagram", "insta", "ig"),
        "facebook": ("facebook", "fb"),
        # "tik_tok" can never equal the cleaned key (underscores are stripped
        # above); the input "tik_tok" still resolves through "tiktok".
        "tiktok": ("tiktok", "tik", "tik_tok", "ticktok"),
        "policy": ("policy", "site", "general"),
    }
    cleaned = (platform or "").strip().lower()
    key = re.sub(r"[^a-z0-9]+", "", cleaned)
    for canonical, names in aliases.items():
        if key in names:
            return canonical
    return None
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.compile(r"\s+").sub(" ", text)
    return collapsed.strip()
def _strip_grouping(digits: str, sep: str) -> str | None:
    """Return *digits* without its thousands separators, or None if the grouping is malformed."""
    grouped = r"\d{1,3}(?:" + re.escape(sep) + r"\d{3})+"
    if re.fullmatch(grouped, digits) is None:
        return None
    return digits.replace(sep, "")


def parse_human_number(raw: str) -> int | None:
    """Parse a human-formatted count such as "2,296", "12.5K" or "1,234,567".

    Supported forms:
      * plain digits, optionally followed by a K / M / B suffix (any case);
      * one or more thousands separators ("1,234,567", "1.234.567");
      * a thousands separator plus a decimal mark, where the right-most
        separator is taken as the decimal mark ("1,234.5K", "1.234,5K");
      * a single comma, read as a thousands separator when exactly three
        digits follow it and as a decimal mark otherwise;
      * a single dot, always read as a decimal mark.

    Regular, non-breaking and narrow non-breaking spaces are ignored.

    Args:
        raw: Text containing only the number (and optional suffix).

    Returns:
        The value rounded to an int, or None when *raw* is not a number in
        one of the supported forms.
    """
    s = raw.strip().replace("\u202f", "").replace("\xa0", "").replace(" ", "")
    match = re.match(r"^(?P<num>\d+(?:[.,]\d+)*)(?P<suffix>[KkMmBb])?$", s)
    if not match:
        return None
    num_part = match.group("num")
    suffix = (match.group("suffix") or "").upper()

    commas = num_part.count(",")
    dots = num_part.count(".")
    if commas and dots:
        # Mixed separators: the right-most one is the decimal mark, the other
        # one must form valid three-digit groups.
        decimal_sep = "," if num_part.rfind(",") > num_part.rfind(".") else "."
        group_sep = "." if decimal_sep == "," else ","
        whole, fraction = num_part.rsplit(decimal_sep, 1)
        ungrouped = _strip_grouping(whole, group_sep)
        if ungrouped is None:
            return None
        num_part = f"{ungrouped}.{fraction}"
    elif commas + dots > 1:
        # The same separator repeated can only be thousands grouping.
        ungrouped = _strip_grouping(num_part, "," if commas else ".")
        if ungrouped is None:
            return None
        num_part = ungrouped
    elif commas:
        head, tail = num_part.split(",")
        num_part = head + tail if len(tail) == 3 else f"{head}.{tail}"

    # The regex restricts the suffix to K/M/B, so this lookup cannot fail.
    multiplier = {"": 1, "K": 1_000, "M": 1_000_000, "B": 1_000_000_000}[suffix]
    return int(round(float(num_part) * multiplier))
def fetch_url(
    session: requests.Session,
    url: str,
    *,
    timeout_s: float,
    user_agent: str,
) -> tuple[requests.Response | None, str | None]:
    """GET *url* through *session*, following redirects.

    Args:
        session: Session used to issue the request.
        url: Address to fetch.
        timeout_s: Timeout handed to ``session.get``.
        user_agent: Value sent in the User-Agent header.

    Returns:
        ``(response, None)`` when a response was received, or
        ``(None, message)`` when ``requests`` raised a RequestException.
    """
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    try:
        response = session.get(
            url,
            headers=request_headers,
            timeout=timeout_s,
            allow_redirects=True,
        )
    except requests.RequestException as exc:
        return None, str(exc)
    return response, None
def collect_meta(soup: BeautifulSoup) -> dict[str, list[str]]:
    """Index the document's ``<meta>`` tags by name.

    The key is the tag's ``property`` attribute, falling back to ``name``,
    trimmed and lower-cased. Tags without a key or without a ``content``
    value are ignored. Contents are trimmed and kept in document order.
    """
    collected: dict[str, list[str]] = {}
    for tag in soup.find_all("meta"):
        name = tag.get("property") or tag.get("name")
        value = tag.get("content")
        if name and value:
            collected.setdefault(name.strip().lower(), []).append(value.strip())
    return collected
def first_meta(meta: dict[str, list[str]], keys: Iterable[str]) -> str | None:
    """Return the first value stored under the first of *keys* that has any.

    Keys are tried in order and lower-cased before lookup; None is returned
    when none of them has a non-empty value list.
    """
    lookups = (meta.get(key.lower()) for key in keys)
    return next((values[0] for values in lookups if values), None)
def clean_jsonld_text(raw: str) -> str:
    """Strip HTML-comment and commented-CDATA wrappers from a JSON-LD script body.

    Removes, in this order: a leading ``<!--``, a trailing ``-->``, a leading
    ``/* <![CDATA[ */`` and a trailing ``/* ]]> */``; surrounding whitespace
    is trimmed before and after.
    """
    wrappers = (
        r"^\s*<!--",
        r"-->\s*$",
        r"^\s*/\*+\s*<!\[CDATA\[\s*\*/\s*",
        r"\s*/\*+\s*\]\]>\s*\*/\s*$",
    )
    text = raw.strip()
    for wrapper in wrappers:
        text = re.sub(wrapper, "", text)
    return text.strip()
def extract_jsonld_objects(soup: BeautifulSoup) -> tuple[list[Any], list[dict[str, str]]]:
    """Parse every ``application/ld+json`` script in the document.

    Each script body is cleaned with clean_jsonld_text() and parsed as JSON.
    When that fails, a second attempt wraps the text in ``[...]`` and inserts
    commas between back-to-back objects (``}{``), which rescues scripts that
    concatenate several objects.

    Returns:
        ``(objects, errors)``: the decoded values in document order, and one
        ``{"error", "snippet"}`` entry (snippet capped at 400 characters) for
        each script that could not be decoded either way.
    """
    decoded: list[Any] = []
    failures: list[dict[str, str]] = []
    type_filter = {"type": re.compile(r"^application/ld\+json$", re.I)}
    for script in soup.find_all("script", attrs=type_filter):
        payload = clean_jsonld_text(script.string or script.get_text() or "")
        if not payload:
            continue
        try:
            decoded.append(json.loads(payload))
        except json.JSONDecodeError:
            # Second chance: treat the body as a sequence of concatenated objects.
            patched = "[" + re.sub(r"}\s*{", "},{", payload) + "]"
            try:
                decoded.append(json.loads(patched))
            except json.JSONDecodeError as exc:
                failures.append({"error": str(exc), "snippet": payload[:400]})
    return decoded, failures
def iter_dicts(obj: Any) -> Iterable[dict[str, Any]]:
    """Yield every dict found in *obj*, depth-first, each exactly once.

    A dict is yielded before the dicts nested in its values; lists are
    traversed in order; any other value yields nothing. ``@graph`` arrays
    need no special handling because they are ordinary values of their
    parent dict. Traversing them separately as well would yield every graph
    node twice.
    """
    if isinstance(obj, dict):
        yield obj
        for value in obj.values():
            yield from iter_dicts(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from iter_dicts(item)
def coerce_float(value: Any) -> float | None:
    """Convert a JSON-LD scalar to a finite float.

    Numbers are converted directly. Strings are trimmed and a decimal comma
    is read as a decimal point ("4,5" -> 4.5).

    Returns:
        The float value, or None when *value* is None, of another type, not
        parseable, or not finite. NaN and infinity are rejected because
        callers round the result to an int, which raises for those values.
    """
    import math

    if isinstance(value, (int, float)):
        candidate: Any = value
    elif isinstance(value, str):
        candidate = value.strip().replace(",", ".")
    else:
        return None
    try:
        number = float(candidate)
    except (ValueError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return None
    return number if math.isfinite(number) else None
def coerce_int(value: Any) -> int | None:
    """Convert a JSON-LD scalar to an int.

    Ints are returned unchanged and floats are truncated. For strings, every
    non-digit character is dropped before conversion ("1,234 reviews" ->
    1234).

    Returns:
        The int value, or None when *value* is None, of another type, a
        string without digits, or a NaN / infinite float.
    """
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(+/-inf) raises OverflowError;
            # json.loads() accepts the NaN / Infinity literals by default.
            return None
    if isinstance(value, str):
        digits = re.sub(r"[^\d]", "", value)
        return int(digits) if digits else None
    return None
def pick_best_aggregate_rating(jsonld: list[Any]) -> dict[str, Any] | None:
    """Select the most significant ``aggregateRating`` found in *jsonld*.

    Every dict reachable through iter_dicts() is inspected. A candidate needs
    a dict-valued ``aggregateRating`` with a usable, finite ``ratingValue``.
    Candidates are ranked by ``review_count * 10 + round(rating * 100)`` and
    then by the rating itself; on a full tie the first one encountered wins.

    Ranking compares only those two numbers. Sorting tuples that also contain
    the result dict would raise TypeError whenever two candidates tie on both
    numbers but differ in content, because dicts cannot be ordered.

    Returns:
        A dict with ``rating_value``, ``review_count``, ``best_rating``,
        ``worst_rating``, ``source`` ("jsonld"), the raw ``aggregate_rating``
        and the parent's ``@type`` as ``parent_types``; or None when no
        candidate exists.
    """
    import math

    best: dict[str, Any] | None = None
    best_rank: tuple[int, float] | None = None
    for obj in jsonld:
        for d in iter_dicts(obj):
            agg = d.get("aggregateRating")
            if not isinstance(agg, dict):
                continue
            rating_value = coerce_float(agg.get("ratingValue"))
            # round() below cannot handle NaN / infinity, so skip them here.
            if rating_value is None or not math.isfinite(rating_value):
                continue
            review_count = coerce_int(agg.get("reviewCount") or agg.get("ratingCount"))
            score = (review_count or 0) * 10 + int(round(rating_value * 100))
            rank = (score, rating_value)
            if best_rank is not None and rank <= best_rank:
                continue
            best_rank = rank
            best = {
                "rating_value": rating_value,
                "review_count": review_count,
                "best_rating": coerce_float(agg.get("bestRating")),
                "worst_rating": coerce_float(agg.get("worstRating")),
                "source": "jsonld",
                "aggregate_rating": agg,
                "parent_types": d.get("@type"),
            }
    return best
def extract_awards(jsonld: list[Any]) -> list[str]:
    """Collect the distinct award strings declared anywhere in *jsonld*.

    For each dict reachable through iter_dicts(), reads ``award`` (or
    ``awards`` when ``award`` is missing or empty). A string counts as one
    award; in a list only the string items count. Values are
    whitespace-normalized, de-duplicated case-insensitively, and returned in
    first-seen order with their first-seen casing.
    """
    by_folded: dict[str, str] = {}
    nodes = (node for obj in jsonld for node in iter_dicts(obj))
    for node in nodes:
        raw = node.get("award") or node.get("awards")
        if isinstance(raw, str):
            candidates = [raw]
        elif isinstance(raw, list):
            candidates = [entry for entry in raw if isinstance(entry, str)]
        else:
            candidates = []
        for candidate in candidates:
            label = normalize_whitespace(candidate)
            if label:
                # setdefault keeps the first spelling seen for a given key.
                by_folded.setdefault(label.lower(), label)
    return list(by_folded.values())
# Lower-case phrases that extract_badges_from_html() looks for as plain
# substrings of the lower-cased page text.
BADGE_KEYWORDS = [
    "travellers' choice",
    "travelers' choice",
    "traveller review award",
    "traveler review award",
    "greenleaders",
    "green leader",
    "travel sustainable",
    "preferred partner",
    "genius",
    "key collection",
]
def extract_badges_from_html(soup: BeautifulSoup) -> list[str]:
    """Return the BADGE_KEYWORDS entries that occur in the page's visible text.

    The comparison is a case-insensitive substring test against the
    whitespace-normalized text; the result is de-duplicated and sorted.
    """
    page_text = normalize_whitespace(soup.get_text(" ", strip=True)).lower()
    return sorted({keyword for keyword in BADGE_KEYWORDS if keyword in page_text})
def extract_followers_from_description(description: str) -> dict[str, Any] | None:
    """Find an audience count ("N followers", "N abonnés", "N likes", ...) in *description*.

    The patterns are tried in order against the lower-cased description and
    the first one that matches wins.

    Returns:
        ``{"raw": <matched count text>, "value": <parse_human_number(raw)>}``
        or None when no pattern matches. ``raw`` comes from the lower-cased
        text, and ``value`` may be None if the count text cannot be parsed.
    """
    patterns = [
        r"(?P<count>\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+followers?\b",
        r"(?P<count>\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+abonn[eé]s?\b",
        r"(?P<count>\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+people\s+like\s+this\b",
        r"(?P<count>\d[\d.,\s\u202f\xa0]*[KkMmBb]?)\s+likes\b",
    ]
    haystack = description.lower()
    # Lazy generator: later patterns are only searched if earlier ones miss.
    attempts = (re.search(pattern, haystack, flags=re.IGNORECASE) for pattern in patterns)
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return None
    count_text = hit.group("count")
    return {"raw": count_text, "value": parse_human_number(count_text)}
# Meta tag names that audit_social() checks, in this priority order, for a
# publication / modification timestamp; the first one present is used.
DATE_META_KEYS = [
    "article:published_time",
    "article:modified_time",
    "og:published_time",
    "og:updated_time",
    "og:video:release_date",
    "last-modified",
]
def parse_first_iso_datetime(values: Iterable[str]) -> str | None:
    """Return the first of *values* that parses as an ISO-8601 date/time.

    Each value is trimmed and parsed with ``datetime.fromisoformat``. A
    trailing "Z" (UTC designator) is rewritten to "+00:00" first, because
    ``fromisoformat`` only accepts "Z" from Python 3.11 onwards.

    Returns:
        The parsed value re-serialized with ``datetime.isoformat()``, or None
        when no value can be parsed.
    """
    for value in values:
        text = value.strip()
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(text).isoformat()
        except ValueError:
            continue
    return None
def audit_booking_or_tripadvisor(
    soup: BeautifulSoup,
    *,
    include_jsonld: bool,
) -> dict[str, Any]:
    """Extract rating, review count and badges from a listing page.

    Ratings and awards come from the page's JSON-LD; badges are the union of
    those awards and the BADGE_KEYWORDS found in the page text, sorted.

    Args:
        soup: Parsed listing page.
        include_jsonld: When true, the decoded JSON-LD objects are added to
            the result under ``"jsonld"``.

    Returns:
        A dict with ``rating``, ``review_count``, ``badges``,
        ``jsonld_count`` and ``jsonld_parse_errors`` (plus ``jsonld`` when
        requested).
    """
    jsonld, parse_errors = extract_jsonld_objects(soup)
    rating = pick_best_aggregate_rating(jsonld)

    badge_names = set(extract_awards(jsonld))
    badge_names.update(extract_badges_from_html(soup))

    report: dict[str, Any] = {
        "rating": rating,
        "review_count": None if not rating else rating.get("review_count"),
        "badges": sorted(badge_names),
        "jsonld_count": len(jsonld),
        "jsonld_parse_errors": parse_errors,
    }
    if include_jsonld:
        report["jsonld"] = jsonld
    return report
def audit_social(
    soup: BeautifulSoup,
) -> dict[str, Any]:
    """Extract follower and last-post hints from a social profile's meta tags.

    The description is the first of ``description`` / ``og:description`` /
    ``twitter:description`` that is present; the date is the first of
    DATE_META_KEYS that is present.

    Returns:
        A dict with ``meta_description``, ``follower_count``,
        ``last_post_date`` (``{"raw", "value"}`` or None), ``og_title`` and
        ``og_url``.
    """
    meta = collect_meta(soup)

    description = first_meta(meta, ["description", "og:description", "twitter:description"]) or None
    followers = extract_followers_from_description(description) if description else None

    raw_date = first_meta(meta, DATE_META_KEYS)
    if raw_date:
        # "value" stays None when the raw text is not an ISO-8601 timestamp.
        last_post = {"raw": raw_date, "value": parse_first_iso_datetime([raw_date])}
    else:
        last_post = None

    return {
        "meta_description": description,
        "follower_count": followers,
        "last_post_date": last_post,
        "og_title": first_meta(meta, ["og:title"]),
        "og_url": first_meta(meta, ["og:url"]),
    }
# Phrases that audit_policy() searches for in the page text; each one is
# reported with a found flag and surrounding snippets.
POLICY_TERMS = [
    "New Year's Eve",
    "minimum stay",
    "sold out",
]
def find_term_snippets(text: str, term: str, *, max_hits: int = 3, context: int = 60) -> list[str]:
    """Return short excerpts of *text* around case-insensitive matches of *term*.

    *term* is matched literally, except "New Year's Eve", which uses a looser
    pattern tolerating flexible spacing, a straight or curly apostrophe, and
    a missing apostrophe or "s".

    Args:
        text: Text to search.
        term: Phrase to look for.
        max_hits: Collection stops once this many excerpts have been gathered.
        context: Number of characters kept on each side of a match.

    Returns:
        Whitespace-normalized excerpts, in order of appearance.
    """
    if term.lower() == "new year's eve":
        pattern = r"new\s+year(?:'|\u2019)?s?\s+eve"
    else:
        pattern = re.escape(term)

    excerpts: list[str] = []
    for hit in re.finditer(pattern, text, flags=re.IGNORECASE):
        lo = max(0, hit.start() - context)
        hi = min(len(text), hit.end() + context)
        excerpts.append(normalize_whitespace(text[lo:hi]))
        if len(excerpts) >= max_hits:
            break
    return excerpts
def audit_policy(soup: BeautifulSoup) -> dict[str, Any]:
    """Report, for every entry of POLICY_TERMS, whether it occurs in the page text.

    Returns:
        ``{"policy_checks": [...]}`` with one ``{"term", "found",
        "snippets"}`` entry per term, in POLICY_TERMS order.
    """
    page_text = normalize_whitespace(soup.get_text(" ", strip=True))
    hits_by_term = [(term, find_term_snippets(page_text, term)) for term in POLICY_TERMS]
    return {
        "policy_checks": [
            {"term": term, "found": bool(hits), "snippets": hits}
            for term, hits in hits_by_term
        ]
    }
def audit_listing(
    url: str,
    platform: str,
    *,
    timeout_s: float = 25.0,
    user_agent: str = GOOGLEBOT_UA,
    include_jsonld: bool = False,
) -> AuditResult:
    """Fetch *url* and run the extraction that fits its platform.

    Args:
        url: Page to audit.
        platform: Platform hint, interpreted by normalize_platform(). "auto"
            or an unrecognised hint falls back to detect_platform(url).
        timeout_s: Request timeout passed to fetch_url().
        user_agent: User-Agent header value for the request.
        include_jsonld: For booking / tripadvisor pages, also return the
            decoded JSON-LD objects.

    Returns:
        An AuditResult. If the request itself fails, ``ok`` is False,
        ``error`` carries the exception text and ``data`` is empty. If the
        server answers with an HTTP error status (>= 400), the body is still
        parsed and ``data`` filled in, but ``ok`` is False and ``error`` is
        ``"HTTP <status>"``, so a blocked fetch is not reported as a
        successful audit.
    """
    normalized = normalize_platform(platform)
    resolved_platform: Platform
    if normalized is None or normalized == "auto":
        resolved_platform = detect_platform(url)
    else:
        resolved_platform = normalized

    fetched_at = iso_now()
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        resp, error = fetch_url(session, url, timeout_s=timeout_s, user_agent=user_agent)

    if error or resp is None:
        return AuditResult(
            url=url,
            platform=resolved_platform,
            fetched_at=fetched_at,
            status_code=None,
            final_url=None,
            ok=False,
            error=error or "unknown error",
            data={},
        )

    # The body is parsed as HTML whatever the declared content type is.
    soup = BeautifulSoup(resp.text, "html.parser")
    data: dict[str, Any] = {
        "content_type": resp.headers.get("Content-Type"),
        "content_length": resp.headers.get("Content-Length"),
    }
    if resolved_platform in ("booking", "tripadvisor"):
        data.update(audit_booking_or_tripadvisor(soup, include_jsonld=include_jsonld))
    elif resolved_platform in ("instagram", "facebook", "tiktok"):
        data.update(audit_social(soup))
    else:
        # "policy" and anything unexpected get the generic text checks.
        data.update(audit_policy(soup))

    http_ok = resp.status_code < 400
    return AuditResult(
        url=url,
        platform=resolved_platform,
        fetched_at=fetched_at,
        status_code=resp.status_code,
        final_url=str(resp.url) if resp.url else None,
        ok=http_ok,
        error=None if http_ok else f"HTTP {resp.status_code}",
        data=data,
    )
def parse_args() -> argparse.Namespace:
    """Parse the command line of the audit script.

    Options: ``--platform`` (default "auto"), ``--timeout`` (float, default
    25.0), ``--user-agent`` (default GOOGLEBOT_UA), ``--include-jsonld``
    (flag), followed by one or more positional URLs.
    """
    cli = argparse.ArgumentParser(
        description="Audit listing pages using a Googlebot user-agent (ratings/reviews/badges, followers, policy text)."
    )
    # Declared as data and registered in order, so the help output keeps the same layout.
    options = (
        (
            "--platform",
            {
                "default": "auto",
                "help": "Platform hint (auto/booking/tripadvisor/instagram/facebook/tiktok/policy).",
            },
        ),
        ("--timeout", {"type": float, "default": 25.0}),
        ("--user-agent", {"default": GOOGLEBOT_UA}),
        ("--include-jsonld", {"action": "store_true", "help": "Include parsed JSON-LD blobs in output."}),
        ("urls", {"nargs": "+", "help": "One or more URLs to audit"}),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def main() -> int:
    """Audit every URL given on the command line, printing one JSON line each.

    Each result is serialized with ``ensure_ascii=False`` and written to
    standard output as UTF-8, then flushed so lines appear as soon as each
    audit finishes.

    Returns:
        0 when every audit reported ``ok``, otherwise 2.
    """
    import sys

    args = parse_args()
    exit_code = 0
    # Write through the binary stdout layer: unlike a bare os.write(), whose
    # return value (bytes actually written) was ignored, it never silently
    # truncates a line on a short write, and the bytes stay UTF-8 whatever the
    # console encoding is.
    binary_out = getattr(sys.stdout, "buffer", None)
    for url in args.urls:
        result = audit_listing(
            url,
            args.platform,
            timeout_s=args.timeout,
            user_agent=args.user_agent,
            include_jsonld=bool(args.include_jsonld),
        )
        line = json.dumps(asdict(result), ensure_ascii=False) + "\n"
        if binary_out is not None:
            binary_out.write(line.encode("utf-8"))
            binary_out.flush()
        else:
            # stdout was replaced by a text-only stream (no .buffer attribute).
            sys.stdout.write(line)
            sys.stdout.flush()
        if not result.ok:
            exit_code = 2
    return exit_code


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())