Add Googlebot listing audit (verify)
This commit is contained in:
parent
8d04a67000
commit
aac9108158
8 changed files with 804 additions and 0 deletions
4
verify/.gitignore
vendored
Normal file
4
verify/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
__pycache__/
|
||||
*.pyc
|
||||
.venv/
|
||||
|
||||
28
verify/README.md
Normal file
28
verify/README.md
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Verify (crawler-view audit)
|
||||
|
||||
This folder contains a lightweight SEO audit that fetches external listing/profile URLs using a Googlebot User-Agent, then extracts:
|
||||
|
||||
- Booking.com / TripAdvisor: rating, review count, badges (JSON-LD first)
|
||||
- Instagram / Facebook / TikTok: follower count + last post date hints (meta tags)
|
||||
- Policy check: searches page text for "New Year's Eve", "minimum stay", "sold out"
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
python3 -m venv .venv
|
||||
. .venv/bin/activate
|
||||
pip install -r verify/requirements.txt
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
python verify/tools/audit_listings.py --platform auto <url1> <url2>
|
||||
python verify/tools/audit_listings.py --platform auto --include-jsonld <url>
|
||||
```
|
||||
|
||||
To save output for review:
|
||||
|
||||
```bash
|
||||
python verify/tools/audit_listings.py --platform auto <url1> <url2> > verify/results/audit.jsonl
|
||||
```
|
||||
225
verify/ho36/raw/ho36__booking__ho36_query__20260102.html
Normal file
225
verify/ho36/raw/ho36__booking__ho36_query__20260102.html
Normal file
File diff suppressed because one or more lines are too long
BIN
verify/ho36/screenshots/ho36__booking__ho36_query__20260102.png
Normal file
BIN
verify/ho36/screenshots/ho36__booking__ho36_query__20260102.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 93 KiB |
3
verify/requirements.txt
Normal file
3
verify/requirements.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
requests>=2.32.0
|
||||
beautifulsoup4>=4.12.0
|
||||
|
||||
12
verify/results/flaneur_googlebot_audit.jsonl
Normal file
12
verify/results/flaneur_googlebot_audit.jsonl
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
{"url": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/", "platform": "policy", "fetched_at": "2026-01-02T20:23:18+00:00", "status_code": 200, "final_url": "https://booking.roomraccoon.fr/le-fl-neur-guesthouse-8346/fr/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://leflaneur-guesthouse.com/", "platform": "policy", "fetched_at": "2026-01-02T20:23:19+00:00", "status_code": 200, "final_url": "https://leflaneur-guesthouse.com/", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://leflaneur-guesthouse.com/dormir", "platform": "policy", "fetched_at": "2026-01-02T20:23:19+00:00", "status_code": 200, "final_url": "https://leflaneur-guesthouse.com/dormir", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.booking.com/searchresults.html?ss=Le%20Fl%C3%A2neur%20Guesthouse%20Lyon", "platform": "booking", "fetched_at": "2026-01-02T20:23:20+00:00", "status_code": 202, "final_url": "https://www.booking.com/searchresults.html", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": "3962", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}
|
||||
{"url": "https://www.facebook.com/leflaneurlyon", "platform": "facebook", "fetched_at": "2026-01-02T20:23:21+00:00", "status_code": 200, "final_url": "https://m.facebook.com/login/?next=https%3A%2F%2Fm.facebook.com%2Fleflaneurlyon%2F", "ok": true, "error": null, "data": {"content_type": "text/html; charset=utf-8", "content_length": null, "meta_description": "登录 Facebook,与好友、家人和认识的人分享和建立联系。", "follower_count": null, "last_post_date": null, "og_title": "登录 Facebook | Facebook", "og_url": "https://www.facebook.com/"}}
|
||||
{"url": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v", "platform": "policy", "fetched_at": "2026-01-02T20:23:23+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/place/Le+Fl%C3%A2neur+Guesthouse/@45.7512135,4.8428045,17z/data=!3m1!5s0x47f4ea4464dcb499:0x7fbb59cd88d1026a!4m9!3m8!1s0x47f4ea446430af35:0xe27846417ed8f4f!5m2!4m1!1i2!8m2!3d45.7512135!4d4.8428045!16s%2Fg%2F11ckqn6t7v", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon", "platform": "policy", "fetched_at": "2026-01-02T20:23:25+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/search/?api=1&query=Fl%C3%A2neur%20Hostel%20Lyon", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.hostelworld.com/hostels/p/100844/le-flaneur-guesthouse/", "platform": "policy", "fetched_at": "2026-01-02T20:23:27+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/p/100844/le-flaneur-guesthouse/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.hostelworld.com/st/hostels/lyon/", "platform": "policy", "fetched_at": "2026-01-02T20:23:28+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.instagram.com/leflaneur_gh/", "platform": "instagram", "fetched_at": "2026-01-02T20:23:30+00:00", "status_code": 200, "final_url": "https://www.instagram.com/leflaneur_gh/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=\"utf-8\"", "content_length": null, "meta_description": "2,296 Followers, 889 Following, 742 Posts - Le Flâneur Guesthouse (@leflaneur_gh) on Instagram: \"🍺☕ Bar & café\n🏠Independent / Alternative / Participative /Ecofriendly / Homely hostel in the heart of Guillotière, Lyon📍\"", "follower_count": {"raw": "2,296", "value": 2296}, "last_post_date": null, "og_title": "Le Flâneur Guesthouse (@leflaneur_gh) • Instagram photos and videos", "og_url": "https://www.instagram.com/leflaneur_gh/"}}
|
||||
{"url": "https://www.tiktok.com/search?q=le%20flaneur%20guesthouse%20lyon", "platform": "tiktok", "fetched_at": "2026-01-02T20:23:32+00:00", "status_code": 403, "final_url": "https://www.tiktok.com/search?q=le%20flaneur%20guesthouse%20lyon", "ok": true, "error": null, "data": {"content_type": "text/plain; charset=utf-8", "content_length": "9", "meta_description": null, "follower_count": null, "last_post_date": null, "og_title": null, "og_url": null}}
|
||||
{"url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "platform": "tripadvisor", "fetched_at": "2026-01-02T20:23:32+00:00", "status_code": 403, "final_url": "https://www.tripadvisor.com/Hotel_Review-g187265-d8778985-Reviews-Le_Flaneur_Guesthouse-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": "775", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}
|
||||
11
verify/results/ho36_googlebot_audit.jsonl
Normal file
11
verify/results/ho36_googlebot_audit.jsonl
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
{"url": "https://ho36lyon.com/", "platform": "policy", "fetched_at": "2026-01-02T20:24:02+00:00", "status_code": 200, "final_url": "https://ho36lyon.com/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": "35483", "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://ho36lyon.com/en/", "platform": "policy", "fetched_at": "2026-01-02T20:24:04+00:00", "status_code": 200, "final_url": "https://ho36lyon.com/en/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": "35743", "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://maps.app.goo.gl/vfGnGGQxJBNwvdgX8", "platform": "policy", "fetched_at": "2026-01-02T20:24:05+00:00", "status_code": 200, "final_url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts&g_ep=EgoyMDI0MTAwMi4xIPu8ASoASAFQAw%3D%3D", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.booking.com/hotel/fr/ho36-hostel.html", "platform": "booking", "fetched_at": "2026-01-02T20:24:07+00:00", "status_code": 202, "final_url": "https://www.booking.com/hotel/fr/ho36-hostel.html", "ok": true, "error": null, "data": {"content_type": "text/html", "content_length": "3962", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}
|
||||
{"url": "https://www.facebook.com/ho36hotels/", "platform": "facebook", "fetched_at": "2026-01-02T20:24:07+00:00", "status_code": 200, "final_url": "https://m.facebook.com/login/?next=https%3A%2F%2Fm.facebook.com%2Fho36hotels%2F", "ok": true, "error": null, "data": {"content_type": "text/html; charset=utf-8", "content_length": null, "meta_description": "登录 Facebook,与好友、家人和认识的人分享和建立联系。", "follower_count": null, "last_post_date": null, "og_title": "登录 Facebook | Facebook", "og_url": "https://www.facebook.com/"}}
|
||||
{"url": "https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d11135.645489923992!2d4.84204!3d45.7529227!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0%3A0xb36a1c20ef67ead4!2sho36%20Lyon%20Guilloti%C3%A8re!5e0!3m2!1sfr!2sfr!4v1567089009427!5m2!1sfr!2sfr", "platform": "policy", "fetched_at": "2026-01-02T20:24:08+00:00", "status_code": 200, "final_url": "https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d11135.645489923992!2d4.84204!3d45.7529227!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0%3A0xb36a1c20ef67ead4!2sho36%20Lyon%20Guilloti%C3%A8re!5e0!3m2!1sfr!2sfr!4v1567089009427!5m2!1sfr!2sfr", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts", "platform": "policy", "fetched_at": "2026-01-02T20:24:08+00:00", "status_code": 200, "final_url": "https://www.google.fr/maps/place/HO36+Hostel+Lyon/@45.7529047,4.8394703,17z/data=!4m9!3m8!1s0x47f4ea44c206c2fd:0xb36a1c20ef67ead4!5m2!4m1!1i2!8m2!3d45.752901!4d4.8420452!16s%2Fg%2F1tnpkbvv?entry=tts", "ok": true, "error": null, "data": {"content_type": "text/html; charset=UTF-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "platform": "policy", "fetched_at": "2026-01-02T20:24:10+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/europe/france/lyon/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.hostelworld.com/hostels/p/270217/ho36-hostel/", "platform": "policy", "fetched_at": "2026-01-02T20:24:11+00:00", "status_code": 200, "final_url": "https://www.hostelworld.com/hostels/p/270217/ho36-hostel/", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": null, "policy_checks": [{"term": "New Year's Eve", "found": false, "snippets": []}, {"term": "minimum stay", "found": false, "snippets": []}, {"term": "sold out", "found": false, "snippets": []}]}}
|
||||
{"url": "https://www.instagram.com/ho36hotel_lyon/", "platform": "instagram", "fetched_at": "2026-01-02T20:24:12+00:00", "status_code": 200, "final_url": "https://www.instagram.com/ho36hotel_lyon/", "ok": true, "error": null, "data": {"content_type": "text/html; charset=\"utf-8\"", "content_length": null, "meta_description": "3,247 Followers, 615 Following, 108 Posts - ho36 Lyon (@ho36hotel_lyon) on Instagram: \"💙Ho(s)tel\n⚡️Lieu de vie et rencontres entre voyageurs & locaux ☕️\"", "follower_count": {"raw": "3,247", "value": 3247}, "last_post_date": null, "og_title": "ho36 Lyon (@ho36hotel_lyon) • Instagram photos and videos", "og_url": "https://www.instagram.com/ho36hotel_lyon/"}}
|
||||
{"url": "https://www.tripadvisor.fr/Hotel_Review-g187265-d293643-Reviews-Ho36_Hostel-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "platform": "tripadvisor", "fetched_at": "2026-01-02T20:24:13+00:00", "status_code": 403, "final_url": "https://www.tripadvisor.fr/Hotel_Review-g187265-d293643-Reviews-Ho36_Hostel-Lyon_Rhone_Auvergne_Rhone_Alpes.html", "ok": true, "error": null, "data": {"content_type": "text/html;charset=utf-8", "content_length": "774", "rating": null, "review_count": null, "badges": [], "jsonld_count": 0, "jsonld_parse_errors": []}}
|
||||
521
verify/tools/audit_listings.py
Normal file
521
verify/tools/audit_listings.py
Normal file
|
|
@ -0,0 +1,521 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Iterable, Literal
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
except ImportError as exc: # pragma: no cover
|
||||
raise SystemExit(
|
||||
"Missing dependency 'beautifulsoup4'. Install with:\n"
|
||||
" python3 -m pip install beautifulsoup4\n"
|
||||
"or (recommended) inside a venv:\n"
|
||||
" python3 -m venv .venv && . .venv/bin/activate && pip install beautifulsoup4\n"
|
||||
) from exc
|
||||
|
||||
|
||||
# Closed set of platform identifiers used throughout the audit pipeline.
# "auto" means "detect from the URL host"; "policy" is the generic fallback
# that only runs the policy-term text search.
Platform = Literal[
    "auto",
    "booking",
    "tripadvisor",
    "instagram",
    "facebook",
    "tiktok",
    "policy",
]
|
||||
|
||||
# Smartphone Googlebot User-Agent string, sent with every fetch so pages are
# seen the way Google's crawler would render/serve them.
GOOGLEBOT_UA = (
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.96 Mobile Safari/537.36 "
    "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AuditResult:
    """Immutable record of one audited URL; serialized to JSONL via asdict()."""

    url: str  # the URL exactly as given on the command line
    platform: str  # resolved platform name (never "auto")
    fetched_at: str  # ISO-8601 timestamp taken just before the fetch
    status_code: int | None  # HTTP status, or None when the request failed
    final_url: str | None  # URL after redirects, or None on failure
    ok: bool  # False only when the HTTP request itself raised
    error: str | None  # requests exception text when ok is False
    data: dict[str, Any]  # platform-specific extraction payload
|
||||
|
||||
|
||||
def iso_now() -> str:
    """Current time as an ISO-8601 string (seconds precision, local UTC offset)."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.astimezone().isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def detect_platform(url: str) -> Platform:
    """Map a URL's host to a platform name; 'policy' when nothing matches."""
    host = (urlparse(url).netloc or "").lower()
    host_checks = [
        ("booking.com" in host, "booking"),
        ("tripadvisor." in host or "tripadvisor.com" in host, "tripadvisor"),
        ("instagram.com" in host, "instagram"),
        ("facebook.com" in host or host.endswith("fb.com"), "facebook"),
        ("tiktok.com" in host, "tiktok"),
    ]
    for matched, name in host_checks:
        if matched:
            return name
    return "policy"
|
||||
|
||||
|
||||
def normalize_platform(platform: str) -> Platform | None:
    """Map a free-form platform string to its canonical name.

    Empty/auto-ish input maps to "auto"; unknown input maps to None.
    """
    # Strip everything but letters/digits so "Tik Tok", "tik_tok", etc. merge.
    key = re.sub(r"[^a-z0-9]+", "", (platform or "").strip().lower())
    aliases = {
        "": "auto",
        "auto": "auto",
        "detect": "auto",
        "booking": "booking",
        "bookingcom": "booking",
        "tripadvisor": "tripadvisor",
        "tripadvisorcom": "tripadvisor",
        "tripadvisorfr": "tripadvisor",
        "tripadvisoruk": "tripadvisor",
        "instagram": "instagram",
        "insta": "instagram",
        "ig": "instagram",
        "facebook": "facebook",
        "fb": "facebook",
        "tiktok": "tiktok",
        "tik": "tiktok",
        "ticktok": "tiktok",
        "policy": "policy",
        "site": "policy",
        "general": "policy",
    }
    return aliases.get(key)
|
||||
|
||||
|
||||
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace into one space and strip the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
||||
|
||||
|
||||
def parse_human_number(raw: str) -> int | None:
    """Parse a human-formatted count such as '2,296', '1 234', or '12.5K'.

    Handles thin/no-break spaces, thousands separators (including several,
    e.g. '1,234,567'), a European decimal comma, and K/M/B suffixes.
    Returns the value rounded to an int, or None when the string does not
    look like a number.
    """
    s = raw.strip().replace("\u202f", "").replace("\xa0", "").replace(" ", "")
    # Allow any number of separator groups; the original single-group
    # pattern (?:[.,]\d+)? rejected multi-separator numbers like 1,234,567
    # even though the join branch below was written for them.
    match = re.match(r"^(?P<num>\d+(?:[.,]\d+)*)(?P<suffix>[KkMmBb])?$", s)
    if not match:
        return None

    num_part = match.group("num")
    suffix = (match.group("suffix") or "").upper()

    if "," in num_part and "." in num_part:
        # Mixed separators: treat commas as thousands separators ('1,234.5').
        num_part = num_part.replace(",", "")
    elif "," in num_part:
        parts = num_part.split(",")
        if len(parts) > 1 and len(parts[-1]) == 3:
            # Trailing 3-digit group: commas are thousands separators.
            num_part = "".join(parts)
        else:
            # Otherwise assume a European decimal comma ('1,5' -> 1.5).
            num_part = num_part.replace(",", ".")

    try:
        value = float(num_part)
    except ValueError:
        return None

    # The regex restricts suffix to K/M/B (or empty), so lookup cannot miss.
    multiplier = {"": 1, "K": 1_000, "M": 1_000_000, "B": 1_000_000_000}[suffix]
    return int(round(value * multiplier))
|
||||
|
||||
|
||||
def fetch_url(
    session: requests.Session,
    url: str,
    *,
    timeout_s: float,
    user_agent: str,
) -> tuple[requests.Response | None, str | None]:
    """GET url with the given UA; return (response, None) or (None, error text)."""
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    try:
        response = session.get(
            url,
            headers=request_headers,
            timeout=timeout_s,
            allow_redirects=True,
        )
    except requests.RequestException as exc:
        return None, str(exc)
    return response, None
|
||||
|
||||
|
||||
def collect_meta(soup: BeautifulSoup) -> dict[str, list[str]]:
    """Gather <meta> tags into {lowercased property-or-name: [content, ...]}."""
    collected: dict[str, list[str]] = {}
    for meta_tag in soup.find_all("meta"):
        # property (OpenGraph style) wins over name when both exist.
        name = meta_tag.get("property") or meta_tag.get("name")
        content = meta_tag.get("content")
        if name and content:
            collected.setdefault(name.strip().lower(), []).append(content.strip())
    return collected
|
||||
|
||||
|
||||
def first_meta(meta: dict[str, list[str]], keys: Iterable[str]) -> str | None:
    """Return the first stored value for the first key present in meta."""
    for candidate in keys:
        for value in meta.get(candidate.lower(), []):
            return value
    return None
|
||||
|
||||
|
||||
def clean_jsonld_text(raw: str) -> str:
    """Strip HTML comment and CDATA wrappers some sites put around JSON-LD."""
    cleaned = raw.strip()
    wrapper_patterns = (
        r"^\s*<!--",
        r"-->\s*$",
        r"^\s*/\*+\s*<!\[CDATA\[\s*\*/\s*",
        r"\s*/\*+\s*\]\]>\s*\*/\s*$",
    )
    for pattern in wrapper_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def extract_jsonld_objects(soup: BeautifulSoup) -> tuple[list[Any], list[dict[str, str]]]:
    """Parse every <script type="application/ld+json"> block in the page.

    Returns (parsed_objects, parse_errors); each error records the JSON
    decoder message and a 400-char snippet of the offending payload.
    """
    objects: list[Any] = []
    errors: list[dict[str, str]] = []
    # Match the type attribute case-insensitively ("application/LD+JSON" etc.).
    for script in soup.find_all("script", attrs={"type": re.compile(r"^application/ld\+json$", re.I)}):
        # script.string is None when the tag has multiple children; fall back
        # to get_text() in that case.
        raw = script.string or script.get_text() or ""
        raw = clean_jsonld_text(raw)
        if not raw:
            continue
        try:
            objects.append(json.loads(raw))
            continue
        except json.JSONDecodeError:
            pass

        # Fallback for sites that concatenate several JSON objects in one
        # script tag ("{...}{...}"): wrap them into a JSON array and retry.
        # NOTE(review): the regex is a heuristic — it would also rewrite a
        # "}{" sequence occurring inside a string literal.
        try:
            patched = "[" + re.sub(r"}\s*{", "},{", raw) + "]"
            objects.append(json.loads(patched))
        except json.JSONDecodeError as exc:
            errors.append({"error": str(exc), "snippet": raw[:400]})
    return objects, errors
|
||||
|
||||
|
||||
def iter_dicts(obj: Any) -> Iterable[dict[str, Any]]:
    """Yield every dict nested anywhere inside obj, depth-first, each once.

    The original had a special case that yielded the children of "@graph"
    and then recursed into obj.values() as well — which visits "@graph"
    again, yielding every graph node twice. The generic values recursion
    already covers "@graph", so the special case is simply dropped.
    """
    if isinstance(obj, dict):
        yield obj
        for value in obj.values():
            yield from iter_dicts(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from iter_dicts(item)
|
||||
|
||||
|
||||
def coerce_float(value: Any) -> float | None:
    """Best-effort float conversion; accepts decimal commas; None on failure."""
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        normalized = value.strip().replace(",", ".")
        try:
            return float(normalized)
        except ValueError:
            return None
    # None and any other type are not coercible.
    return None
|
||||
|
||||
|
||||
def coerce_int(value: Any) -> int | None:
    """Best-effort int conversion; strings are reduced to their digits."""
    if value is None:
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        return int(value)
    if not isinstance(value, str):
        return None
    # Keep digits only, so "1,234 reviews" -> 1234.
    digits = re.sub(r"[^\d]", "", value)
    return int(digits) if digits else None
|
||||
|
||||
|
||||
def pick_best_aggregate_rating(jsonld: list[Any]) -> dict[str, Any] | None:
    """Choose the most credible aggregateRating found anywhere in the JSON-LD.

    Candidates are ranked by a score that weights review count first, then
    rating value. Returns a summary dict (rating_value, review_count,
    best/worst rating, the raw aggregateRating node and its parent @type),
    or None when no usable rating exists.

    Bug fixed: the original sorted tuples whose last element was a dict;
    when two distinct candidates tied on score and rating, tuple comparison
    fell through to the dicts and raised TypeError (dicts are unorderable).
    We now compare on (score, rating_value) only; ties keep the first
    candidate encountered, matching the original stable-sort behavior.
    """
    best_key: tuple[int, float] | None = None
    best_payload: dict[str, Any] | None = None
    for obj in jsonld:
        for node in iter_dicts(obj):
            agg = node.get("aggregateRating")
            if not isinstance(agg, dict):
                continue
            rating_value = coerce_float(agg.get("ratingValue"))
            if rating_value is None:
                continue
            review_count = coerce_int(agg.get("reviewCount") or agg.get("ratingCount"))
            score = (review_count or 0) * 10 + int(round(rating_value * 100))
            key = (score, rating_value)
            if best_key is None or key > best_key:
                best_key = key
                best_payload = {
                    "rating_value": rating_value,
                    "review_count": review_count,
                    "best_rating": coerce_float(agg.get("bestRating")),
                    "worst_rating": coerce_float(agg.get("worstRating")),
                    "source": "jsonld",
                    "aggregate_rating": agg,
                    "parent_types": node.get("@type"),
                }
    return best_payload
|
||||
|
||||
|
||||
def extract_awards(jsonld: list[Any]) -> list[str]:
    """Collect unique 'award'/'awards' strings from JSON-LD, order-preserving.

    Deduplication is case-insensitive; the first spelling seen is kept.
    """
    found: list[str] = []
    seen_lower: set[str] = set()
    for obj in jsonld:
        for node in iter_dicts(obj):
            value = node.get("award") or node.get("awards")
            if isinstance(value, str):
                entries = [value]
            elif isinstance(value, list):
                entries = [entry for entry in value if isinstance(entry, str)]
            else:
                entries = []
            for entry in entries:
                cleaned = normalize_whitespace(entry)
                key = cleaned.lower()
                if cleaned and key not in seen_lower:
                    seen_lower.add(key)
                    found.append(cleaned)
    return found
|
||||
|
||||
|
||||
# Lower-cased badge/award phrases searched for in page text — TripAdvisor and
# Booking.com programme names (both UK and US spellings where they differ).
BADGE_KEYWORDS = [
    "travellers' choice",
    "travelers' choice",
    "traveller review award",
    "traveler review award",
    "greenleaders",
    "green leader",
    "travel sustainable",
    "preferred partner",
    "genius",
    "key collection",
]
|
||||
|
||||
|
||||
def extract_badges_from_html(soup: BeautifulSoup) -> list[str]:
    """Return the sorted, deduplicated badge keywords present in the page text."""
    page_text = normalize_whitespace(soup.get_text(" ", strip=True)).lower()
    matched = {keyword for keyword in BADGE_KEYWORDS if keyword in page_text}
    return sorted(matched)
|
||||
|
||||
|
||||
def extract_followers_from_description(description: str) -> dict[str, Any] | None:
    """Pull a follower/like count out of a social meta description.

    Tries English then French phrasings in priority order. Returns
    {"raw": matched text, "value": parsed int or None}, or None when no
    phrasing matches.
    """
    count_group = r"(?P<count>\d[\d.,\s\u202f\xa0]*[KkMmBb]?)"
    phrasings = [
        count_group + r"\s+followers?\b",
        count_group + r"\s+abonn[eé]s?\b",
        count_group + r"\s+people\s+like\s+this\b",
        count_group + r"\s+likes\b",
    ]
    lowered = description.lower()
    for phrasing in phrasings:
        if match := re.search(phrasing, lowered, flags=re.IGNORECASE):
            raw = match.group("count")
            return {"raw": raw, "value": parse_human_number(raw)}
    return None
|
||||
|
||||
|
||||
# Meta-tag keys checked (in this priority order) for a "last post" date hint.
DATE_META_KEYS = [
    "article:published_time",
    "article:modified_time",
    "og:published_time",
    "og:updated_time",
    "og:video:release_date",
    "last-modified",
]
|
||||
|
||||
|
||||
def parse_first_iso_datetime(values: Iterable[str]) -> str | None:
    """Return the first value parseable as an ISO-8601 datetime, re-serialized.

    Accepts a trailing 'Z' (very common in meta tags) by rewriting it as
    '+00:00' — datetime.fromisoformat only accepts 'Z' from Python 3.11 on.
    Returns None when nothing parses.
    """
    for value in values:
        v = value.strip()
        if v.endswith(("Z", "z")):
            v = v[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(v).isoformat()
        except ValueError:
            continue
    return None
|
||||
|
||||
|
||||
def audit_booking_or_tripadvisor(
    soup: BeautifulSoup,
    *,
    include_jsonld: bool,
) -> dict[str, Any]:
    """Extract rating/review/badge signals from a Booking or TripAdvisor page.

    JSON-LD is the primary source; badges also come from a plain-text scan.
    When include_jsonld is True the parsed JSON-LD blobs are echoed back.
    """
    jsonld, jsonld_errors = extract_jsonld_objects(soup)
    best_rating = pick_best_aggregate_rating(jsonld)
    combined_badges = sorted(set(extract_awards(jsonld) + extract_badges_from_html(soup)))

    result: dict[str, Any] = {
        "rating": best_rating,
        "review_count": best_rating.get("review_count") if best_rating else None,
        "badges": combined_badges,
        "jsonld_count": len(jsonld),
        "jsonld_parse_errors": jsonld_errors,
    }
    if include_jsonld:
        result["jsonld"] = jsonld
    return result
|
||||
|
||||
|
||||
def audit_social(
    soup: BeautifulSoup,
) -> dict[str, Any]:
    """Extract follower count and last-post-date hints from social meta tags."""
    meta = collect_meta(soup)
    description = first_meta(meta, ["description", "og:description", "twitter:description"]) or ""
    followers = extract_followers_from_description(description) if description else None

    raw_date = first_meta(meta, DATE_META_KEYS)
    parsed_date = parse_first_iso_datetime([raw_date]) if raw_date else None
    last_post = {"raw": raw_date, "value": parsed_date} if (raw_date or parsed_date) else None

    return {
        "meta_description": description or None,
        "follower_count": followers,
        "last_post_date": last_post,
        "og_title": first_meta(meta, ["og:title"]),
        "og_url": first_meta(meta, ["og:url"]),
    }
|
||||
|
||||
|
||||
# Phrases searched for in page text by the policy audit; results are
# reported per term with surrounding snippets.
POLICY_TERMS = [
    "New Year's Eve",
    "minimum stay",
    "sold out",
]
|
||||
|
||||
|
||||
def find_term_snippets(text: str, term: str, *, max_hits: int = 3, context: int = 60) -> list[str]:
    """Find up to max_hits occurrences of term, each with `context` chars around it."""
    if term.lower() == "new year's eve":
        # Tolerate curly apostrophes and a missing possessive ("new years eve").
        pattern = r"new\s+year(?:'|\u2019)?s?\s+eve"
    else:
        pattern = re.escape(term)
    finder = re.compile(pattern, flags=re.IGNORECASE)

    snippets: list[str] = []
    for hit in finder.finditer(text):
        window_start = max(0, hit.start() - context)
        window_end = min(len(text), hit.end() + context)
        snippets.append(normalize_whitespace(text[window_start:window_end]))
        if len(snippets) >= max_hits:
            break
    return snippets
|
||||
|
||||
|
||||
def audit_policy(soup: BeautifulSoup) -> dict[str, Any]:
    """Search the page's visible text for each policy term; record hit snippets."""
    page_text = normalize_whitespace(soup.get_text(" ", strip=True))
    checks: list[dict[str, Any]] = []
    for term in POLICY_TERMS:
        hits = find_term_snippets(page_text, term)
        checks.append({"term": term, "found": len(hits) > 0, "snippets": hits})
    return {"policy_checks": checks}
|
||||
|
||||
|
||||
def audit_listing(
    url: str,
    platform: str,
    *,
    timeout_s: float = 25.0,
    user_agent: str = GOOGLEBOT_UA,
    include_jsonld: bool = False,
) -> AuditResult:
    """Fetch one URL with the crawler UA and run the platform-specific audit.

    Parameters
    ----------
    url: the page to audit.
    platform: platform hint; "auto"/unknown values fall back to URL detection.
    timeout_s: per-request timeout in seconds.
    user_agent: User-Agent header to send.
    include_jsonld: include parsed JSON-LD blobs in the payload when True.

    Network errors never raise; they are reported via ok=False and error.

    Fixes over the original: the no-op conditional
    `resp.text if ... else resp.text` (both branches identical) is gone,
    the duplicated policy/else branches are merged, and the Session is
    closed deterministically instead of being left to garbage collection.
    """
    normalized = normalize_platform(platform)
    if normalized is None or normalized == "auto":
        resolved_platform: Platform = detect_platform(url)
    else:
        resolved_platform = normalized
    fetched_at = iso_now()

    with requests.Session() as session:
        resp, error = fetch_url(session, url, timeout_s=timeout_s, user_agent=user_agent)

    if error or resp is None:
        return AuditResult(
            url=url,
            platform=resolved_platform,
            fetched_at=fetched_at,
            status_code=None,
            final_url=None,
            ok=False,
            error=error or "unknown error",
            data={},
        )

    # html.parser copes with whatever came back, HTML or not; the body is
    # already loaded, so closing the session above is safe.
    soup = BeautifulSoup(resp.text, "html.parser")

    data: dict[str, Any] = {
        "content_type": resp.headers.get("Content-Type"),
        "content_length": resp.headers.get("Content-Length"),
    }

    if resolved_platform in ("booking", "tripadvisor"):
        data.update(audit_booking_or_tripadvisor(soup, include_jsonld=include_jsonld))
    elif resolved_platform in ("instagram", "facebook", "tiktok"):
        data.update(audit_social(soup))
    else:
        # "policy" and any future platform fall back to the text search.
        data.update(audit_policy(soup))

    return AuditResult(
        url=url,
        platform=resolved_platform,
        fetched_at=fetched_at,
        status_code=resp.status_code,
        final_url=str(resp.url) if resp.url else None,
        ok=True,
        error=None,
        data=data,
    )
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface."""
    cli = argparse.ArgumentParser(
        description="Audit listing pages using a Googlebot user-agent (ratings/reviews/badges, followers, policy text)."
    )
    cli.add_argument(
        "--platform",
        default="auto",
        help="Platform hint (auto/booking/tripadvisor/instagram/facebook/tiktok/policy).",
    )
    cli.add_argument("--timeout", type=float, default=25.0)
    cli.add_argument("--user-agent", default=GOOGLEBOT_UA)
    cli.add_argument(
        "--include-jsonld",
        action="store_true",
        help="Include parsed JSON-LD blobs in output.",
    )
    cli.add_argument("urls", nargs="+", help="One or more URLs to audit")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Audit each URL, emit one JSON object per line, return the exit code.

    Exit code is 0 when all fetches succeeded, 2 when any of them failed.
    """
    args = parse_args()
    had_failure = False
    for url in args.urls:
        result = audit_listing(
            url,
            args.platform,
            timeout_s=args.timeout,
            user_agent=args.user_agent,
            include_jsonld=bool(args.include_jsonld),
        )
        # Raw write to fd 1 keeps each record flushed immediately when piped.
        line = json.dumps(asdict(result), ensure_ascii=False) + "\n"
        os.write(1, line.encode("utf-8"))
        if not result.ok:
            had_failure = True
    return 2 if had_failure else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|
||||
Loading…
Add table
Reference in a new issue