Relax mirror completeness for short sources

This commit is contained in:
danny 2025-12-27 22:28:07 +00:00
parent 3a11a286d7
commit 1f161ed469

View file

@ -224,6 +224,48 @@ def _looks_like_site_footer(line: str) -> bool:
return False
def _looks_like_navigation_heavy_source(text: str) -> bool:
"""
Heuristic to detect HTMLMD dumps where navigation dominates the payload
(e.g., long menus, repeated link lists, "Skip to content", JS void links).
This is used only for mirror-completeness scoring (not for content edits).
"""
raw_lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
if len(raw_lines) < 40:
return False
linkish = 0
js_void = 0
for ln in raw_lines[:400]:
low = ln.lower()
if "javascript:void" in low:
js_void += 1
if "](" in ln or low.startswith("http://") or low.startswith("https://"):
linkish += 1
link_ratio = linkish / max(1, min(len(raw_lines), 400))
if js_void >= 1 and link_ratio >= 0.25:
return True
if link_ratio >= 0.45:
return True
nav_markers = (
"skip to content",
"skip to main content",
"search search",
"menu",
"column one",
"column two",
"column three",
"all rights reserved",
"cookie",
"privacy",
"terms",
)
marker_hits = sum(1 for ln in raw_lines[:220] if any(m in ln.lower() for m in nav_markers))
return marker_hits >= 6
def _extract_urls(text: str) -> list[str]:
urls: list[str] = []
for match in _URL_RE.finditer(text):
@ -3468,9 +3510,26 @@ def _generate_dave_v1_8_mirror(*, source_text: str, source_path: str, action_pac
# Minimum content contract: mark degraded (and optionally gate-fail) instead of silently shipping emptiness.
non_empty_sections = [s for s in sections[1:] if (s.body or "").strip()]
total_body_chars = sum(len((s.body or "").strip()) for s in non_empty_sections)
mirror_ok = len(non_empty_sections) >= 3 and total_body_chars >= 3000
# Mirror completeness is relative to source size: two-page briefs shouldn't be forced into 3+ sections.
pages = _parse_pages(normalized)
non_empty_pages = sum(1 for _no, p in pages if (p or "").strip())
page_count = max(1, non_empty_pages)
min_sections_required = min(3, max(1, page_count - 1))
min_chars_required = min(3000, max(800, page_count * 800))
frame_blocked = "unauthorized frame window" in normalized.lower()
nav_heavy = _looks_like_navigation_heavy_source(source_text)
mirror_ok = (not frame_blocked) and (not nav_heavy) and (len(non_empty_sections) >= min_sections_required) and (total_body_chars >= min_chars_required)
mirror_status = "OK" if mirror_ok else "DEGRADED"
mirror_reason = "" if mirror_ok else "INSUFFICIENT_MIRROR"
if mirror_ok:
mirror_reason = ""
elif frame_blocked:
mirror_reason = "FRAME_BLOCKED_SOURCE"
elif nav_heavy:
mirror_reason = "NAVIGATION_HEAVY_SOURCE"
else:
mirror_reason = "INSUFFICIENT_MIRROR"
if _truthy_env("REVOICE_QUALITY_GATE") and not mirror_ok:
raise ValueError(f"QUALITY_GATE_FAILED:{mirror_reason}")