diff --git a/src/revoice/generate.py b/src/revoice/generate.py index 37cf25e..b2b0b51 100644 --- a/src/revoice/generate.py +++ b/src/revoice/generate.py @@ -224,6 +224,48 @@ def _looks_like_site_footer(line: str) -> bool: return False +def _looks_like_navigation_heavy_source(text: str) -> bool: + """ + Heuristic to detect HTML→MD dumps where navigation dominates the payload + (e.g., long menus, repeated link lists, "Skip to content", JS void links). + This is used only for mirror-completeness scoring (not for content edits). + """ + raw_lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()] + if len(raw_lines) < 40: + return False + + linkish = 0 + js_void = 0 + for ln in raw_lines[:400]: + low = ln.lower() + if "javascript:void" in low: + js_void += 1 + if "](" in ln or low.startswith("http://") or low.startswith("https://"): + linkish += 1 + + link_ratio = linkish / max(1, min(len(raw_lines), 400)) + if js_void >= 1 and link_ratio >= 0.25: + return True + if link_ratio >= 0.45: + return True + + nav_markers = ( + "skip to content", + "skip to main content", + "search search", + "menu", + "column one", + "column two", + "column three", + "all rights reserved", + "cookie", + "privacy", + "terms", + ) + marker_hits = sum(1 for ln in raw_lines[:220] if any(m in ln.lower() for m in nav_markers)) + return marker_hits >= 6 + + def _extract_urls(text: str) -> list[str]: urls: list[str] = [] for match in _URL_RE.finditer(text): @@ -3468,9 +3510,26 @@ def _generate_dave_v1_8_mirror(*, source_text: str, source_path: str, action_pac # Minimum content contract: mark degraded (and optionally gate-fail) instead of silently shipping emptiness. non_empty_sections = [s for s in sections[1:] if (s.body or "").strip()] total_body_chars = sum(len((s.body or "").strip()) for s in non_empty_sections) - mirror_ok = len(non_empty_sections) >= 3 and total_body_chars >= 3000 + # Mirror completeness is relative to source size: two-page briefs shouldn't be forced into 3+ sections. + pages = _parse_pages(normalized) + non_empty_pages = sum(1 for _no, p in pages if (p or "").strip()) + page_count = max(1, non_empty_pages) + min_sections_required = min(3, max(1, page_count - 1)) + min_chars_required = min(3000, max(800, page_count * 800)) + + frame_blocked = "unauthorized frame window" in normalized.lower() + nav_heavy = _looks_like_navigation_heavy_source(source_text) + + mirror_ok = (not frame_blocked) and (not nav_heavy) and (len(non_empty_sections) >= min_sections_required) and (total_body_chars >= min_chars_required) mirror_status = "OK" if mirror_ok else "DEGRADED" - mirror_reason = "" if mirror_ok else "INSUFFICIENT_MIRROR" + if mirror_ok: + mirror_reason = "" + elif frame_blocked: + mirror_reason = "FRAME_BLOCKED_SOURCE" + elif nav_heavy: + mirror_reason = "NAVIGATION_HEAVY_SOURCE" + else: + mirror_reason = "INSUFFICIENT_MIRROR" if _truthy_env("REVOICE_QUALITY_GATE") and not mirror_ok: raise ValueError(f"QUALITY_GATE_FAILED:{mirror_reason}")