Relax mirror completeness for short sources
This commit is contained in:
parent
3a11a286d7
commit
1f161ed469
1 changed files with 61 additions and 2 deletions
|
|
@ -224,6 +224,48 @@ def _looks_like_site_footer(line: str) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _looks_like_navigation_heavy_source(text: str) -> bool:
|
||||
"""
|
||||
Heuristic to detect HTML→MD dumps where navigation dominates the payload
|
||||
(e.g., long menus, repeated link lists, "Skip to content", JS void links).
|
||||
This is used only for mirror-completeness scoring (not for content edits).
|
||||
"""
|
||||
raw_lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
|
||||
if len(raw_lines) < 40:
|
||||
return False
|
||||
|
||||
linkish = 0
|
||||
js_void = 0
|
||||
for ln in raw_lines[:400]:
|
||||
low = ln.lower()
|
||||
if "javascript:void" in low:
|
||||
js_void += 1
|
||||
if "](" in ln or low.startswith("http://") or low.startswith("https://"):
|
||||
linkish += 1
|
||||
|
||||
link_ratio = linkish / max(1, min(len(raw_lines), 400))
|
||||
if js_void >= 1 and link_ratio >= 0.25:
|
||||
return True
|
||||
if link_ratio >= 0.45:
|
||||
return True
|
||||
|
||||
nav_markers = (
|
||||
"skip to content",
|
||||
"skip to main content",
|
||||
"search search",
|
||||
"menu",
|
||||
"column one",
|
||||
"column two",
|
||||
"column three",
|
||||
"all rights reserved",
|
||||
"cookie",
|
||||
"privacy",
|
||||
"terms",
|
||||
)
|
||||
marker_hits = sum(1 for ln in raw_lines[:220] if any(m in ln.lower() for m in nav_markers))
|
||||
return marker_hits >= 6
|
||||
|
||||
|
||||
def _extract_urls(text: str) -> list[str]:
|
||||
urls: list[str] = []
|
||||
for match in _URL_RE.finditer(text):
|
||||
|
|
@ -3468,9 +3510,26 @@ def _generate_dave_v1_8_mirror(*, source_text: str, source_path: str, action_pac
|
|||
# Minimum content contract: mark degraded (and optionally gate-fail) instead of silently shipping emptiness.
|
||||
non_empty_sections = [s for s in sections[1:] if (s.body or "").strip()]
|
||||
total_body_chars = sum(len((s.body or "").strip()) for s in non_empty_sections)
|
||||
mirror_ok = len(non_empty_sections) >= 3 and total_body_chars >= 3000
|
||||
# Mirror completeness is relative to source size: two-page briefs shouldn't be forced into 3+ sections.
|
||||
pages = _parse_pages(normalized)
|
||||
non_empty_pages = sum(1 for _no, p in pages if (p or "").strip())
|
||||
page_count = max(1, non_empty_pages)
|
||||
min_sections_required = min(3, max(1, page_count - 1))
|
||||
min_chars_required = min(3000, max(800, page_count * 800))
|
||||
|
||||
frame_blocked = "unauthorized frame window" in normalized.lower()
|
||||
nav_heavy = _looks_like_navigation_heavy_source(source_text)
|
||||
|
||||
mirror_ok = (not frame_blocked) and (not nav_heavy) and (len(non_empty_sections) >= min_sections_required) and (total_body_chars >= min_chars_required)
|
||||
mirror_status = "OK" if mirror_ok else "DEGRADED"
|
||||
mirror_reason = "" if mirror_ok else "INSUFFICIENT_MIRROR"
|
||||
if mirror_ok:
|
||||
mirror_reason = ""
|
||||
elif frame_blocked:
|
||||
mirror_reason = "FRAME_BLOCKED_SOURCE"
|
||||
elif nav_heavy:
|
||||
mirror_reason = "NAVIGATION_HEAVY_SOURCE"
|
||||
else:
|
||||
mirror_reason = "INSUFFICIENT_MIRROR"
|
||||
if _truthy_env("REVOICE_QUALITY_GATE") and not mirror_ok:
|
||||
raise ValueError(f"QUALITY_GATE_FAILED:{mirror_reason}")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue